Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_protein_fasta_only():
    """
    Build a Genome from a protein FASTA file alone and check that its
    protein sequences are available, while queries that need a GTF or a
    transcript FASTA raise ValueError.
    """
    protein_only = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        protein_fasta_paths_or_urls=[
            MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH,
        ],
    )
    protein_only.index()

    # The fixture FASTA is expected to hold four protein records.
    protein_records = protein_only.protein_sequences.fasta_dictionary
    eq_(4, len(protein_records))

    # No GTF was given to the constructor, so listing genes must fail;
    # the captured exception context is then handed to no_gtf_.
    with assert_raises(ValueError) as raised:
        protein_only.genes()
    no_gtf_(raised)

    # No transcript FASTA was given either.
    with assert_raises(ValueError) as raised:
        protein_only.transcript_sequence("DOES_NOT_EXIST")
    no_transcript_(raised)
def test_gtf_only():
    """
    Build a Genome from a GTF file alone and check that gene annotations
    are available, while transcript and protein sequence lookups raise
    ValueError.
    """
    gtf_only = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
    )
    gtf_only.index()

    # The fixture GTF is expected to describe exactly one gene.
    gene_count = len(gtf_only.genes())
    eq_(1, gene_count)

    # No transcript FASTA was given to the constructor.
    with assert_raises(ValueError) as raised:
        gtf_only.transcript_sequence("DOES_NOT_EXIST")
    no_transcript_(raised)

    # No protein FASTA was given either; the ID string passed here is
    # presumably just a placeholder -- TODO confirm.
    with assert_raises(ValueError) as raised:
        gtf_only.protein_sequence("genome_only_gtf")
    no_protein_(raised)
def test_gtf_transcript_only():
    """
    Build a Genome from a GTF plus a transcript FASTA (no protein FASTA)
    and check that transcript sequences resolve while protein sequence
    access raises ValueError.
    """
    cdna_genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
        transcript_fasta_paths_or_urls=[
            MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
        ],
    )
    cdna_genome.index()

    gene_count = len(cdna_genome.genes())
    eq_(1, gene_count)

    first_transcript = cdna_genome.transcripts()[0]
    ok_(first_transcript.sequence)

    # Merely reading the attribute is what is expected to raise, since no
    # protein FASTA was given to the constructor.
    with assert_raises(ValueError) as raised:
        first_transcript.protein_sequence
    no_protein_(raised)
from .data import data_path
# Ensembl release whose mouse (GRCm38) annotation files the URLs below
# point at.
MOUSE_ENSEMBL_RELEASE = 95

SERVER = "ftp://ftp.ensembl.org"

# Each URL is SERVER followed by a release-specific path.  The "%"
# formatting is parenthesized so it is obvious that it applies to the path
# template (it binds tighter than "+").

# The GTF path mentions the release twice: in the directory and in the
# file name.
MOUSE_GTF_PATH = SERVER + (
    "/pub/release-%d/gtf/mus_musculus/Mus_musculus.GRCm38.%d.gtf.gz" % (
        MOUSE_ENSEMBL_RELEASE, MOUSE_ENSEMBL_RELEASE))

# Previously this template was never formatted, which left a literal "%d"
# in the URL; interpolate the release number like the sibling constants do.
MOUSE_TRANSCRIPT_FASTA_PATH = SERVER + (
    "/pub/release-%d/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz" % (
        MOUSE_ENSEMBL_RELEASE,))

MOUSE_PROTEIN_FASTA_PATH = SERVER + (
    "/pub/release-%d/fasta/mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz" % (
        MOUSE_ENSEMBL_RELEASE,))
# VCF fixture loaded by the tests below.
MOUSE_VCF = data_path("mouse_vcf_dbsnp_chr1_partial.vcf")

# Mouse genome described by spelling out every annotation/sequence URL.
explicit_url_genome = Genome(
    reference_name="GRCm38",
    annotation_name="ensembl",
    annotation_version=MOUSE_ENSEMBL_RELEASE,
    gtf_path_or_url=MOUSE_GTF_PATH,
    transcript_fasta_paths_or_urls=[
        MOUSE_TRANSCRIPT_FASTA_PATH,
    ],
    protein_fasta_paths_or_urls=[
        MOUSE_PROTEIN_FASTA_PATH,
    ],
)

# Mouse genome for the same release number, obtained through
# EnsemblRelease instead of explicit URLs.
ensembl_mouse_genome = EnsemblRelease(
    MOUSE_ENSEMBL_RELEASE,
    species="mouse",
)
def test_load_vcf_mouse_with_explicit_urls():
    """
    Load the mouse VCF fixture against the genome defined by explicit URLs
    and check that 217 variants come back.
    """
    loaded = load_vcf(MOUSE_VCF, genome=explicit_url_genome)
    variant_count = len(loaded)
    eq_(variant_count, 217)
def test_load_vcf_mouse_with_ensembl_release():
    """
    Load the mouse VCF fixture against the EnsemblRelease-based mouse
    genome and check that 217 variants come back.
    """
    loaded = load_vcf(MOUSE_VCF, genome=ensembl_mouse_genome)
    variant_count = len(loaded)
    eq_(variant_count, 217)
# Test for a Genome constructed from a transcript FASTA file only (no GTF,
# no protein FASTA): transcript sequences should be indexed, while
# GTF-backed queries are expected to raise ValueError.
def test_transcript_fasta_only():
genome = Genome(
reference_name="GRCm38",
annotation_name="_test_mouse_ensembl81_subset",
transcript_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH])
genome.index()
# The fixture transcript FASTA is expected to hold two records.
eq_(2, len(genome.transcript_sequences.fasta_dictionary))
# Listing genes needs a GTF, which was not supplied; the captured
# exception context is then passed to no_gtf_.
with assert_raises(ValueError) as cm:
genome.genes()
no_gtf_(cm)
# Same expectation for listing gene IDs.
with assert_raises(ValueError) as cm:
genome.gene_ids()
no_gtf_(cm)
# NOTE(review): this ``with`` statement has no body in the visible source;
# the snippet appears to be cut off here. Restore the missing call and its
# follow-up check from the upstream test file rather than guessing.
with assert_raises(ValueError) as cm:
# grep "ENSMUSG00000017167" Mus_musculus.GRCm38.pep.all.fa -A 50
# Tested against:
# http://useast.ensembl.org/Mus_musculus/Gene/Summary?db=core;g=ENSMUSG00000017167
# Paths to local fixture files which, going by their file names, are
# partial extracts of the mouse Ensembl 81 data.
MOUSE_ENSMUSG00000017167_PATH = data_path(
    "mouse.ensembl.81.partial.ENSMUSG00000017167.gtf"
)

MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH = data_path(
    "mouse.ensembl.81.partial.ENSMUSG00000017167.fa"
)

# NOTE(review): the constant name mentions ENSMUSG00000088969 while the
# file name mentions ENSMUSG00000017167 -- confirm which is intended.
MOUSE_ENSMUSG00000088969_NCRNA_FASTA_PATH = data_path(
    "mouse.ensembl.81.partial.ncrna.ENSMUSG00000017167.fa"
)

MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH = data_path(
    "mouse.ensembl.81.partial.ENSMUSG00000017167.pep"
)

# Genome assembled from the GTF, transcript FASTA and protein FASTA
# fixtures above.
custom_mouse_genome_grcm38_subset = Genome(
    reference_name="GRCm38",
    annotation_name="_test_mouse_ensembl81_subset",
    gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
    transcript_fasta_paths_or_urls=[
        MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
    ],
    protein_fasta_paths_or_urls=[
        MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH,
    ],
)
def setup_init_custom_mouse_genome():
    """
    Test-setup hook for unit tests that must begin with an empty cache:
    clears the cache of the custom mouse genome, then indexes it again.
    """
    genome = custom_mouse_genome_grcm38_subset
    genome.clear_cache()
    genome.index()
def test_gtf_protein_only():
    """
    Build a Genome from a GTF plus a protein FASTA (no transcript FASTA)
    and check that protein sequences resolve while transcript sequence
    access raises ValueError.
    """
    protein_genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
        protein_fasta_paths_or_urls=[
            MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH,
        ],
    )
    protein_genome.index()

    gene_count = len(protein_genome.genes())
    eq_(1, gene_count)

    first_transcript = protein_genome.transcripts()[0]
    ok_(first_transcript.protein_sequence)

    # Merely reading the attribute is what is expected to raise, since no
    # transcript FASTA was given to the constructor.
    with assert_raises(ValueError) as raised:
        first_transcript.sequence
    no_transcript_(raised)
def test_ucsc_refseq_genome():
    """
    Index a small RefSeq GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables inside a throwaway cache
    directory, then check that it yields exactly two genes and that every
    gene and transcript carries a non-empty ID.
    """
    with TemporaryDirectory() as scratch_dir:
        refseq_genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_REFSEQ_PATH,
            cache_directory_path=scratch_dir,
        )
        refseq_genome.index()

        gene_list = refseq_genome.genes()
        for gene in gene_list:
            # The GTF dataframe is only loaded if the assertion fails,
            # to make the failure message informative.
            assert gene.id, (
                "Gene with missing ID in %s" % (
                    refseq_genome.db._load_gtf_as_dataframe(),))

        assert len(gene_list) == 2, (
            "Expected 2 genes, got %d: %s" % (len(gene_list), gene_list))

        transcript_list = refseq_genome.transcripts()
        for transcript in transcript_list:
            assert transcript.id, (
                "Transcript with missing ID in %s" % (
                    refseq_genome.db._load_gtf_as_dataframe(),))
def test_ucsc_gencode_genome():
    """
    Index a small GENCODE GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables inside a throwaway cache
    directory, then check that it yields exactly seven genes and that
    every gene and transcript carries a non-empty ID.
    """
    with TemporaryDirectory() as scratch_dir:
        gencode_genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_GENCODE_PATH,
            cache_directory_path=scratch_dir,
        )
        gencode_genome.index()

        gene_list = gencode_genome.genes()
        for gene in gene_list:
            # The GTF dataframe is only fetched if the assertion fails,
            # to make the failure message informative.
            assert gene.id, (
                "Gene with missing ID in %s" % (
                    gencode_genome.gtf.dataframe(),))

        assert len(gene_list) == 7, (
            "Expected 7 genes, got %d: %s" % (len(gene_list), gene_list))

        transcript_list = gencode_genome.transcripts()
        for transcript in transcript_list:
            assert transcript.id, (
                "Transcript with missing ID in %s" % (
                    gencode_genome.gtf.dataframe(),))
def infer_genome(genome_object_string_or_int):
    """
    Turn a loosely specified genome into a PyEnsembl Genome object.

    Accepted inputs:
      - a PyEnsembl Genome: returned unchanged
      - an integer: treated as an Ensembl version and resolved to the
        associated human EnsemblRelease
      - a string: treated as a reference name and resolved to the latest
        EnsemblRelease which has a reference of the same name

    Raises TypeError for any other kind of input.
    """
    candidate = genome_object_string_or_int

    if isinstance(candidate, Genome):
        return candidate

    if is_integer(candidate):
        return cached_release(candidate)

    if is_string(candidate):
        # Canonicalize the reference name first (e.g. hg19 -> GRCh37),
        # then look up the PyEnsembl Genome object for it.
        canonical_name = infer_reference_name(candidate)
        return genome_for_reference_name(canonical_name)

    message_template = (
        "Expected genome to be an int, string, or pyensembl.Genome "
        "instance, got %s : %s")
    raise TypeError(message_template % (str(candidate), type(candidate)))