Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
"""
Tests for methods which return collections of transcript IDs that aren't
converting from some type of name or ID.
"""
from __future__ import absolute_import
from pyensembl import genome_for_reference_name
from nose.tools import eq_
from .common import test_ensembl_releases
grch38 = genome_for_reference_name("GRCh38")
# A subset (not the complete set) of the transcript IDs belonging to HLA-A
HLA_A_TRANSCRIPT_IDS = [
    "ENST00000396634",
    "ENST00000376809",
    "ENST00000376806",
    "ENST00000376802",
    "ENST00000496081",
    "ENST00000495183",
    "ENST00000461903",
    "ENST00000479320",
]
def test_transcript_ids_ensembl_grch38_hla_a():
# chr6:29,945,884 is a position for HLA-A
# based on:
from __future__ import print_function, absolute_import
from pyensembl import genome_for_reference_name
from tinytimer import benchmark
ensembl = genome_for_reference_name("GRCh38")
# Autosomes "1" through "22" followed by the two sex chromosomes.
contigs = [str(number) for number in range(1, 23)] + ["X", "Y"]
def make_repeat_lookup_fn(lookup_fn, n_positions):
    """
    Make a thunk which calls the lookup_fn at a number of loci
    for each human chromosome (excluding MT).

    Parameters
    ----------
    lookup_fn : callable
        Invoked as ``lookup_fn(contig, position)`` for every combination
        of a contig from the module-level ``contigs`` list and a generated
        position. Its return value is discarded.

    n_positions : int
        Number of positions queried on each contig. Positions start at
        1,000,000 and are spaced 1,000,000 apart.

    Returns
    -------
    A zero-argument function which performs all of the lookups every
    time it is called.
    """
    # The positions are the same for every contig, so build them once here
    # rather than rebuilding an identical list per contig on every call.
    positions = tuple(10 ** 6 + i * 10 ** 6 for i in range(n_positions))

    def repeat_lookup_fn():
        for contig in contigs:
            for position in positions:
                lookup_fn(contig, position)

    return repeat_lookup_fn
def run_benchmark(lookup_fn, n_positions_per_contig=20, time_limit=60.0):
"""
Take a lookup function (such as EnsemblRelease.genes_at_locus) and
def test_reference_sequence_key_from_weird_deletion():
    """
    A deletion which reads into an intron should not produce a
    ReferenceSequenceKey: the constructor is expected to return None.
    """
    # variant reads into the intron; want to make sure isovar skips over
    # such cases
    genome = genome_for_reference_name("grcm38")
    deletion = Variant("11", 106262686, "GTGAAGG", "", genome)
    transcript = genome.transcript_by_id("ENSMUST00000021049")
    result = ReferenceSequenceKey.from_variant_and_transcript(
        variant=deletion,
        transcript=transcript,
        context_size=10)
    # include both objects in the failure message to ease debugging
    assert result is None, '%s\n%s' % (result, transcript)
PrematureStop,
FrameShift,
ExonLoss,
ExonicSpliceSite,
FrameShiftTruncation,
# TODO: SpliceDonor, SpliceReceptor
)
from pyensembl import ensembl_grch37, cached_release, genome_for_reference_name
from .common import expect_effect
# tried using more recent releases but found that many of them
# are very specific to Ensembl data between releases 77-81
ensembl_grch38 = cached_release(81)
mouse_genome = genome_for_reference_name("grcm38")
def test_incomplete():
    """
    A substitution at the first base of EGFR-009 is expected to be
    reported as an IncompleteTranscript effect which changes neither the
    coding sequence nor the protein sequence.
    """
    # transcript EGFR-009 (ENST00000450046 in Ensembl 78)
    # has an incomplete 3' end
    # chrom. 7 starting at 55,109,723
    # first exon begins: ATCATTCCTTTGGGCCTAGGA
    # change the first nucleotide of the 5' UTR A>T
    utr_substitution = Variant(
        "7", 55109723, "A", "T", ensembl=ensembl_grch38)
    expectations = dict(
        transcript_id="ENST00000450046",
        effect_class=IncompleteTranscript,
        modifies_coding_sequence=False,
        modifies_protein_sequence=False)
    expect_effect(utr_substitution, **expectations)
from __future__ import absolute_import
import functools
from pyensembl import (
genome_for_reference_name,
cached_release,
MAX_ENSEMBL_RELEASE,
)
from nose.tools import nottest
grch37 = genome_for_reference_name("GRCh37")
grch38 = genome_for_reference_name("GRCh38")
major_releases = [grch37, grch38]
contigs = [str(c) for c in range(1, 23)] + ["X", "Y", "M"]
@nottest
def test_ensembl_releases(*versions):
"""
Run a unit test which takes an EnsemblRelease as an argument
for multiple releases (most recent for each reference genome)
"""
if len(versions) == 0:
ensembl_releases = major_releases
else:
from __future__ import absolute_import
import functools
from pyensembl import (
genome_for_reference_name,
cached_release,
MAX_ENSEMBL_RELEASE,
)
from nose.tools import nottest
grch37 = genome_for_reference_name("GRCh37")
grch38 = genome_for_reference_name("GRCh38")
major_releases = [grch37, grch38]
contigs = [str(c) for c in range(1, 23)] + ["X", "Y", "M"]
@nottest
def test_ensembl_releases(*versions):
"""
Run a unit test which takes an EnsemblRelease as an argument
for multiple releases (most recent for each reference genome)
"""
if len(versions) == 0:
ensembl_releases = major_releases
else:
if any(version > MAX_ENSEMBL_RELEASE for version in versions):
"""Make sure we're getting correct transcritp sequence from Ensembl and that
it's a sequence type which correctly implements `complement`
and `reverse_complement`
"""
from __future__ import absolute_import
from nose.tools import eq_
from pyensembl import genome_for_reference_name
grch38 = genome_for_reference_name("GRCh38")
def test_transcript_sequence_ensembl_grch38():
    """
    The sequence of ENST00000448914 must be the same whether it is looked
    up directly on the genome or read from a Transcript object.
    """
    # extremely short TRD gene
    transcript_id = "ENST00000448914"
    expected_sequence = "ACTGGGGGATACG"

    # direct lookup by transcript ID
    eq_(grch38.transcript_sequence(transcript_id), expected_sequence)

    # now try via a Transcript object
    transcript = grch38.transcript_by_id(transcript_id)
    eq_(transcript.sequence, expected_sequence)
"""
Test all methods which return collections of gene names that aren't converting
from some other type of name or ID.
"""
from __future__ import absolute_import, print_function
from pyensembl import genome_for_reference_name
from .common import test_ensembl_releases
grch38 = genome_for_reference_name("GRCh38")
# Gene names used by the tests in this module as known examples.
KNOWN_GENE_NAMES = [
    "TP53",
    "ERBB2",
    "SMAD4",
    "CTAG1A",
    "HLA-A",
]
@test_ensembl_releases()
def test_all_gene_names(ensembl):
"""
test_all_gene_names : Make sure some known gene names such as
SMAD4, TP53, ERBB2, &c
"""
gene_names = ensembl.gene_names()
Ensembl version.
If given a string, return latest EnsemblRelease which has a reference
of the same name.
If given a PyEnsembl Genome, simply return it.
"""
if isinstance(genome_object_string_or_int, Genome):
return genome_object_string_or_int
if is_integer(genome_object_string_or_int):
return cached_release(genome_object_string_or_int)
elif is_string(genome_object_string_or_int):
# first infer the canonical reference name, e.g. mapping hg19 -> GRCh37
# and then get the associated PyEnsembl Genome object
reference_name = infer_reference_name(genome_object_string_or_int)
return genome_for_reference_name(reference_name)
else:
raise TypeError(
("Expected genome to be an int, string, or pyensembl.Genome "
"instance, got %s : %s") % (
str(genome_object_string_or_int),
type(genome_object_string_or_int)))
def variant_collection_from_args(args, required=True):
variant_collections = []
if args.genome:
genome = genome_for_reference_name(args.genome)
else:
# no genome specified, assume it can be inferred from the file(s)
# we're loading
genome = None
for vcf_path in args.vcf:
variant_collections.append(load_vcf(vcf_path, genome=genome))
for maf_path in args.maf:
variant_collections.append(load_maf(maf_path))
if args.variant:
if not genome:
raise ValueError(
"--reference-name must be specified when using --variant")
variants = [