Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_pandas_and_pyvcf_implementations_equivalent():
paths = [
{'path': data_path("somatic_hg19_14muts.vcf")},
{'path': data_path("somatic_hg19_14muts.space_in_sample_name.vcf")},
{'path': "/" + data_path("somatic_hg19_14muts.vcf")},
{'path': data_path("somatic_hg19_14muts.vcf.gz")},
{'path': data_path("multiallelic.vcf")},
{'path': data_path("mutect-example.vcf")},
{'path': data_path("strelka-example.vcf")},
{'path': data_path("mutect-example-headerless.vcf"),
'genome': cached_release(75)},
]
if RUN_TESTS_REQUIRING_INTERNET:
paths.append({'path': VCF_EXTERNAL_URL})
paths.append({'path': VCF_EXTERNAL_URL + ".gz"})
def do_test(kwargs):
    """
    Load one VCF twice -- once with load_vcf_fast and once with load_vcf,
    passing the same keyword arguments to both -- and check that the two
    results agree.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments forwarded unchanged to both loaders.
    """
    fast_result = load_vcf_fast(**kwargs)
    reference_result = load_vcf(**kwargs)

    # the two collections must compare equal as a whole...
    eq_(fast_result, reference_result)
    # ...and also agree on size, elements and metadata individually
    eq_(len(fast_result), len(reference_result))
    eq_(fast_result.elements, reference_result.elements)
    eq_(fast_result.metadata, reference_result.metadata)

    # guard against both loaders agreeing on a (nearly) empty result
    assert len(fast_result) > 1
    assert len(reference_result) > 1
for kwargs in paths:
"""
Test all methods which return collections of gene IDs that aren't converting
from some other type of name or ID.
TODO: Implement tests for EnsemblRelease.gene_ids
"""
from __future__ import absolute_import
from nose.tools import assert_raises, ok_
from pyensembl import ensembl_grch38, cached_release
from .common import test_ensembl_releases
ensembl77 = cached_release(77, "human")
def test_gene_ids_grch38_hla_a():
    """
    Query the GRCh38 release for gene IDs at chr6:29,945,884 and require
    the result to be exactly the HLA-A gene ID (ENSG00000206503).

    Position and gene ID are based on:
    http://useast.ensembl.org/Homo_sapiens/Gene/
    Summary?db=core;g=ENSG00000206503;r=6:29941260-29945884
    """
    hla_a_gene_id = "ENSG00000206503"
    ids_at_locus = ensembl_grch38.gene_ids_at_locus(6, 29945884)
    assert ids_at_locus == [hla_a_gene_id], \
        "Expected HLA-A, gene ID = %s, got: %s" % (hla_a_gene_id, ids_at_locus)
def test_gene_ids_of_gene_name_hla_grch38():
    """
    Look up the gene IDs for the name "HLA-A" in the GRCh38 release and
    require ENSG00000206503 to be among them; on failure the full list of
    returned IDs is shown as the assertion message.
    """
    ids_for_hla_a = ensembl_grch38.gene_ids_of_gene_name("HLA-A")
    assert 'ENSG00000206503' in ids_for_hla_a, ids_for_hla_a
"""
Exon IDs of the TP53 gene and one of its transcripts (TP53-026) were copied
from the Ensembl website; make sure the same IDs are found by pyensembl.
"""
from __future__ import absolute_import
from pyensembl import cached_release
ensembl = cached_release(77)
# all exons associated with TP53 gene in Ensembl release 77
TP53_EXON_IDS_RELEASE_77 = [
'ENSE00002337729', 'ENSE00002419584',
'ENSE00003625790', 'ENSE00003518480',
'ENSE00003723991', 'ENSE00003712342',
'ENSE00001657961', 'ENSE00003725258',
'ENSE00003740946', 'ENSE00002204316',
'ENSE00002064269', 'ENSE00003750554',
'ENSE00003634848', 'ENSE00003492844',
'ENSE00003735852', 'ENSE00003545950',
'ENSE00003605891', 'ENSE00002051192',
'ENSE00002084733', 'ENSE00003726882',
'ENSE00001146308', 'ENSE00002667911',
'ENSE00003752869', 'ENSE00003739898',
'ENSE00003753508', 'ENSE00002034209',
StartLoss,
AlternateStartCodon,
PrematureStop,
FrameShift,
ExonLoss,
ExonicSpliceSite,
FrameShiftTruncation,
# TODO: SpliceDonor, SpliceReceptor
)
from pyensembl import ensembl_grch37, cached_release, genome_for_reference_name
from .common import expect_effect
# tried using more recent releases but found that many of the tests here
# are very specific to Ensembl data between releases 77-81
ensembl_grch38 = cached_release(81)
mouse_genome = genome_for_reference_name("grcm38")
def test_incomplete():
# transcript EGFR-009 (ENST00000450046 in Ensembl 78)
# has an incomplete 3' end
# chrom. 7 starting at 55,109,723
# first exon begins: ATCATTCCTTTGGGCCTAGGA
# change the first nucleotide of the 5' UTR A>T
variant = Variant("7", 55109723, "A", "T", ensembl=ensembl_grch38)
expect_effect(
variant,
transcript_id="ENST00000450046",
effect_class=IncompleteTranscript,
modifies_coding_sequence=False,
def test_ensembl_releases(*versions):
    """
    Build a decorator that runs a unit test once per Ensembl release.

    The decorated test must accept a single argument (the release object).
    When no versions are given the releases come from `major_releases`
    (most recent for each reference genome); otherwise each version is
    looked up with `cached_release`.

    Parameters
    ----------
    *versions
        Release identifiers passed one at a time to `cached_release`.

    Returns
    -------
    A decorator turning `test_fn(ensembl)` into a zero-argument test
    function that calls `test_fn` for every selected release.
    """
    if versions:
        releases_under_test = [cached_release(v) for v in versions]
    else:
        releases_under_test = major_releases

    def decorator(test_fn):
        @functools.wraps(test_fn)
        def run_for_every_release():
            for release in releases_under_test:
                test_fn(release)
        return run_for_every_release

    return decorator
def test_ensembl_releases(*versions):
    """
    Build a decorator that runs a unit test once per Ensembl release.

    The decorated test must accept a single argument (the release object).
    When no versions are given the releases come from `major_releases`
    (most recent for each reference genome); otherwise each version is
    looked up with `cached_release`.

    Parameters
    ----------
    *versions
        Release numbers; each is compared against MAX_ENSEMBL_RELEASE and
        then passed to `cached_release`.

    Returns
    -------
    A decorator turning `test_fn(ensembl)` into a zero-argument test
    function that calls `test_fn` for every selected release.

    Raises
    ------
    ValueError
        If any requested version is greater than MAX_ENSEMBL_RELEASE.
    """
    if not versions:
        selected_releases = major_releases
    else:
        # reject the whole request before loading any release
        has_version_past_max = any(
            version > MAX_ENSEMBL_RELEASE for version in versions)
        if has_version_past_max:
            raise ValueError("Invalid ensembl release numbers: %s" % (versions,))
        selected_releases = [cached_release(version) for version in versions]

    def decorator(test_fn):
        @functools.wraps(test_fn)
        def call_with_each_release():
            for current_release in selected_releases:
                test_fn(current_release)
        return call_with_each_release

    return decorator
from .common import test_ensembl_releases
from .data import (
FOXP3_001_transcript_id,
CTNNBIP1_004_transcript_id,
CTNNBIP1_004_UTR5,
CTNNBIP1_004_UTR3,
CTNNBIP1_004_CDS,
CTNNBIP1_004_locus,
CTTNNIP1_004_exon_lengths,
CTTNNIP1_004_exon_ids,
EGFR_001_protein_sequence,
TP53_gene_id,
)
ensembl77 = cached_release(77)
def test_transcript_start_codon():
"""
test_transcript_start_codon : Check that fields Transcript
(for transcript named CTNNBIP1-004) matches known values.
"""
CTNNBIP1_004_transcript = ensembl77.transcript_by_id(
CTNNBIP1_004_transcript_id)
assert Locus.__eq__(CTNNBIP1_004_transcript, CTNNBIP1_004_locus), \
"Expected locus %s but got %s" % (
CTNNBIP1_004_locus, Locus.__str__(CTNNBIP1_004_transcript))
start_offsets = CTNNBIP1_004_transcript.start_codon_spliced_offsets
assert len(start_offsets) == 3, \
def test_genome_arg_to_load_vcf():
    """
    Check that load_vcf returns a result equal to the default load of
    VCF_FILENAME for each of several spellings of the `genome` argument:
    a release number, a release object, and reference-name strings.
    """
    def genome_spellings():
        # Yielded lazily so that cached_release(75) is only called after
        # the preceding comparison has already passed.
        yield 75
        yield cached_release(75)
        yield "grch37"
        yield "GRCh37"
        yield "b37"
        # TODO: actually make hg19 different from b37! They should use
        # different MT sequences
        yield "hg19"

    baseline = load_vcf(VCF_FILENAME)
    for genome in genome_spellings():
        eq_(baseline, load_vcf(VCF_FILENAME, genome=genome))
"""
Check that pyensembl returns correct exon objects for exon IDs
and loci. Make sure the information on the exon object matches
the expected gene ID and location.
"""
from __future__ import absolute_import
from pyensembl import cached_release
ensembl = cached_release(77)
def test_exon_object_by_id():
    """
    test_exon_object_by_id : fetch exon ENSE00003464041 (exon 4 of CTNNB1
    in Ensembl 77) by its ID and verify the gene name, contig, strand,
    coordinates and length reported on the returned object.
    """
    ctnnb1_exon = ensembl.exon_by_id("ENSE00003464041")

    # which gene and chromosome the exon belongs to
    assert ctnnb1_exon.gene_name == "CTNNB1", \
        "Unexpected gene name: %s" % ctnnb1_exon.gene_name
    assert ctnnb1_exon.contig == "3", ctnnb1_exon.contig

    # strand, checked through all three accessors
    assert ctnnb1_exon.strand == "+"
    assert ctnnb1_exon.on_forward_strand
    assert ctnnb1_exon.on_positive_strand

    # coordinates and the length they imply
    assert ctnnb1_exon.start == 41224526, \
        "Unexpected exon start: %s" % ctnnb1_exon.start
    assert ctnnb1_exon.end == 41224753, \
        "Unexpected exon end: %s" % ctnnb1_exon.end
    assert ctnnb1_exon.length == len(ctnnb1_exon) == 228
Returns
-------
kallisto_data : Pandas dataframe
Pandas dataframe with Kallisto data for all patients
columns include patient_id, gene_name, est_counts
"""
kallisto_data = pd.concat(
[self._load_single_patient_kallisto(patient) for patient in self],
copy=False
)
if self.kallisto_ensembl_version is None:
raise ValueError("Required a kallisto_ensembl_version but none was specified")
ensembl_release = cached_release(self.kallisto_ensembl_version)
kallisto_data["gene_name"] = \
kallisto_data["target_id"].map(lambda t: ensembl_release.gene_name_of_transcript_id(t))
# sum counts across genes
kallisto_data = \
kallisto_data.groupby(["patient_id", "gene_name"])[["est_counts"]].sum().reset_index()
return kallisto_data