How to use the pyensembl.cached_release function in pyensembl

To help you get started, we’ve selected a few pyensembl examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github openvax / varcode / test / test_vcf.py View on Github external
def test_pandas_and_pyvcf_implementations_equivalent():
    paths = [
        {'path': data_path("somatic_hg19_14muts.vcf")},
        {'path': data_path("somatic_hg19_14muts.space_in_sample_name.vcf")},
        {'path': "/" + data_path("somatic_hg19_14muts.vcf")},
        {'path': data_path("somatic_hg19_14muts.vcf.gz")},
        {'path': data_path("multiallelic.vcf")},
        {'path': data_path("mutect-example.vcf")},
        {'path': data_path("strelka-example.vcf")},
        {'path': data_path("mutect-example-headerless.vcf"),
            'genome': cached_release(75)},
    ]
    if RUN_TESTS_REQUIRING_INTERNET:
        paths.append({'path': VCF_EXTERNAL_URL})
        paths.append({'path': VCF_EXTERNAL_URL + ".gz"})

    def do_test(kwargs):
        """
        Load one input with both `load_vcf_fast` and `load_vcf`, passing the
        same keyword arguments to each, and check that the two results agree.
        """
        fast_result = load_vcf_fast(**kwargs)
        reference_result = load_vcf(**kwargs)

        # Compare the collections as a whole first, then piece by piece so a
        # failure points at the part that differs.
        eq_(fast_result, reference_result)
        eq_(len(fast_result), len(reference_result))
        eq_(fast_result.elements, reference_result.elements)
        eq_(fast_result.metadata, reference_result.metadata)

        # Guard against the comparison passing trivially on tiny results.
        for loaded in (fast_result, reference_result):
            assert len(loaded) > 1

    for kwargs in paths:
github openvax / pyensembl / test / test_gene_ids.py View on Github external
"""
Test all methods which return collections of gene IDs that aren't converting
from some other type of name or ID.

TODO: Implement tests for EnsemblRelease.gene_ids
"""
from __future__ import absolute_import

from nose.tools import assert_raises, ok_
from pyensembl import ensembl_grch38, cached_release

from .common import test_ensembl_releases

# Module-level Ensembl release 77 object; "human" is passed explicitly as the
# second argument to cached_release.
ensembl77 = cached_release(77, "human")

def test_gene_ids_grch38_hla_a():
    """
    Looking up gene IDs at chr6:29,945,884 (a position for HLA-A) must return
    exactly one ID, ENSG00000206503.

    Reference for the expected ID and coordinates:
    http://useast.ensembl.org/Homo_sapiens/Gene/
    Summary?db=core;g=ENSG00000206503;r=6:29941260-29945884
    """
    ids = ensembl_grch38.gene_ids_at_locus(6, 29945884)
    expected = "ENSG00000206503"
    failure_message = "Expected HLA-A, gene ID = %s, got: %s" % (expected, ids)
    assert ids == [expected], failure_message

def test_gene_ids_of_gene_name_hla_grch38():
    """
    The gene IDs returned for the name "HLA-A" must include ENSG00000206503.
    On failure the full list of returned IDs is shown.
    """
    expected_gene_id = 'ENSG00000206503'
    ids_for_name = ensembl_grch38.gene_ids_of_gene_name("HLA-A")
    assert expected_gene_id in ids_for_name, ids_for_name
github openvax / pyensembl / test / test_exon_id.py View on Github external
"""
Exon IDs of the TP53 gene and one of its transcripts (TP53-026) were copied
from the Ensembl website, make sure same IDs are found by pyensembl.
"""
from __future__ import absolute_import

from pyensembl import cached_release

# Ensembl release 77: the same release the expected exon IDs below were
# recorded for.
ensembl = cached_release(77)

# all exons associated with TP53 gene in Ensembl release 77
TP53_EXON_IDS_RELEASE_77 = [
    'ENSE00002337729', 'ENSE00002419584',
    'ENSE00003625790', 'ENSE00003518480',
    'ENSE00003723991', 'ENSE00003712342',
    'ENSE00001657961', 'ENSE00003725258',
    'ENSE00003740946', 'ENSE00002204316',
    'ENSE00002064269', 'ENSE00003750554',
    'ENSE00003634848', 'ENSE00003492844',
    'ENSE00003735852', 'ENSE00003545950',
    'ENSE00003605891', 'ENSE00002051192',
    'ENSE00002084733', 'ENSE00003726882',
    'ENSE00001146308', 'ENSE00002667911',
    'ENSE00003752869', 'ENSE00003739898',
    'ENSE00003753508', 'ENSE00002034209',
github openvax / varcode / test / test_effect_classes.py View on Github external
StartLoss,
    AlternateStartCodon,
    PrematureStop,
    FrameShift,
    ExonLoss,
    ExonicSpliceSite,
    FrameShiftTruncation,
    # TODO: SpliceDonor, SpliceReceptor
)
from pyensembl import ensembl_grch37, cached_release, genome_for_reference_name

from .common import expect_effect

# Pinned to release 81: more recent releases were tried, but many of the
# tests here are very specific to Ensembl data between releases 77-81.
ensembl_grch38 = cached_release(81)

# Genome object looked up by the reference name "grcm38".
mouse_genome = genome_for_reference_name("grcm38")

def test_incomplete():
    # transcript EGFR-009 (ENST00000450046 in Ensembl 78)
    # has an incomplete 3' end
    # chrom. 7 starting at 55,109,723
    # first exon begins: ATCATTCCTTTGGGCCTAGGA

    # change the first nucleotide of the 5' UTR A>T
    variant = Variant("7", 55109723, "A", "T", ensembl=ensembl_grch38)
    expect_effect(
        variant,
        transcript_id="ENST00000450046",
        effect_class=IncompleteTranscript,
        modifies_coding_sequence=False,
github openvax / pyensembl / test / common.py View on Github external
def test_ensembl_releases(*versions):
    """
    Decorator factory for tests that take a single Ensembl release argument.

    The decorated test becomes a zero-argument function which calls the
    original once per release. When no versions are given the releases come
    from `major_releases`; otherwise each version is converted with
    `cached_release`.
    """
    if versions:
        releases_under_test = [cached_release(v) for v in versions]
    else:
        releases_under_test = major_releases

    def decorator(test_fn):
        @functools.wraps(test_fn)
        def run_for_each_release():
            for release in releases_under_test:
                test_fn(release)
        return run_for_each_release

    return decorator
github openvax / pyensembl / test / common.py View on Github external
def test_ensembl_releases(*versions):
    """
    Decorator factory for tests that take a single Ensembl release argument.

    The decorated test becomes a zero-argument function which calls the
    original once per release. When no versions are given the releases come
    from `major_releases`; otherwise each version is converted with
    `cached_release`.

    Raises
    ------
    ValueError
        If any requested version is greater than MAX_ENSEMBL_RELEASE.
    """
    if not versions:
        releases_under_test = major_releases
    else:
        # Reject release numbers above the known maximum before trying to
        # construct any release object.
        if any(v > MAX_ENSEMBL_RELEASE for v in versions):
            raise ValueError("Invalid ensembl release numbers: %s" % (versions,))
        releases_under_test = [cached_release(v) for v in versions]

    def decorator(test_fn):
        @functools.wraps(test_fn)
        def run_for_each_release():
            for release in releases_under_test:
                test_fn(release)
        return run_for_each_release

    return decorator
github openvax / pyensembl / test / test_transcript_objects.py View on Github external
from .common import test_ensembl_releases
from .data import (
    FOXP3_001_transcript_id,
    CTNNBIP1_004_transcript_id,
    CTNNBIP1_004_UTR5,
    CTNNBIP1_004_UTR3,
    CTNNBIP1_004_CDS,
    CTNNBIP1_004_locus,
    CTTNNIP1_004_exon_lengths,
    CTTNNIP1_004_exon_ids,
    EGFR_001_protein_sequence,
    TP53_gene_id,
)

# Ensembl release 77 object used for the transcript lookups in the tests below.
ensembl77 = cached_release(77)


def test_transcript_start_codon():
    """
    test_transcript_start_codon : Check that fields Transcript
    (for transcript named CTNNBIP1-004) matches known values.
    """
    CTNNBIP1_004_transcript = ensembl77.transcript_by_id(
        CTNNBIP1_004_transcript_id)

    assert Locus.__eq__(CTNNBIP1_004_transcript, CTNNBIP1_004_locus), \
        "Expected locus %s but got %s" % (
            CTNNBIP1_004_locus, Locus.__str__(CTNNBIP1_004_transcript))

    start_offsets = CTNNBIP1_004_transcript.start_codon_spliced_offsets
    assert len(start_offsets) == 3, \
github openvax / varcode / test / test_vcf.py View on Github external
def test_genome_arg_to_load_vcf():
    """
    Loading VCF_FILENAME with an explicit `genome` argument must give the same
    variants as loading it with no `genome` at all, whether the genome is
    given as a release number, a release object, or a reference name.
    """
    baseline = load_vcf(VCF_FILENAME)

    def expect_same_variants(genome):
        eq_(baseline, load_vcf(VCF_FILENAME, genome=genome))

    expect_same_variants(75)
    expect_same_variants(cached_release(75))
    expect_same_variants("grch37")
    expect_same_variants("GRCh37")
    expect_same_variants("b37")
    # TODO: actually make hg19 different from b37! They should use
    # different MT sequences
    expect_same_variants("hg19")
github openvax / pyensembl / test / test_exon_object.py View on Github external
"""
Check that pyensembl returns correct exon objects for exon IDs
and loci. Make sure the information on the exon object matches
the expected gene ID and location.
"""
from __future__ import absolute_import

from pyensembl import cached_release

# Ensembl release 77 object that the exon lookups in this module run against.
ensembl = cached_release(77)

def test_exon_object_by_id():
    """
    test_exon_object_by_id : look up exon 4 of CTNNB1 by its ID in Ensembl 77
    and verify its gene name, contig, strand, coordinates and length.
    """
    ctnnb1_exon = ensembl.exon_by_id("ENSE00003464041")

    # Identity of the owning gene and its chromosome.
    assert ctnnb1_exon.gene_name == "CTNNB1", (
        "Unexpected gene name: %s" % ctnnb1_exon.gene_name)
    assert ctnnb1_exon.contig == "3", ctnnb1_exon.contig

    # Strand, via the raw symbol and both boolean spellings.
    assert ctnnb1_exon.strand == "+"
    assert ctnnb1_exon.on_forward_strand
    assert ctnnb1_exon.on_positive_strand

    # Coordinates and the length they imply.
    assert ctnnb1_exon.start == 41224526, (
        "Unexpected exon start: %s" % ctnnb1_exon.start)
    assert ctnnb1_exon.end == 41224753, (
        "Unexpected exon end: %s" % ctnnb1_exon.end)
    assert ctnnb1_exon.length == len(ctnnb1_exon) == 228
github hammerlab / cohorts / cohorts / cohort.py View on Github external
Returns
        -------
        kallisto_data : Pandas dataframe
            Pandas dataframe with Kallisto data for all patients
            columns include patient_id, gene_name, est_counts
        """
        kallisto_data = pd.concat(
            [self._load_single_patient_kallisto(patient) for patient in self],
            copy=False
        )

        if self.kallisto_ensembl_version is None:
            raise ValueError("Required a kallisto_ensembl_version but none was specified")

        ensembl_release = cached_release(self.kallisto_ensembl_version)

        kallisto_data["gene_name"] = \
            kallisto_data["target_id"].map(lambda t: ensembl_release.gene_name_of_transcript_id(t))

        # sum counts across genes
        kallisto_data = \
            kallisto_data.groupby(["patient_id", "gene_name"])[["est_counts"]].sum().reset_index()

        return kallisto_data