How to use the genomepy.Genome class in genomepy

To help you get started, we’ve selected a few genomepy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github vanheeringen-lab / genomepy / tests / test_genomepy.py View on Github external
def test_no_fasta_files():
    """Opening the "empty" genome in tests/data/genome must raise FileNotFoundError."""
    genome_name = "empty"
    genomes_dir = "tests/data/genome"
    with pytest.raises(FileNotFoundError):
        genomepy.Genome(genome_name, genomes_dir)
github vanheeringen-lab / genomepy / tests / test_03_download_genomes.py View on Github external
def test_url_genome():
    """Test installing a genome from a URL.

    Download the C. elegans (ce11) genome directly from a UCSC URL and
    retrieve a specific sequence.
    """
    tmp = mkdtemp()
    try:
        genomepy.install_genome(
            "http://hgdownload.soe.ucsc.edu/goldenPath/ce11/bigZips/chromFa.tar.gz",
            "url",
            genome_dir=tmp,
            localname="url_test",
        )
        g = genomepy.Genome("url_test", genome_dir=tmp)
        # First 12 bases of chrI, compared case-insensitively.
        assert str(g["chrI"][:12]).lower() == "gcctaagcctaa"
    finally:
        # Remove the download directory even when the install or the
        # assertion fails, so failed runs do not leave genomes behind.
        shutil.rmtree(tmp)
github vanheeringen-lab / genomepy / tests / test_01_basics.py View on Github external
def test_no_fasta_files():
    """Expect FileNotFoundError when loading the "empty" genome from tests/data/genome."""
    genome_args = ("empty", "tests/data/genome")
    with pytest.raises(FileNotFoundError):
        genomepy.Genome(*genome_args)
github vanheeringen-lab / genomepy / genomepy / annotation.py View on Github external
Returns
    -------
    pandas.DataFrame or dict
        Chromosome mapping.
    """
    if fmt.lower() not in ["dataframe", "dict"]:
        raise ValueError("Invalid format, should be 'dataframe' or 'dict'")

    logger.info("Loading chromosome mapping.")
    if to.startswith("GCA"):
        if provider is None:
            raise ValueError("Need a provider: NCBI, UCSC or Ensembl")
        asm_acc = to
    else:
        try:
            genome = Genome(to)
            logger.info("Using local genome information")
            asm_acc = genome.assembly_accession
            if provider is None:
                provider = genome.provider
        except Exception:
            logger.info("Searching remote genome information")
            result = [row for row in search(to, provider=provider)]
            if len(result) > 1:
                p = [row[1].decode() for row in result]
                raise ValueError(
                    f"More than one result, need one of these providers: {', '.join(p)}"
                )
            if provider is None:
                provider = result[0][1].decode()
            asm_acc = result[0][2].decode()
github vanheeringen-lab / gimmemotifs / gimmemotifs / scanner.py View on Github external
motif_digest = self.checksum.get(motif_file, None)

        # determine which regions are not in the cache
        scan_regions = regions
        if self.use_cache:
            scan_regions = []
            for region in regions:
                key = str((region, genome, motif_digest, nreport, scan_rc))
                ret = self.cache.get(key)
                if ret == NO_VALUE:
                    scan_regions.append(region)

        # scan the regions that are not in the cache
        if len(scan_regions) > 0:

            g = Genome(genome)

            motifs = [(m, self.threshold[m.id]) for m in read_motifs(self.motifs)]
            scan_func = partial(
                scan_region_mult,
                genome=g,
                motifs=motifs,
                nreport=nreport,
                scan_rc=scan_rc,
            )

            for region, ret in self._scan_jobs(scan_func, scan_regions):
                # return values or store values in cache
                if self.use_cache:
                    # store values in cache
                    key = str(
                        (
github vanheeringen-lab / genomepy / genomepy / annotation.py View on Github external
are: ensembl.gene, entrezgene, symbol, name, refseq, entrezgene. Note that
        refseq will return the protein refseq_id by default, use `product="rna"` to
        return the RNA refseq_id. Currently, mapping to Ensembl transcript ids is
        not supported.

    product : str, optional
        Either "protein" or "rna". Only used when `gene_field="refseq"`

    Returns
    -------
    pandas.DataFrame with gene annotation.
    """
    if product not in ["rna", "protein"]:
        raise ValueError(f"Argument product should be either 'rna' or 'protein'")

    g = Genome(genome)
    for anno_file in [f"{genome}.annotation.bed.gz", f"{genome}.annotation.bed"]:
        bed = os.path.join(os.path.dirname(g.filename), anno_file)
        if os.path.exists(bed):
            break
        else:
            bed = None

    if bed is None:
        logger.info(f"No annotation file found for genome {genome}")
        return

    bed12_fields = [
        "chrom",
        "start",
        "end",
        "name",
github vanheeringen-lab / gimmemotifs / gimmemotifs / background.py View on Github external
def __init__(self, matchfile, genome="hg19", number=None, size=None):
        """Initialize this Fasta from random genomic sequences.

        A temporary BED file is produced by ``matched_gc_bedfile`` from
        ``matchfile``, converted to FASTA with ``Genome.track2fasta`` and
        loaded through ``Fasta.__init__``. Both temporary files are removed
        afterwards, also when one of the steps raises.

        Parameters
        ----------
        matchfile
            Passed to ``matched_gc_bedfile`` as the file to match against.
        genome : str, optional
            Genome handed to ``matched_gc_bedfile`` and ``Genome``.
        number, size : optional
            Forwarded unchanged to ``matched_gc_bedfile``.
        """
        # Only the generated names are kept; the NamedTemporaryFile objects
        # themselves are discarded right away.
        tmpbed = NamedTemporaryFile(dir=mytmpdir()).name
        tmpfasta = NamedTemporaryFile(dir=mytmpdir()).name

        try:
            # Create bed-file with coordinates of random sequences
            matched_gc_bedfile(tmpbed, matchfile, genome, number, size=size)

            # Convert track to fasta
            Genome(genome).track2fasta(tmpbed, fastafile=tmpfasta)

            # Initialize super Fasta object
            Fasta.__init__(self, tmpfasta)
        finally:
            # Delete the temporary files. A failed step may not have created
            # them, so check first to avoid masking the original exception.
            for path in (tmpbed, tmpfasta):
                if os.path.exists(path):
                    os.remove(path)
github vanheeringen-lab / gimmemotifs / gimmemotifs / background.py View on Github external
def create_random_genomic_bedfile(out, genome, size, n):
    """Write the features from ``Genome.get_random_sequences`` to a BED file.

    Parameters
    ----------
    out : str
        Path of the file to write; it is opened in "w" mode.
    genome
        Passed to ``Genome`` to obtain the genome object.
    size, n
        Forwarded to ``get_random_sequences(n, size)``.

    Each feature is unpacked as ``(chrom, start, end)`` and written as one
    tab-separated line.
    """
    features = Genome(genome).get_random_sequences(n, size)

    # Use a context manager so the handle is flushed and closed even if
    # writing fails part-way through.
    with open(out, "w") as bed:
        for chrom, start, end in features:
            bed.write("%s\t%d\t%d\n" % (chrom, start, end))
github vanheeringen-lab / gimmemotifs / gimmemotifs / scanner.py View on Github external
def set_genome(self, genome):
        """
        Store the genome that is used for:
            - converting regions to sequences
            - background for MOODS

        A falsy ``genome`` is ignored and leaves the instance untouched.
        """
        if genome:
            # Instantiating Genome acts as validation: it raises an error
            # if its checks fail, in which case nothing is stored.
            Genome(genome)
            self.genome = genome