Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_UCSC_annotation(localname=None):
    """Test UCSC annotation

    Downloads the "sacCer3" annotation through the UCSC provider into a
    temporary directory, then passes the expected gzipped GTF and BED paths
    to the validation helpers.

    Parameters
    ----------
    localname : str , optional
        Custom name for the genome. Forwarded to the download call and used
        to build the expected output paths.
    """
    tmp = mkdtemp()
    try:
        p = genomepy.provider.ProviderBase.create("UCSC")
        name = "sacCer3"
        p.download_annotation(name, tmp, localname=localname)

        # Output files are expected under <tmp>/<localname>/<localname>.annotation.*
        localname = genomepy.utils.get_localname(name, localname)
        gtf = os.path.join(tmp, localname, localname + ".annotation.gtf.gz")
        validate_gzipped_gtf(gtf)

        bed = os.path.join(tmp, localname, localname + ".annotation.bed.gz")
        validate_gzipped_bed(bed)
    finally:
        # Clean up even when the download or a validation step raises,
        # so failed runs do not leave temporary directories behind.
        shutil.rmtree(tmp)
def test_ensembl_annotation(localname=None):
    """Test Ensembl annotation

    This annotation is hosted on https://ftp.ensembl.org.

    Parameters
    ----------
    localname : str , optional
        Custom name for the genome. Forwarded to the download call and used
        to build the expected output paths.
    """
    tmp = mkdtemp()
    try:
        p = genomepy.provider.ProviderBase.create("Ensembl")

        # Only test on vertebrates as these are downloaded over HTTPS.
        # All others are downloaded over FTP, which is unreliable on Travis.
        for name, version in [("KH", 98)]:
            p.download_annotation(name, tmp, localname=localname, version=version)

            # Use a per-iteration variable: rebinding `localname` here would make
            # every later genome reuse the first genome's name.
            safe_name = genomepy.utils.get_localname(name, localname)
            gtf = os.path.join(tmp, safe_name, safe_name + ".annotation.gtf.gz")
            validate_gzipped_gtf(gtf)

            bed = os.path.join(tmp, safe_name, safe_name + ".annotation.bed.gz")
            validate_gzipped_bed(bed)
    finally:
        # Clean up even when the download or a validation step raises.
        shutil.rmtree(tmp)
def test_url_annotation(localname=None):
    """Test url annotation

    Requests the annotation belonging to a genome fasta URL through the URL
    provider, downloading into a temporary directory, then passes the
    expected gzipped GTF and BED paths to the validation helpers.

    Parameters
    ----------
    localname : str , optional
        Custom name for the genome. Forwarded to the download call and used
        to build the expected output paths.
    """
    tmp = mkdtemp()
    try:
        p = genomepy.provider.ProviderBase.create("URL")
        name = "http://ftp.xenbase.org/pub/Genomics/JGI/Xentr9.1/XT9_1.fa.gz"
        p.download_annotation(name, tmp, localname=localname)

        # Output files are expected under <tmp>/<localname>/<localname>.annotation.*
        localname = genomepy.utils.get_localname(name, localname)
        gtf = os.path.join(tmp, localname, localname + ".annotation.gtf.gz")
        validate_gzipped_gtf(gtf)

        bed = os.path.join(tmp, localname, localname + ".annotation.bed.gz")
        validate_gzipped_bed(bed)
    finally:
        # Clean up even when the download or a validation step raises.
        shutil.rmtree(tmp)
def search_url_for_annotation(url):
"""Attempts to find a gtf or gff3 file in the same location as the genome url"""
urldir = os.path.dirname(url)
sys.stderr.write(
"You have requested gene annotation to be downloaded.\n"
"Genomepy will check the remote directory:\n"
f"{urldir} for annotation files...\n"
)
# try to find a GTF or GFF3 file
name = get_localname(url)
with urlopen(urldir) as f:
for urlline in f.readlines():
urlstr = str(urlline)
if any(
substring in urlstr.lower() for substring in [".gtf", name + ".gff"]
):
break
# retrieve the filename from the HTML line
fname = ""
for split in re.split('>|<|><|/|"', urlstr):
if split.lower().endswith(
(
".gtf",
".gtf.gz",
name + ".gff",
url of where to download genome from
genomes_dir : str
Directory to install annotation
localname : str , optional
Custom name for your genome
kwargs: dict , optional:
Provider specific options.
to_annotation : str , optional
url to annotation file (only required if this not located in the same directory as the fasta)
"""
name = get_localname(url)
localname = get_localname(name, localname)
if kwargs.get("to_annotation"):
link = self.get_annotation_download_link(None, **kwargs)
else:
link = self.search_url_for_annotation(url)
self.attempt_and_report(name, localname, link, genomes_dir)
with the (first header fields of) the genome.fa will not be corrected.
kwargs : dict , optional
Provider specific options.
toplevel : bool , optional
Ensembl only: Always download the toplevel genome. Ignores potential primary assembly.
version : int , optional
Ensembl only: Specify release version. Default is latest.
to_annotation : text , optional
URL only: direct link to annotation file.
Required if this is not the same directory as the fasta.
"""
genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
localname = get_localname(name, localname)
out_dir = os.path.join(genomes_dir, localname)
# Check if genome already exists, or if downloading is forced
genome_found = (
len([f for f in glob_ext_files(out_dir) if f"{localname}.fa" in f]) >= 1
)
if (not genome_found or force) and not only_annotation:
# Download genome from provider
p = ProviderBase.create(provider)
p.download_genome(
name,
genomes_dir,
mask=mask,
regex=regex,
invert_match=invert_match,
localname=localname,
def _parse_name(name):
    """extract a safe name from file path, url or regular names

    The name is first passed through get_localname; a trailing ".fa" or
    ".fa.gz" extension is then removed and only the final path component
    is returned.
    """
    # The dots are escaped (and the pattern is a raw string) so that only a
    # literal ".fa"/".fa.gz" suffix is stripped. Unescaped, "." matches any
    # character and would also truncate names that merely end in "fa"
    # (e.g. "sofa" -> "s").
    return os.path.basename(re.sub(r"\.fa(\.gz)?$", "", get_localname(name)))