sys.exit()

logfile = os.path.join(DATA_FOLDER, 'entrez_dump.log')
setup_logfile(logfile)

# mark the start of the download
src_dump = get_src_dump()
doc = {'_id': 'entrez',
       'timestamp': TIMESTAMP,
       'data_folder': DATA_FOLDER,
       'logfile': logfile,
       'status': 'downloading'}
src_dump.save(doc)
t0 = time.time()

download(DATA_FOLDER, no_confirm=no_confirm)
t_download = timesofar(t0)
t1 = time.time()

# mark the start of parsing
src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
parse_gbff(DATA_FOLDER)
t_parsing = timesofar(t1)
t_total = timesofar(t0)

# mark the download as finished successfully
_updates = {
    'status': 'success',
    'time': {
        'download': t_download,
        'parsing': t_parsing,
        'total': t_total
    },
    'pending_to_upload': True    # a flag to trigger data uploading
}
"fin_ac": fields[170],
"fin_af": fields[171],
"nfe_ac": fields[172],
"nfe_af": fields[173]
},
"clinvar": {
"rs": fields[176],
"clinsig": fields[177],
"trait": fields[178],
"golden_stars": fields[179]
},
"gtex": gtex
}
}
one_snp_json = list_split(dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=["."]), ";")
one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
return one_snp_json
"hgvs": [i for i in df["clinvar_hgvs"].split("|") if i != "."],
"omim": [i for i in df["clinvar_OMIM_id"].split("|") if i != "."],
"medgen": [i for i in df["clinvar_MedGen_id"].split("|") if i != "."],
"orphanet": [i for i in df["clinvar_Orphanet_id"].split("|") if i != "."],
"var_source": [i for i in df["clinvar_var_source"].split("|") if i != "."]
},
"hgvsc": list(set(df["HGVSc_ANNOVAR"].split(';') + df["HGVSc_snpEff"].split(';') + df["HGVSc_VEP"].split(';'))),
"hgvsp": list(set(df["HGVSp_ANNOVAR"].split(';') + df["HGVSp_snpEff"].split(';') + df["HGVSp_VEP"].split(';'))),
"gtex": list(gtex),
"geuvadis_eqtl_target_gene": df["Geuvadis_eQTL_target_gene"]
}
}
if include_gnomad:
one_snp_json['dbnsfp'].update(gnomad)
one_snp_json = list_split(dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=[".", '-', "NA", None], remove_invalid_list=True), ";")
one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
return one_snp_json
def load_data(input_file):
    # parse the Pharos gene-id mapping file (CSV: pharos_id,entrez_gene_id)
    # and yield one document per Entrez gene id
    with open_anyfile(input_file) as in_f:
        result = defaultdict(list)
        for line in in_f:
            pharos_id, _id = line.strip().split(',')
            if _id != 'entrez_gene_id' and _id != '0':  # skip the header row and unmapped gene ids
                result[str(_id)].append(int(pharos_id))
        for k, v in result.items():
            json_doc = {'_id': str(k),
                        'pharos': {"target_id": v}}
            yield unlist(json_doc)
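# Hypothetical usage sketch (the file name below is illustrative, not from the source):
#     for doc in load_data('pharos_mapping.csv'):
#         print(doc['_id'], doc['pharos']['target_id'])
# Note that unlist() collapses a single-element target_id list to a bare value.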
def download(no_confirm=False):
    orig_path = os.getcwd()
    try:
        os.chdir(DATA_FOLDER)
        path, filename = os.path.split(DATAFILE_PATH)
        if os.path.exists(filename):
            if no_confirm or ask('Remove existing file "%s"?' % filename) == 'Y':
                os.remove(filename)
            else:
                logging.info("Skipped!")
                return
        logging.info('Downloading "%s"...' % filename)
        url = 'ftp://{}/{}'.format(FTP_SERVER, DATAFILE_PATH)
        cmdline = 'wget %s -O %s' % (url, filename)
        #cmdline = 'axel -a -n 5 %s' % url   # faster than wget, using 5 connections
        return_code = os.system(cmdline)
        if return_code == 0:
            logging.info("Success.")
        else:
            logging.info("Failed with return code (%s)." % return_code)
        logging.info("=" * 50)
    finally:
        os.chdir(orig_path)
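# Note: download() above shells out to wget via os.system() and relies on the
# module-level DATA_FOLDER, DATAFILE_PATH and FTP_SERVER settings; a non-zero
# return code is only logged, not raised.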
# RefSeq dump: check for a newer release, prepare DATA_FOLDER, and record progress in src_dump
refseq_release = get_refseq_release()
logging.info(refseq_release)

src_dump = get_src_dump()
doc = src_dump.find_one({'_id': 'refseq'})
if doc and 'release' in doc and refseq_release <= doc['release']:
    data_file = os.path.join(doc['data_folder'], 'complete.109.rna.gbff.gz')
    if os.path.exists(data_file):
        logging.info("No newer release found. Abort now.")
        sys.exit(0)

DATA_FOLDER = os.path.join(REFSEQ_FOLDER, str(refseq_release))
if not os.path.exists(DATA_FOLDER):
    os.makedirs(DATA_FOLDER)
else:
    if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
        sys.exit(0)

logfile = os.path.join(DATA_FOLDER, 'refseq_dump.log')
setup_logfile(logfile)

# mark the start of the download
doc = {'_id': 'refseq',
       'release': refseq_release,
       'timestamp': time.strftime('%Y%m%d'),
       'data_folder': DATA_FOLDER,
       'logfile': logfile,
       'status': 'downloading'}
src_dump.save(doc)
t0 = time.time()
try:
"fin_ac": fields[122],
"fin_af": fields[123],
"nfe_ac": fields[124],
"nfe_af": fields[125],
"sas_ac": fields[126],
"sas_af": fields[127]
},
"clinvar": {
"rs": fields[128],
"clinsig": fields[129],
"trait": fields[130]
}
}
}
one_snp_json = list_split(dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=["."]), ";")
one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
return one_snp_json
                     [int(x) for x in ld[10].split(',') if x]))
    assert len(exons) == int(ld[8]), (len(exons), int(ld[8]))
    ref2exons.setdefault(refseq, []).append({
        'transcript': refseq,
        'chr': chr,
        'strand': -1 if ld[3] == '-' else 1,
        'txstart': int(ld[4]),
        'txend': int(ld[5]),
        'cdsstart': int(ld[6]),
        'cdsend': int(ld[7]),
        'position': exons
    })

# map each RefSeq transcript back to its gene id via refLink.txt.gz and group
# the per-transcript exon structures under that gene id
gene2exons = {}
reflink_file = os.path.join(data_folder, '../hgFixed/database/refLink.txt.gz')
refseq2gene = tab2dict(reflink_file, (2, 6), 0, alwayslist=False)
for refseq in sorted(ref2exons.keys()):
    geneid = refseq2gene.get(refseq, None)
    if geneid and geneid != '0':
        if geneid not in gene2exons:
            gene2exons[geneid] = {exons_key: ref2exons[refseq]}
        else:
            gene2exons[geneid][exons_key].extend(ref2exons[refseq])

load_done('[%d, %s]' % (len(gene2exons), timesofar(t0)))
return gene2exons
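# The returned gene2exons dict is keyed by gene id; each value holds, under the
# exons_key key, a list of per-transcript exon structures, e.g.
# {'<geneid>': {exons_key: [{'transcript': ..., 'chr': ..., 'position': [...]}, ...]}}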
def load_broadinstitute_exac_any(one_file, key):
    logging.info("Loading file %s (%s)" % (one_file, key))
    data = tab2dict(one_file, (0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), 0)
    exacs = {}
    for transcript in data:
        tupleexac = data[transcript]
        # remove the version from the key so we can search the dict easily later
        exacs[transcript.split(".")[0]] = {"exac": {
            "transcript": transcript,   # but keep the version here
            "n_exons": int(tupleexac[0]),
            "cds_start": int(tupleexac[1]),
            "cds_end": int(tupleexac[2]),
            "bp": int(tupleexac[3]),
            key: {
                "mu_syn": float(tupleexac[4]),
                "mu_mis": float(tupleexac[5]),
                "mu_lof": float(tupleexac[6]),
                "n_syn": float(tupleexac[7]),
def load(self, aslist=False):
    '''
    Load the NCBI "homologene.data" file and add a "homologene" field
    to each gene doc.
    '''
    from biothings.utils.hub_db import get_src_dump
    homo_d = tab2dict(self.datafile, (2, 1), 0, header=0)
    entrez_doc = get_src_dump().find_one({"_id": "entrez"}) or {}
    entrez_dir = entrez_doc.get("download", {}).get("data_folder")
    assert entrez_dir, "Can't find Entrez data directory"
    DATAFILE = os.path.join(entrez_dir, 'gene_history.gz')
    assert os.path.exists(DATAFILE), "gene_history.gz is missing (entrez_dir: %s)" % entrez_dir
    retired2gene = tab2dict(DATAFILE, (1, 2), 1, alwayslist=0, includefn=lambda ld: ld[1] != '-')
    # re-key retired gene ids to their current gene ids
    for id in list(homo_d.keys()):
        homo_d[retired2gene.get(id, id)] = homo_d[id]
    with open(self.datafile) as df:
        homologene_d = {}
        doc_li = []
        print()
        geneid_d = get_geneid_d(entrez_dir, self.species_li, load_cache=False, save_cache=False, only_for=homo_d)
        for line in df:
            ld = line.strip().split('\t')
            hm_id, tax_id, geneid = [int(x) for x in ld[:3]]
            if (self.taxid_set is None or tax_id in self.taxid_set) and geneid in geneid_d:
                # keep selected species only, and skip gene ids that do not
                # match any existing gene doc