import os
import csv
import logging
# read_unicode_csv / write_unicode_csv and the *_client modules used in
# the snippets below are assumed to come from the INDRA package
from indra.util import read_unicode_csv, write_unicode_csv
from indra.databases import chebi_client, pubchem_client, hgnc_client

logger = logging.getLogger(__name__)


def _build_uniprot_entries():
    up_entries_file = os.path.dirname(os.path.abspath(__file__)) + \
        '/../resources/uniprot_entries.tsv'
    uniprot_gene_name = {}
    uniprot_mnemonic = {}
    uniprot_mnemonic_reverse = {}
    uniprot_mgi = {}
    uniprot_rgd = {}
    uniprot_mgi_reverse = {}
    uniprot_rgd_reverse = {}
    try:
        csv_rows = read_unicode_csv(up_entries_file, delimiter='\t')
        # Skip the header row
        next(csv_rows)
        for row in csv_rows:
            up_id, gene_name, up_mnemonic, rgd, mgi = row
            uniprot_gene_name[up_id] = gene_name
            uniprot_mnemonic[up_id] = up_mnemonic
            uniprot_mnemonic_reverse[up_mnemonic] = up_id
            # Keep only the first of possibly several semicolon-separated
            # MGI/RGD IDs for the forward and reverse mappings
            if mgi:
                mgi_ids = mgi.split(';')
                if mgi_ids:
                    uniprot_mgi[up_id] = mgi_ids[0]
                    uniprot_mgi_reverse[mgi_ids[0]] = up_id
            if rgd:
                rgd_ids = rgd.split(';')
                if rgd_ids:
                    uniprot_rgd[up_id] = rgd_ids[0]
                    uniprot_rgd_reverse[rgd_ids[0]] = up_id
    except IOError:
        pass
    return (uniprot_gene_name, uniprot_mnemonic, uniprot_mnemonic_reverse,
            uniprot_mgi, uniprot_rgd, uniprot_mgi_reverse,
            uniprot_rgd_reverse)
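
# Usage sketch: the returned maps support forward and reverse lookups.
# EGFR/P00533 is a real UniProt example, used here only for illustration.
(up_gene_names, up_mnemonics, up_mnemonics_rev,
 up_mgi, up_rgd, up_mgi_rev, up_rgd_rev) = _build_uniprot_entries()
print(up_gene_names.get('P00533'))         # 'EGFR' if in the resource file
print(up_mnemonics_rev.get('EGFR_HUMAN'))  # 'P00533' if in the resource file
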
def _load_db_mappings(path):
    mesh_to_db = {}
    db_to_mesh = {}
    to_db_ambigs = set()
    db_to_ambigs = set()
    for _, mesh_id, _, db_ns, db_id, _ in \
            read_unicode_csv(path, delimiter='\t'):
        # Make sure we don't add any one-to-many mappings
        if mesh_id in mesh_to_db:
            to_db_ambigs.add(mesh_id)
            mesh_to_db.pop(mesh_id, None)
        elif mesh_id not in to_db_ambigs:
            mesh_to_db[mesh_id] = (db_ns, db_id)
        # Make sure we don't add any one-to-many reverse mappings
        if (db_ns, db_id) in db_to_mesh:
            db_to_ambigs.add((db_ns, db_id))
            db_to_mesh.pop((db_ns, db_id), None)
        elif (db_ns, db_id) not in db_to_ambigs:
            db_to_mesh[(db_ns, db_id)] = mesh_id
    return mesh_to_db, db_to_mesh
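
# Minimal sketch of the one-to-many filtering above: a MESH ID mapped to
# two different DB entries is treated as ambiguous and dropped. All IDs
# and the six-column layout below are made up for illustration.
import tempfile

_rows = ['x\tD000001\tx\tCHEBI\tCHEBI:1\tx',
         'x\tD000001\tx\tCHEBI\tCHEBI:2\tx',  # second mapping: ambiguous
         'x\tD000002\tx\tCHEBI\tCHEBI:3\tx']
with tempfile.NamedTemporaryFile('w', suffix='.tsv', delete=False) as fh:
    fh.write('\n'.join(_rows))
mesh_to_db, db_to_mesh = _load_db_mappings(fh.name)
assert 'D000001' not in mesh_to_db  # filtered out as ambiguous
assert mesh_to_db['D000002'] == ('CHEBI', 'CHEBI:3')
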
def _build_uniprot_subcell_loc():
    fname = os.path.dirname(os.path.abspath(__file__)) + \
        '/../resources/uniprot_subcell_loc.tsv'
    try:
        csv_rows = read_unicode_csv(fname, delimiter='\t')
        # Skip the header row
        next(csv_rows)
        subcell_loc = {}
        for row in csv_rows:
            loc_id = row[0]
            loc_alias = row[3]
            subcell_loc[loc_id] = loc_alias
    except IOError:
        subcell_loc = {}
    return subcell_loc
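
# Usage sketch: maps UniProt subcellular location IDs to aliases; the
# 'SL-0039' key shows the ID format and is not a verified entry.
subcell = _build_uniprot_subcell_loc()
print(subcell.get('SL-0039'))
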
def _read_tfs():
    fname = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         os.pardir, 'resources',
                         'transcription_factors.csv')
    tf_table = read_unicode_csv(fname)
    # Column 1 holds the gene symbol; slicing from 1 skips the header row
    gene_names = [row[1] for row in list(tf_table)[1:]]
    return gene_names
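
# Usage sketch: the symbol list can back a simple membership test.
# TP53 is a well-known transcription factor, assumed to be in the file.
tf_names = set(_read_tfs())
print('TP53' in tf_names)
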
# chebi_pubchem and pubchem_chebi are assumed to be dicts mapping each ID
# to the list of IDs it is paired with in the source mapping table
ik_matches = set()
# Looking for InChIKey matches for duplicates in the ChEBI -> PubChem
# direction
for chebi_id, pc_ids in chebi_pubchem.items():
    if len(pc_ids) > 1:
        ck = chebi_client.get_inchi_key(chebi_id)
        for pc_id in pc_ids:
            pk = pubchem_client.get_inchi_key(pc_id)
            if ck == pk:
                ik_matches.add((chebi_id, pc_id))
# Looking for InChIKey matches for duplicates in the PubChem -> ChEBI
# direction
for pc_id, chebi_ids in pubchem_chebi.items():
    if len(chebi_ids) > 1:
        pk = pubchem_client.get_inchi_key(pc_id)
        for chebi_id in chebi_ids:
            ck = chebi_client.get_inchi_key(chebi_id)
            if ck == pk:
                ik_matches.add((chebi_id, pc_id))
# Rewrite the mapping file with an IK_MATCH column flagging the pairs
# whose InChIKeys agreed
rows = read_unicode_csv(fname, '\t')
header = next(rows)
header.append('IK_MATCH')
new_rows = [header]
for chebi_id, pc_id in rows:
    if (chebi_id, pc_id) in ik_matches:
        new_rows.append([chebi_id, pc_id, 'Y'])
    else:
        new_rows.append([chebi_id, pc_id, ''])
write_unicode_csv(fname, new_rows, '\t')
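
# Worked illustration of the flagging logic with made-up IDs: only the
# pair whose InChIKeys matched carries 'Y' in the appended column.
_ik_matches = {('CHEBI:1', '11111')}
_pairs = [('CHEBI:1', '11111'), ('CHEBI:1', '22222')]
_flagged = [[c, p, 'Y' if (c, p) in _ik_matches else ''] for c, p in _pairs]
# -> [['CHEBI:1', '11111', 'Y'], ['CHEBI:1', '22222', '']]
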
# Save ChEMBL mapping
fname = os.path.join(path, 'chebi_to_chembl.tsv')
logger.info('Saving into %s' % fname)
# Filter, then sort a copy to avoid pandas' SettingWithCopyWarning
df_chembl = df[df['REFERENCE_DB_NAME'] == 'ChEMBL'].copy()
df_chembl.sort_values(['COMPOUND_ID', 'REFERENCE_ID'], ascending=True,
                      inplace=True)
# Write the sorted mapping as TSV
df_chembl.to_csv(fname, sep='\t', index=False)
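
# Toy sketch of the filter -> sort -> write pattern above, using a
# hand-made frame; values are placeholders, not real ChEBI/ChEMBL pairs.
import pandas as pd

toy = pd.DataFrame({'COMPOUND_ID': ['CHEBI:2', 'CHEBI:1'],
                    'REFERENCE_ID': ['CHEMBL2', 'CHEMBL1'],
                    'REFERENCE_DB_NAME': ['ChEMBL', 'ChEMBL']})
toy_chembl = (toy[toy['REFERENCE_DB_NAME'] == 'ChEMBL']
              .sort_values(['COMPOUND_ID', 'REFERENCE_ID']))
print(toy_chembl.to_csv(sep='\t', index=False))
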
def _read_famplex_map():
    fname = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         '../../resources/famplex_map.tsv')
    famplex_map = {}
    csv_rows = read_unicode_csv(fname, delimiter='\t')
    for row in csv_rows:
        source_ns = row[0]
        source_id = row[1]
        be_id = row[2]
        famplex_map[(source_ns, source_id)] = be_id
    return famplex_map
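
# Usage sketch (hypothetical key): entries are keyed by a
# (source namespace, source ID) pair and map to a FamPlex ID.
famplex_map = _read_famplex_map()
print(famplex_map.get(('NXP', 'FA:erk')))  # hypothetical source entry
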
        Path to csv file containing grounding map information. Rows of
        the file should be of the form
        <text>,<db_ns_1>,<db_id_1>,...,<db_ns_n>,<db_id_n>
    lineterminator : Optional[str]
        Line terminator used in input csv file. Default: \r\n
    hgnc_symbols : Optional[bool]
        Set to True if the grounding map file contains HGNC symbols
        rather than IDs. In this case, the entries are replaced by IDs.
        Default: True

    Returns
    -------
    gmap : dict
        The grounding map constructed from the given files.
    """
    gmap = {}
    map_rows = read_unicode_csv(grounding_map_path, delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL,
                                lineterminator=lineterminator)
    for row in map_rows:
        txt = row[0]
        # Odd-indexed columns are database namespaces, even-indexed
        # columns (from 2) are the corresponding IDs
        keys = [entry for entry in row[1::2] if entry]
        values = [entry for entry in row[2::2] if entry]
        if not keys or not values:
            logger.warning('Missing grounding entries for %s, skipping.'
                           % txt)
            continue
        if len(keys) != len(values):
            logger.warning('Mismatched keys and values in row %s, '
                           'skipping.' % str(row))
            continue
        gmap[txt] = dict(zip(keys, values))
    if hgnc_symbols:
        # Replace HGNC symbols with the corresponding HGNC IDs
        for txt, mapping in gmap.items():
            if mapping and 'HGNC' in mapping:
                hgnc_id = hgnc_client.get_hgnc_id(mapping['HGNC'])
                if hgnc_id:
                    mapping['HGNC'] = hgnc_id
                else:
                    logger.warning('Could not get HGNC ID for symbol %s'
                                   % mapping['HGNC'])
    return gmap
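
# Worked example of the interleaved row layout (hypothetical row): the
# first column is the text string, then alternating namespace/ID pairs.
example_row = ['ERK', 'FPLX', 'ERK', 'NCIT', 'C104986']
example_keys = [e for e in example_row[1::2] if e]    # ['FPLX', 'NCIT']
example_values = [e for e in example_row[2::2] if e]  # ['ERK', 'C104986']
print(dict(zip(example_keys, example_values)))
# -> {'FPLX': 'ERK', 'NCIT': 'C104986'}
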
def _read_hgnc_maps():
    hgnc_file = os.path.dirname(os.path.abspath(__file__)) + \
        '/../resources/hgnc_entries.tsv'
    csv_rows = read_unicode_csv(hgnc_file, delimiter='\t', encoding='utf-8')
    hgnc_names = {}
    hgnc_ids = {}
    hgnc_withdrawn = []
    uniprot_ids = {}
    entrez_ids = {}
    entrez_ids_reverse = {}
    mouse_map = {}
    rat_map = {}
    prev_sym_map = {}
    ensembl_ids = {}
    ensembl_ids_reverse = {}
    hgnc_withdrawn_new_ids = {}
    # Skip the header
    next(csv_rows)
    for row in csv_rows:
        # IDs are stored as e.g. 'HGNC:5'; strip the 'HGNC:' prefix to
        # keep just the numeric part
        hgnc_id = row[0][5:]
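
# Tiny illustration of the prefix stripping above (made-up row value):
# 'HGNC:' is five characters long, so [5:] leaves the bare numeric ID.
print('HGNC:5'[5:])  # -> '5'
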
def _read_resource_csv(fname):
    file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             os.pardir, 'resources', fname)
    csv_reader = read_unicode_csv(file_path, delimiter='\t')
    return csv_reader
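
# Usage sketch: the returned reader is a one-shot iterator over row
# lists, so e.g. the header can be pulled off with next().
reader = _read_resource_csv('hgnc_entries.tsv')
header = next(reader)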