Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# NOTE(review): this excerpt has lost its indentation, so the nesting of the
# statements below cannot be confirmed from the text alone.
# Optionally order the table by `sortby`, largest values first (in place).
if sortby:
df.sort_values(sortby, inplace=True, ascending=False)
# Optionally keep only the rows selected by the `selection` expression.
if selection:
selection_str = parse_selection_string(selection, df_name='df')
# NOTE(review): pd.eval evaluates an expression string built from
# `selection`; confirm that input is trusted or validated upstream.
mask = pd.eval(selection_str)
df = df[mask]
# Pair each row's 'Name' value with its 'ShapeQuery' value.
dbase_query_pairs = [(d, q) for d, q in
zip(df['Name'].values, df['ShapeQuery'].values)]
# Distinct 'ShapeQuery' values; used for membership tests further down.
query_names = {q for q in df['ShapeQuery'].values}
# Filled below: molecule index -> joined MOL2 content.
query_mol2s = {}
# Peek at query_path: the flag flips to True as soon as split_multimol2
# yields a second record, and the scan stops there.
multiconf_query = False
for idx, cont in enumerate(split_multimol2(query_path)):
if idx >= 1:
multiconf_query = True
break
# Starts at -1 so that the first `cnt += 1` produces 0.
cnt = -1
if query_path.endswith('.gz'):
for id_, cont in split_multimol2(query_path):
cnt += 1
# The chunks are joined as bytes, decoded as UTF-8 and re-split on '\n'.
cont = b''.join(cont).decode('utf-8').split('\n')
if multiconf_query:
# Index is "<decoded id>_<running record counter>".
mol_idx = '%s_%d' % (id_.decode('utf-8'), cnt)
else:
# NOTE(review): id_ is used undecoded here although the branch above
# decodes it -- confirm it can still match the values in query_names.
mol_idx = id_
if mol_idx in query_names:
# NOTE(review): the excerpt is cut off after this line.
if id_suffix:
# NOTE(review): excerpt with stripped indentation; `cnt`, `multiconf_query`,
# `query_names`, `query_mol2s` and `id_suffix` are defined outside this view.
# Collect the content of every record whose index appears in query_names.
if query_path.endswith('.gz'):
for id_, cont in split_multimol2(query_path):
cnt += 1
# The chunks are joined as bytes, decoded as UTF-8 and re-split on '\n'.
# NOTE(review): split('\n') drops the line terminators and the later
# ''.join(cont) does not restore them -- confirm this is intended.
cont = b''.join(cont).decode('utf-8').split('\n')
if multiconf_query:
# Index is "<decoded id>_<running record counter>".
mol_idx = '%s_%d' % (id_.decode('utf-8'), cnt)
else:
# NOTE(review): id_ is used undecoded here although the branch above
# decodes it -- confirm it can still match the values in query_names.
mol_idx = id_
if mol_idx in query_names:
if id_suffix:
# Overwrite the second entry of the record with the index.
cont[1] = mol_idx + '\n'
query_mol2s[mol_idx] = ''.join(cont)
else:
# Non-.gz input: id_ and cont are used directly as str values.
for id_, cont in split_multimol2(query_path):
cnt += 1
if multiconf_query:
mol_idx = '%s_%d' % (id_, cnt)
else:
mol_idx = id_
if mol_idx in query_names:
if id_suffix:
# Overwrite the second entry of the record with the index.
cont[1] = mol_idx + '\n'
query_mol2s[mol_idx] = ''.join(cont)
# NOTE(review): excerpt with stripped indentation; it is cut off at the end.
# Output prefix: output_dir joined with the input file's base name, keeping
# only the part before the first '.mol2'.
out_path_base = os.path.join(output_dir, os.path.basename(inp_mol2_path)
.split('.mol2')[0])
# Output names are "<prefix>_query.mol2" and "<prefix>_dbase.mol2".
out_path_q = '%s_%s' % (out_path_base, 'query.mol2')
out_path_d = '%s_%s' % (out_path_base, 'dbase.mol2')
# The temporary directory is removed when the `with` block exits.
with tempfile.TemporaryDirectory() as tmpdirname:
if query_path.endswith('.gz'):
for id_, cont in split_multimol2(query_path):
cnt += 1
# The chunks are joined as bytes, decoded as UTF-8 and re-split on '\n'.
cont = b''.join(cont).decode('utf-8').split('\n')
if multiconf_query:
# Index is "<decoded id>_<running record counter>".
mol_idx = '%s_%d' % (id_.decode('utf-8'), cnt)
else:
# NOTE(review): id_ stays undecoded in this branch -- confirm intended.
mol_idx = id_
"""
# NOTE(review): excerpt with stripped indentation; it is cut off inside the
# non-.gz branch. `open_file` and `write_mode` are defined outside this view.
# Copy inp_mol2_path to out_mol2_path, replacing the second entry of every
# record with "<id>_<n>", where n counts consecutive records sharing an id.
with open_file(out_mol2_path, write_mode) as outfile:
# Id of the previously written record; '' never equals a bytes id.
prev_molecule = ''
if inp_mol2_path.endswith('.gz'):
for i, (id_, cont) in enumerate(split_multimol2(inp_mol2_path)):
# A changed id restarts the counter at 0; a repeated id increments it.
if prev_molecule != id_:
cnt = 0
else:
cnt += 1
# Bytes formatting: id_ and cont are handled as bytes in this branch.
mol_idx = b'%s_%d' % (id_, cnt)
cont[1] = mol_idx + b'\n'
outfile.write(b''.join(cont))
prev_molecule = id_
else:
for i, (id_, cont) in enumerate(split_multimol2(inp_mol2_path)):
if prev_molecule != id_:
cnt = 0
else:
# NOTE(review): indentation is stripped and the definition is cut off after
# `if settings:`; EXECUTABLE and QUERY_FILE are defined outside this view.
def run_rocs(source_file, target_file, n_processes, settings):
# Output prefix: target_file with its last '.mol2' occurrence (and anything
# after it) removed.
prefix = ''.join(target_file.split('.mol2')[:-1])
sys.stdout.write('Processing %s\n' % os.path.basename(source_file))
sys.stdout.flush()
# mcquery becomes 'true' once QUERY_FILE yields a second record, and
# 'false' when the index is 0.
# NOTE(review): if QUERY_FILE yields no records, mcquery is never assigned
# before it is used in `cmd` -- confirm that case cannot occur.
for idx, mol2 in enumerate(split_multimol2(QUERY_FILE)):
if idx >= 1:
mcquery = 'true'
break
if not idx:
mcquery = 'false'
# Argument list for the external executable; every value is a string.
cmd = [EXECUTABLE,
'-ref', QUERY_FILE,
'-dbase', source_file,
'-outputquery', 'false',
'-prefix', prefix,
'-mcquery', mcquery,
'-mpi_np', str(n_processes),
'-oformat', 'mol2']
# NOTE(review): the excerpt ends here; the handling of `settings` is not shown.
if settings:
# NOTE(review): excerpt with stripped indentation; `outfile`, `prev_molecule`,
# `verbose` and `start` are defined outside this view.
# Write every record with its second entry replaced by "<id>_<n>", where n
# counts consecutive records that share the same id.
if inp_mol2_path.endswith('.gz'):
# .gz input: ids and content are handled as bytes.
for i, (id_, cont) in enumerate(split_multimol2(inp_mol2_path)):
# A changed id restarts the counter at 0; a repeated id increments it.
if prev_molecule != id_:
cnt = 0
else:
cnt += 1
mol_idx = b'%s_%d' % (id_, cnt)
cont[1] = mol_idx + b'\n'
outfile.write(b''.join(cont))
prev_molecule = id_
else:
# Plain input: the same logic with str values.
for i, (id_, cont) in enumerate(split_multimol2(inp_mol2_path)):
if prev_molecule != id_:
cnt = 0
else:
cnt += 1
mol_idx = '%s_%d' % (id_, cnt)
cont[1] = mol_idx + '\n'
outfile.write(''.join(cont))
prev_molecule = id_
if verbose:
elapsed = time.time() - start
# `i` is the last enumerate index, so i + 1 is the number of records.
# NOTE(review): `i` is unbound if the input yielded no records, and the
# division fails if `elapsed` is 0 -- confirm neither can happen.
n_molecules = i + 1
sys.stdout.write(' | scanned %d molecules | %d mol/sec\n' %
(n_molecules, n_molecules / elapsed))
# NOTE(review): excerpt with stripped indentation; `f`, `lazy_imap`,
# `data_processor`, `data_processor_gz` and `n_cpus` are defined elsewhere.
# For each input file, write the ids returned by the processor to `f`.
for mol2_file in mol2_files:
if verbose:
start = time.time()
sys.stdout.write('Processing %s' % os.path.basename(mol2_file))
sys.stdout.flush()
# Number of results received for this file.
cnt = 0
# Pick the processor variant by file extension.
if mol2_file.endswith('.gz'):
data_processor_fn = data_processor_gz
else:
data_processor_fn = data_processor
for chunk in lazy_imap(data_processor=data_processor_fn,
data_generator=split_multimol2(mol2_file),
n_cpus=n_cpus):
# Write one line per truthy id; falsy entries in the chunk are skipped.
# The list is built only for the side effect and then discarded.
_ = [f.write('%s\n' % mol2_id)for mol2_id in chunk if mol2_id]
# Counts every entry of the chunk, including the skipped falsy ones.
cnt += len(chunk)
if verbose:
elapsed = time.time() - start
# NOTE(review): raises ZeroDivisionError if `elapsed` is 0.
sys.stdout.write(' | %d mol/sec\n' % (cnt / elapsed))
sys.stdout.flush()
# NOTE(review): excerpt with stripped indentation. It starts in the middle of
# a `with` statement (`opq` is bound outside this view) and is cut off after
# the final `if verbose:`.
dbase_open_file(output_mol2_path_dbase, dbase_write_mode) as opd:
# For each selected index, copy the i-th record of the query file and of the
# dbase file to the corresponding output file.
for i in selection_indices:
# Placeholder text, kept if the scans below find no record at index i.
# NOTE(review): `.ix` was removed in pandas 1.0 -- confirm the pandas
# version this runs against.
mol2_q_cont = ('DID NOT FIND %s\n'
% (df_atom.ix[i]['query']))
mol2_d_cont = ('DID NOT FIND %s\n'
% (df_atom.ix[i]['dbase']))
# Scan the query file from the start until the record at position i.
# NOTE(review): both files are re-read from the beginning for every i.
for idx, mol2 in enumerate(split_multimol2(
input_mol2_path_query)):
if idx == i:
mol2_q_cont = mol2[1]
break
# Same positional scan for the dbase file.
for idx, mol2 in enumerate(split_multimol2(
input_mol2_path_dbase)):
if idx == i:
mol2_d_cont = mol2[1]
break
# Join as bytes or as str, depending on the mode the file was opened with.
# NOTE(review): the str placeholder above cannot be joined with b'' --
# confirm the not-found case cannot occur in 'wb' mode.
if query_write_mode == 'wb':
opq.write(b''.join(mol2_q_cont))
else:
opq.write(''.join(mol2_q_cont))
if dbase_write_mode == 'wb':
opd.write(b''.join(mol2_d_cont))
else:
opd.write(''.join(mol2_d_cont))
if verbose:
# NOTE(review): excerpt with stripped indentation; `d_base`, `q_base`,
# `dct_results`, `lazy_imap` and the processor functions are defined elsewhere.
if verbose:
start = time.time()
sys.stdout.write('Processing %s/%s' % (d_base, q_base))
sys.stdout.flush()
# Number of results received for this file pair.
cnt = 0
# The processor variant is picked from the query path's extension only.
if q_path.endswith('.gz'):
data_processor_fn = data_processor_gz
else:
data_processor_fn = data_processor
# Records of d_path and q_path are paired positionally; zip stops at the
# shorter of the two inputs.
for chunk in lazy_imap(data_processor=data_processor_fn,
data_generator=zip(split_multimol2(d_path),
split_multimol2(q_path)),
n_cpus=n_cpus):
# Each result is a 4-tuple, appended column-wise to dct_results.
for dbase_id, query_id, atoms, charges in chunk:
dct_results['dbase'].append(dbase_id)
dct_results['query'].append(query_id)
dct_results['atoms'].append(atoms)
dct_results['charges'].append(charges)
cnt += len(chunk)
"""
# NOTE(review): excerpt with stripped indentation; the loop body is not shown.
# One PandasMol2 instance is created for the query side and one for the dbase side.
q_pdmol = PandasMol2()
d_pdmol = PandasMol2()
# Records of q_path and d_path are paired positionally; zip stops at the
# shorter of the two inputs.
for q_mol2, d_mol2 in zip(split_multimol2(q_path),
split_multimol2(d_path)):
# NOTE(review): excerpt with stripped indentation; `f`, `lazy_imap`,
# `data_processor`, `data_processor_gz` and `n_cpus` are defined elsewhere.
# For each input file, write the ids returned by the processor to `f`.
for mol2_file in mol2_files:
if verbose:
start = time.time()
sys.stdout.write('Processing %s' % os.path.basename(mol2_file))
sys.stdout.flush()
# Number of results received for this file.
cnt = 0
# Pick the processor variant by file extension.
if mol2_file.endswith('.gz'):
data_processor_fn = data_processor_gz
else:
data_processor_fn = data_processor
for chunk in lazy_imap(data_processor=data_processor_fn,
data_generator=split_multimol2(
mol2_file),
n_cpus=n_cpus):
# Write one line per truthy id; falsy entries in the chunk are skipped.
# The list is built only for the side effect and then discarded.
_ = [f.write('%s\n' % mol2_id) for mol2_id
in chunk if mol2_id]
# Counts every entry of the chunk, including the skipped falsy ones.
cnt += len(chunk)
if verbose:
elapsed = time.time() - start
# NOTE(review): raises ZeroDivisionError if `elapsed` is 0.
sys.stdout.write(' | %d mol/sec\n' % (cnt / elapsed))
sys.stdout.flush()
def mol2_to_idfile(mol2_files, id_file_path, verbose=0):
    """Write the ID of every molecule found in `mol2_files` to a text file.

    Parameters
    ----------
    mol2_files : iterable of str
        Paths of the MOL2 files to scan. Each path is handed to
        `split_multimol2`, and the first element of every record it yields
        is taken as the molecule ID.
    id_file_path : str
        Path of the output file. It is opened in 'w' mode (an existing file
        is overwritten) and receives one molecule ID per line.
    verbose : int or bool (default: 0)
        If truthy, a progress line with the number of scanned molecules and
        the throughput is written to stdout for every input file.

    Returns
    -------
    None

    """
    with open(id_file_path, 'w') as f:
        for mol2_file in mol2_files:
            if verbose:
                sys.stdout.write('Processing %s' % os.path.basename(mol2_file))
                sys.stdout.flush()
                start = time.time()

            # Count explicitly instead of reusing the enumerate index: the
            # index would be unbound (or stale from the previous file) when
            # a file yields no molecules.
            n_molecules = 0
            for mol2 in split_multimol2(mol2_file):
                mol2_id = mol2[0]
                # Some inputs (e.g. gzipped files) are read as bytes; decode
                # so the ID can be written to the text-mode output file.
                if isinstance(mol2_id, bytes):
                    mol2_id = mol2_id.decode('utf-8')
                f.write(mol2_id + '\n')
                n_molecules += 1

            if verbose:
                elapsed = time.time() - start
                # A very small file can finish within the timer resolution;
                # report 0 rather than dividing by zero.
                rate = n_molecules / elapsed if elapsed > 0 else 0
                sys.stdout.write(' | scanned %d molecules | %d mol/sec\n' %
                                 (n_molecules, rate))
                sys.stdout.flush()