keep_duplicates = True
counter_sequence_dict = {}
counter = 0
for line in reference_fasta:
    if line.startswith('>'):
        old_header = line.replace('>', '').strip()
        counter_sequence_dict.setdefault(counter, old_header)
        new_header = '>%i\n' % counter
        counter += 1
        new_reference_fasta.write(new_header)
    else:
        new_reference_fasta.write(line)
new_reference_fasta.close()
# write the translation dictionary between new numerical identifiers and previous fasta headers to file
header_info_file = os.path.join(args.output, 'reference_fasta_header_info.txt')
header_info = pd.DataFrame.from_dict(counter_sequence_dict, orient='index')
header_info.to_csv(header_info_file, sep='\t', header=False, index=True)
# get the fasta headers from the new formatted reference file
exons = [seq.id for seq in SeqIO.parse(open(new_fasta, 'r'), 'fasta')]
sorted_exon_list = list(exons)
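# Illustrative sketch (not part of the pipeline above): DataFrame.from_dict with
# orient='index' turns a {new_id: old_header} mapping into one row per key, which is
# what produces the two-column translation table written above. The toy mapping below
# is invented for demonstration only.
import pandas as pd

example_map = {0: 'exon_1234_species_A', 1: 'exon_5678_species_B'}
example_df = pd.DataFrame.from_dict(example_map, orient='index')
# example_df has the new numerical identifiers as its index and the old headers in
# column 0; to_csv(..., sep='\t', header=False, index=True) then writes lines such as
# "0<TAB>exon_1234_species_A".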
# Get the paths to the contig fasta files for all samples
fasta_files = glob.glob(os.path.join(args.contigs, '*.fa*'))
if len(fasta_files) == 0:  # multiple subfolders with contigs
    fasta_files_dict = {}
    for subdir in next(os.walk(args.contigs))[1]:
        fasta_files_sub = glob.glob(os.path.join(args.contigs, subdir, '*.fa*'))
        for fasta in fasta_files_sub:
            sample_id = os.path.basename(fasta).split('.fa')[0]
            fasta_files_dict.setdefault(sample_id, [])
            fasta_files_dict[sample_id].append(fasta)
    sample_ids = list(fasta_files_dict.keys())
else:  # single folder with contigs
obstacles = obstacles.append(df_new)
# make second obstacle features
time.append(duration[i] + time[i])
duration.append(duration[i])
width.append(1)
type.append(type[i])
if lineIndex[i] == 1:
    lineIndex.append(2)
elif lineIndex[i] == 2:
    lineIndex.append(1)
i = i + 1
# adding second obstacle
new_obstacle2 = {'_time': [time[i]], '_lineIndex': [lineIndex[i]], '_type': [type[i]],
                 '_duration': [duration[i]], '_width': [width[i]]}
df_new2 = pd.DataFrame.from_dict(new_obstacle2)
obstacles = obstacles.append(df_new2)
numObstacles = numObstacles + 1
# creating our new obstacle if not already made
if type[i] == 1 or blockType == 1:
    new_obstacle = {'_time': [time[i]], '_lineIndex': [lineIndex[i]], '_type': [type[i]],
                    '_duration': [duration[i]], '_width': [width[i]]}
    df_new = pd.DataFrame.from_dict(new_obstacle)
    obstacles = obstacles.append(df_new)
return obstacles  # time, lineIndex, type, duration, width
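# Note on the accumulation pattern above: DataFrame.append() was deprecated in pandas 1.4
# and removed in pandas 2.0. A minimal, self-contained sketch of the same row-by-row
# accumulation with the current API (frame contents invented for illustration):
import pandas as pd

toy_obstacles = pd.DataFrame.from_dict({'_time': [2.0], '_lineIndex': [1], '_type': [1],
                                        '_duration': [1.0], '_width': [1]})
toy_row = pd.DataFrame.from_dict({'_time': [4.5], '_lineIndex': [2], '_type': [1],
                                  '_duration': [2.0], '_width': [1]})
toy_obstacles = pd.concat([toy_obstacles, toy_row], ignore_index=True)
# Collecting the row dicts in a list and calling pd.concat once after the loop is also
# considerably faster than concatenating inside the loop.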
def run(data):
    try:
        input_df = pd.DataFrame.from_dict(data)
        proba = model.predict_proba(input_df)
        result = {"predict_proba": proba.tolist()}
        return result
    except Exception as e:
        error = str(e)
        return error
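# Example call for the scoring function above. This is a sketch only: the column names are
# invented, and `model` is assumed to be a fitted classifier exposing predict_proba
# (e.g. a scikit-learn estimator loaded elsewhere, typically in an init() hook).
sample_payload = {"feature_a": [0.5, 1.2], "feature_b": [3, 7]}
print(run(sample_payload))  # {"predict_proba": [[...], [...]]} or an error string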
query: Dict[str, str] = {}
if collection_type is not None:
    query = {"collection": collection_type.lower()}
payload = {"meta": {"include": ["name", "collection", "tagline", "visibility", "group", "tags"]}, "data": query}
response: List[Dict[str, Any]] = self._automodel_request("collection", "get", payload, full_return=False)
# Rename collection names
repl_name_map = collections_name_map()
for item in response:
    item.pop("id", None)
    if item["collection"] in repl_name_map:
        item["collection"] = repl_name_map[item["collection"]]
df = pd.DataFrame.from_dict(response)
if not show_hidden:
    df = df[df["visibility"]]
if group is not None:
    df = df[df["group"].str.lower() == group.lower()]
if tag is not None:
    if isinstance(tag, str):
        tag = [tag]
    tag = {t.lower() for t in tag}
    df = df[df.apply(lambda x: len({t.lower() for t in x["tags"]} & tag) > 0, axis=1)]
df.drop(["visibility", "group", "tags"], axis=1, inplace=True)
if not aslist:
    df.set_index(["collection", "name"], inplace=True)
    df.sort_index(inplace=True)
    return df
else:
def feature_value_to_df(features, values):
    df_dict = {}
    for feature, value in zip(features, values):
        df_dict[feature] = value
    df = pd.DataFrame.from_dict(df_dict)
    return df
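# Illustrative usage of the helper above (feature names and values are made up).
# Note that each value should be list-like: DataFrame.from_dict on a dict of scalars raises
# "If using all scalar values, you must pass an index", so scalars need wrapping,
# e.g. [3.2] instead of 3.2.
demo_df = feature_value_to_df(['age', 'height'], [[31, 44], [1.68, 1.80]])
# -> two columns ('age', 'height') with two rows each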
condition = reduce(lambda x, y: np.logical_or(x, y), conditions)
tape = tape[condition]
DictLpc = {}
for ix, text in tape.TEXT.iteritems():
    X = self.read_section(*ix)
    if "LPC" not in X:
        continue
    if X["LPC"]["INT"] != [2]:
        if verbose:
            logging.warn("found non-linlin interpolation, skip angular distr. for MAT{}/MF{}/MT{}".format(*ix))
        continue
    for e, v in X["LPC"]["E"].items():
        DictLpc.update({(X["MAT"], X["MT"], e): pd.Series([1] + v["COEFF"])})
if not DictLpc:
    logging.warn("no angular distribution in Legendre expansion was found")
    return pd.DataFrame()
frame = pd.DataFrame.from_dict(DictLpc, orient="index")
return Lpc(frame)
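# Small self-contained illustration of the mask-combination idiom used at the top of the
# block above: reduce over np.logical_or turns a list of boolean filters into one mask.
# The arrays below are toy data, unrelated to the ENDF tape being processed.
from functools import reduce
import numpy as np

masks = [np.array([True, False, False]),
         np.array([False, True, False])]
combined = reduce(lambda x, y: np.logical_or(x, y), masks)
# combined -> array([ True,  True, False]); tape[condition] keeps rows flagged by any filter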
def table_schema(self, table_name):
    """
    Get the schema for a specific table from a dataset.

    Unrolls nested field names into the format that can be copied
    directly into queries. For example, for the `github.commits` table,
    this will return `committer.name`.

    This is a very different return signature than BigQuery's table.schema.
    """
    self.__fetch_table(table_name)
    raw_schema = self.tables[table_name].schema
    schema = pd.DataFrame.from_dict([x.to_api_repr() for x in raw_schema])
    # the api_repr only has the fields column for tables with nested data
    if 'fields' in schema.columns:
        schema = self.__unpack_all_schema_fields(schema)
    # Set the column order
    schema = schema[['name', 'type', 'mode', 'description']]
    return schema
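# Why the 'fields' check above matters: in the API representation returned by
# SchemaField.to_api_repr(), a nested RECORD column carries its children in a 'fields'
# list, so the DataFrame only gains a 'fields' column when the table has nested data.
# A toy, api_repr-style dict (all values invented for illustration):
nested_field_example = {
    "name": "committer",
    "type": "RECORD",
    "mode": "NULLABLE",
    "description": "commit author metadata",
    "fields": [{"name": "name", "type": "STRING", "mode": "NULLABLE", "description": ""}],
}
# Unrolling such entries is what yields query-ready names like "committer.name".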
# Identify the nearest molecules
nearest_atoms = uu.atom_two[uu.atom_two['atom0'].isin(source_atom_idxs) |
                            uu.atom_two['atom1'].isin(source_atom_idxs)].sort_values('dr')[['atom0', 'atom1']].copy()
nearest_atoms['molecule0'] = nearest_atoms['atom0'].map(uu.atom['molecule'])
nearest_atoms['molecule1'] = nearest_atoms['atom1'].map(uu.atom['molecule'])
nearest_molecules = nearest_atoms[['molecule0', 'molecule1']].stack()
nearest_molecules = nearest_molecules[~nearest_molecules.isin(source_molecule_idxs)].drop_duplicates(keep='first')
# Build the appropriate universes
for nn in sizes:
    atom1 = uu.atom.loc[uu.atom['molecule'].isin(nearest_molecules.iloc[:nn].tolist() + source_molecule_idxs),
                        ['symbol', 'x', 'y', 'z']]
    adxs, x, y, z, prj = _worker(atom1.index.values.astype(int),
                                 atom1['x'].values.astype(float),
                                 atom1['y'].values.astype(float),
                                 atom1['z'].values.astype(float), a)
    patom = pd.DataFrame.from_dict({'atom': adxs, 'x': x, 'y': y, 'z': z, 'prj': prj})
    patom['frame'] = patom['atom'].map(uu.atom['frame'])
    patom['symbol'] = patom['atom'].map(uu.atom['symbol'])
    sliced_u = Universe(atom=patom)
    sliced_u.compute_atom_two(dmax=a)
    sliced_u.compute_molecule()
    source_adxs1 = sliced_u.atom[(sliced_u.atom['prj'] == 13) & sliced_u.atom['atom'].isin(source_atom_idxs)].index
    source_mdxs1 = sliced_u.atom.loc[source_adxs1, 'molecule'].unique().tolist()
    nearest_atoms1 = sliced_u.atom_two[sliced_u.atom_two['atom0'].isin(source_adxs1) |
                                       sliced_u.atom_two['atom1'].isin(source_adxs1)].sort_values('dr')[['atom0', 'atom1']].copy()
    nearest_atoms1['molecule0'] = nearest_atoms1['atom0'].map(sliced_u.atom['molecule'])
    nearest_atoms1['molecule1'] = nearest_atoms1['atom1'].map(sliced_u.atom['molecule'])
    nearest_molecules1 = nearest_atoms1[['molecule0', 'molecule1']].stack()
    nearest_molecules1 = nearest_molecules1[~nearest_molecules1.isin(source_mdxs1)].drop_duplicates(keep='first')
    # It's fine to overwrite atom1 above since the uu.atom slice is not necessarily clustered
    atom1 = sliced_u.atom.loc[sliced_u.atom['molecule'].isin(nearest_molecules1.iloc[:nn].tolist() + source_mdxs1)].copy()
    dct[nn].append(atom1)
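# The patom['atom'].map(uu.atom['frame']) calls above rely on a handy pandas idiom:
# mapping a column through another Series performs an index-based lookup. A self-contained
# toy example (labels invented for illustration):
import pandas as pd

lookup = pd.Series(['H', 'O', 'H'], index=[10, 11, 12])   # e.g. symbol keyed by atom index
ids = pd.Series([12, 10])
print(ids.map(lookup).tolist())                           # ['H', 'H']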
def convert(args):
    src = lines_to_list(args.src_file)
    if args.das:
        src = [DA.parse(da_text).to_cambridge_da_string() for da_text in src]
    ref = lines_to_list(args.ref_file)
    columns = ['mr', 'orig_ref']
    df = pd.DataFrame.from_dict({'mr': src, 'orig_ref': ref})
    if args.system_output:
        sys = lines_to_list(args.system_output)
        df['system_ref'] = sys
        columns.append('system_ref')
    if args.score:
        score = [float(score) for score in lines_to_list(args.score)]
        df['quality'] = score
        columns.append('quality')
    df.to_csv(args.out_file, columns=columns, sep="\t", index=False, encoding='UTF-8')
signal_entropy = _complexity_optimize_get_differential(signal_embedded, k=1)
# calculate average of surrogates entropy
for i in range(surrogate_iter):
    surrogate, iterations, rmsd = _complexity_optimize_iaaft(signal)
    surrogate_embedded = complexity_embedding(surrogate, delay=tau, dimension=dimension)
    surrogate_entropy = _complexity_optimize_get_differential(surrogate_embedded, k=1)
    surrogate_list.append(surrogate_entropy)
surrogate_entropy_average = sum(surrogate_list) / len(surrogate_list)
# entropy ratio for each set of d and tau
entropy_ratio = signal_entropy / surrogate_entropy_average + (dimension * np.log(N)) / N
optimal[dimension].append(entropy_ratio)
# the optimal dimension and tau are where the entropy ratio is minimal
optimal_df = pd.DataFrame.from_dict(optimal)
optimal_delay, optimal_dimension = np.unravel_index(np.nanargmin(optimal_df.values), optimal_df.shape)
optimal_delay = optimal_delay + 1  # accounts for zero indexing
return optimal_dimension, optimal_delay
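# How the arg-min lookup above works, on toy numbers: DataFrame.from_dict on the
# {dimension: [ratio per tau]} dict gives one column per candidate dimension and one row
# per candidate tau, and np.unravel_index(np.nanargmin(...)) converts the flat position of
# the smallest ratio back into (row, column) positions. The values below are invented.
import numpy as np
import pandas as pd

toy_optimal = {2: [0.9, 0.4, 0.7], 3: [0.8, 0.6, 0.5]}   # dimension -> ratio per tau
toy_df = pd.DataFrame.from_dict(toy_optimal)
row, col = np.unravel_index(np.nanargmin(toy_df.values), toy_df.shape)
# row, col == (1, 0): the second tau candidate and the first dimension column; these are
# positional indices into the tau and dimension grids, not the parameter values themselves.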