Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
if assembly_1 == assembly_2:
distances[(assembly_1, assembly_2)] = 0.0
return
pattern = re.compile(r'\d+[\w=]')
common_genes = set(gene_seqs[assembly_1]) & set(gene_seqs[assembly_2])
if not common_genes:
distances[(assembly_1, assembly_2)] = 1.0
distances[(assembly_2, assembly_1)] = 1.0
return
alignments = []
for gene in common_genes:
gene_1 = gene_seqs[assembly_1][gene]
gene_2 = gene_seqs[assembly_2][gene]
result = edlib.align(gene_1, gene_2, 'NW', 'path')
cigar = [(int(x[:-1]), x[-1]) for x in pattern.findall(result['cigar'])]
alignment_length = sum(x[0] for x in cigar)
match_count = sum(x[0] for x in cigar if x[1] == '=')
identity = match_count / alignment_length
alignments.append((identity, match_count, alignment_length))
# alignments = sorted(alignments, key=lambda x: x[0])
# discard_count = 0
# while True:
# if (discard_count + 2) / len(alignments) < discard_best_worst_alignments:
# discard_count += 2
# else:
# break
# discard_each_end = discard_count // 2
# alignments = alignments[discard_count:-discard_count]
def parseMappingFileIllumina(input):
fwdprimer = ''
revprimer = ''
samples = []
with open(input, 'r') as inputfile:
for line in inputfile:
line = line.replace('\n', '')
if line.startswith('#'):
continue
cols = line.split('\t')
if not cols[0] in samples:
samples.append(cols[0])
match = edlib.align(cols[1], cols[2], mode="HW", k=0)
if match["editDistance"] == 0:
Trim = match["locations"][0][1]+1
if fwdprimer == '':
fwdprimer = cols[2][Trim:]
revprimer = cols[3]
else:
fwdprimer = cols[2]
revprimer = cols[3]
return (samples, fwdprimer, revprimer)
def trimForPrimer(primer, seq, primer_mismatch):
foralign = edlib.align(primer, seq, mode="HW", k=primer_mismatch, additionalEqualities=degenNuc)
if foralign['editDistance'] < 0:
return 0
else:
CutPos = foralign["locations"][0][1]+1
return CutPos
NoPrimer += 1
continue
stringent = {}
stringent[BCLabel] = samples[BCLabel]['RevBarcode']
revBC, revBCLabel = AlignBarcode(read2[1], stringent, barcode_mismatch)
if revBC == '':
NoRevBarcode += 1
continue
#look for reverse primer in reverse read
revalign = edlib.align(RevPrimer, read2[1], mode="HW", k=primer_mismatch, additionalEqualities=degenNuc)
if revalign['editDistance'] < 0: #not found
NoRevPrimer += 1
continue
else:
#look for forward primer
foralign = edlib.align(fwdprimer, read1[1], mode="HW", k=primer_mismatch, additionalEqualities=degenNuc)
if foralign['editDistance'] < 0: #not found
NoPrimer += 1
continue
if len(revbarcodes) > 0:
#look for valid revbarcodes
if len(revbarcodes) > 0:
revBC, revBCLabel = AlignBarcode(read2[1], revbarcodes, barcode_mismatch)
if revBC == '':
NoRevBarcode += 1
continue
#look for reverse primer in reverse read
revalign = edlib.align(revprimer, read2[1], mode="HW", k=primer_mismatch, additionalEqualities=degenNuc)
if revalign['editDistance'] < 0: #not found
NoRevPrimer += 1
continue
#if get here, then all is well, construct new header and trim reads
nuc_and_ops = np.array([1,2,3,4,5,6,7])
encoded_nucs = number_encoding(soft_clipped_read['seq'])
hits = 0
min_score = len(soft_clipped_read['seq'])
top_hits = {}
if read.is_reverse:
while hits < n_hits and min_score >= -10:
alignment = edlib.align(soft_clipped_read['seq'], minus_strand, mode='HW', task='path')
if hits ==0:
if alignment['editDistance'] > max_edit:
return(None)
for location in alignment['locations']:
mask_bases = 'X' * ( location[1] - location[0])
minus_strand = minus_strand[:location[0]] + mask_bases + minus_strand[location[1]:]
hits += 1
# [2]=ref1,[3]=ref1_rev,
# [4]=ref2,[5]=ref2_rev,
# [6]=ref, [7]=ref_rev
edlib_ret = [100,100,100,100,100,100,100,100]
# alt
ret = edlib.align(line, fa_alt, mode="HW", task="path")
edlib_ret[0] = ret["editDistance"]
ret = edlib.align(rc_line, fa_alt, mode="HW", task="path")
edlib_ret[1] = ret["editDistance"]
if fa_ref == "":
# ref1
ret = edlib.align(line, fa_ref1, mode="HW", task="path")
edlib_ret[2] = ret["editDistance"]
ret = edlib.align(rc_line, fa_ref1, mode="HW", task="path")
edlib_ret[3] = ret["editDistance"]
# ref2
ret = edlib.align(line, fa_ref2, mode="HW", task="path")
edlib_ret[4] = ret["editDistance"]
ret = edlib.align(rc_line, fa_ref2, mode="HW", task="path")
edlib_ret[5] = ret["editDistance"]
else:
# ref
ret = edlib.align(line, fa_ref, mode="HW", task="path")
edlib_ret[6] = ret["editDistance"]
ret = edlib.align(rc_line, fa_ref, mode="HW", task="path")
edlib_ret[7] = ret["editDistance"]
if read_pair == 1:
d_edlib_read1[name] = edlib_ret
else:
edlib_ret[1] = ret["editDistance"]
if fa_ref == "":
# ref1
ret = edlib.align(line, fa_ref1, mode="HW", task="path")
edlib_ret[2] = ret["editDistance"]
ret = edlib.align(rc_line, fa_ref1, mode="HW", task="path")
edlib_ret[3] = ret["editDistance"]
# ref2
ret = edlib.align(line, fa_ref2, mode="HW", task="path")
edlib_ret[4] = ret["editDistance"]
ret = edlib.align(rc_line, fa_ref2, mode="HW", task="path")
edlib_ret[5] = ret["editDistance"]
else:
# ref
ret = edlib.align(line, fa_ref, mode="HW", task="path")
edlib_ret[6] = ret["editDistance"]
ret = edlib.align(rc_line, fa_ref, mode="HW", task="path")
edlib_ret[7] = ret["editDistance"]
if read_pair == 1:
d_edlib_read1[name] = edlib_ret
else:
d_edlib_read2[name] = edlib_ret
numRef = 0
numAlt = 0
numOther = 0
for name in d_edlib_read1.keys():
ret1 = d_edlib_read1[name]
ret2 = d_edlib_read2[name]
multihits = 0
findForPrimer = 0
findRevPrimer = 0
with open(outR1, 'w') as outfile1:
with open(outR2, 'w') as outfile2:
for read1, read2 in zip(file1, file2):
Total += 1
ffp = False
frp = False
R1Seq = read1[1][:RL]
R1Qual = read1[2][:RL]
R2Seq = read2[1][:RL]
R2Qual = read2[2][:RL]
ForTrim, RevTrim = (0,)*2
#look for forward primer in forward read
R1foralign = edlib.align(fwdprimer, R1Seq, mode="HW", k=primer_mismatch, additionalEqualities=degenNuc)
if R1foralign['editDistance'] < 0:
if require_primer == 'on' or full_length: #not found
continue
else:
if len(R1foralign['locations']) > 1: #multiple hits
multihits += 1
continue
try:
ForTrim = R1foralign["locations"][0][1]+1
findForPrimer += 1
ffp = True
except IndexError:
pass
R1revalign = edlib.align(RevComp(revprimer), R1Seq, mode="HW", k=primer_mismatch, additionalEqualities=degenNuc)
if R1revalign['editDistance'] < 0:
R1RevCut = RL
def findFwdPrimer(primer, sequence, mismatch, equalities):
# search for match
align = edlib.align(primer, sequence, mode="HW",
k=int(mismatch), additionalEqualities=equalities)
if align["editDistance"] >= 0: # we found a hit
TrimPos = align["locations"][0][1]+1
return(TrimPos, align['editDistance'])
else: # return position will be False if not found
return (False, False)
def findRevPrimer(primer, sequence, mismatch, equalities):
#trim position
TrimPos = None
#search for match
align = edlib.align(primer, sequence, mode="HW", task="locations", k=int(mismatch), additionalEqualities=equalities)
if align["editDistance"] >= 0: #we found a hit
TrimPos = align["locations"][0][0]
#return position will be None if not found
return TrimPos