Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
return 0
min_len = max(ying_len, yang_len)
search_range = (min_len // 2) - 1
if search_range < 0:
search_range = 0
ying_flags = [False]*ying_len
yang_flags = [False]*yang_len
# looking only within search range, count & flag matched pairs
common_chars = 0
for i, ying_ch in enumerate(ying):
low = i - search_range if i > search_range else 0
hi = i + search_range if i + search_range < yang_len else yang_len - 1
for j in _range(low, hi+1):
if not yang_flags[j] and yang[j] == ying_ch:
ying_flags[i] = yang_flags[j] = True
common_chars += 1
break
# short circuit if no characters match
if not common_chars:
return 0
# count transpositions
k = trans_count = 0
for i, ying_f in enumerate(ying_flags):
if ying_f:
for j in _range(k, yang_len):
if yang_flags[j]:
k = j + 1
def jaro_dist(scan_res, desired):
    """Return the Jaro-Winkler score between two loaded inputs.

    Both arguments are passed, in order, through ``get_file_as_string``
    (presumably a helper that loads file contents as text -- confirm
    against its definition) and the two results are compared with
    ``jellyfish.jaro_winkler`` using ``long_tolerance=True``.
    """
    # Load ``scan_res`` first, then ``desired``, same order as the comparison.
    contents = [get_file_as_string(source) for source in (scan_res, desired)]
    return jellyfish.jaro_winkler(*contents, long_tolerance=True)
for row in data.index:
data[row] = data[row].lower()
out = pd.DataFrame(columns=["Dup_ID1", "Dup_ID2", "Dup_1", "Dup_2"])
if metric == "DL": # Damerau Levenshtein Distance
res = {_d: [] for _d in data}
for _d in res.keys():
for row in data.index:
if _d != data[row] \
and jf.damerau_levenshtein_distance(_d, data[row]) < \
((len(_d) + len(data[row])/2)*threshold):
res[_d].append(data[row])
out.loc[len(out)] = (
_d.split("*")[-1], row, _d, data[row])
elif metric == "LM": # Levenshtein Distance
res = {_d: [] for _d in data}
for _d in res.keys():
for row in data.index:
if _d != data[row] \
def match_name_score(self, query):
    """Return the smallest adjusted edit distance between ``query`` and this
    object's names (lower is a closer match).

    The candidates are ``self.table``, ``self.column`` and every
    dot-separated piece of ``self.column``.  Each candidate is scored by
    its Damerau-Levenshtein distance to ``query``; the distance is halved
    when ``query`` occurs as a substring of the candidate.
    """
    def adjusted_distance(candidate):
        # A candidate that contains the query gets its distance halved.
        raw = jellyfish.damerau_levenshtein_distance(candidate, query)
        return raw / 2 if query in candidate else raw

    candidates = [self.table, self.column]
    candidates.extend(self.column.split("."))

    # Start from a sentinel larger than any realistic distance.
    best = 10e10
    for candidate in candidates:
        best = min(best, adjusted_distance(candidate))
    return best
def pickle_data():
    """Rebuild ``us/states.pkl`` from the ``states`` table of ``data.db``.

    Both paths are resolved relative to ``PWD``.  Rows are read ordered by
    ``name`` and, before being collected into a list, each row

    * gains a ``name_metaphone`` key holding ``jellyfish.metaphone`` of
      its ``name``;
    * has ``is_territory``, ``is_obsolete``, ``is_contiguous`` and
      ``is_continental`` replaced by the result of comparing them to ``1``;
    * has ``time_zones`` split on commas into a list.

    The list is then pickled with protocol 2.  The database connection is
    always closed, including when reading or converting a row fails.
    """
    flag_columns = (
        'is_territory',
        'is_obsolete',
        'is_contiguous',
        'is_continental',
    )

    dbpath = os.path.abspath(os.path.join(PWD, 'data.db'))
    conn = sqlite3.connect(dbpath)
    try:
        # Rows must support item assignment; ``dict_factory`` is defined
        # elsewhere in the file.
        conn.row_factory = dict_factory
        c = conn.cursor()
        c.execute("""SELECT * FROM states ORDER BY name""")

        states = []
        for row in c:
            row['name_metaphone'] = jellyfish.metaphone(row['name'])
            for column in flag_columns:
                row[column] = row[column] == 1
            row['time_zones'] = row['time_zones'].split(',')
            states.append(row)
    finally:
        # The original never released the connection.
        conn.close()

    pkl_path = os.path.abspath(os.path.join(PWD, 'us', 'states.pkl'))
    with open(pkl_path, 'wb') as pkl_file:
        # Use `protocol=2` to ensure package compatibility with Python 2,
        # even if the `.pkl` file is built under Python 3
        pickle.dump(states, pkl_file, protocol=2)
def keywordize(str):
    """
    Reduce a string to its set of stemmed keywords.

    The string is split with ``tokenize``; tokens that are neither purely
    alphabetic nor purely numeric, and tokens whose lowercase form is in
    ``stop_words``, are dropped.  Each remaining token is lowercased,
    encoded to ASCII (characters that cannot be encoded are ignored) and
    Porter-stemmed.  Returning a set removes duplicates.
    """
    import jellyfish

    keywords = set()
    for token in tokenize(str):
        if not (token.isalpha() or token.isdigit()):
            continue
        lowered = token.lower()
        if lowered in stop_words:
            continue
        keywords.add(jellyfish.porter_stem(lowered.encode('ascii', 'ignore')))
    return keywords
name2 = simple_clean(name2_name.last) + " " + unnickname(name2_name.first)
# calculate a buncha metrics
text1 = name1_standardized
text2 = name2
#print "comparing '%s' to '%s'" % (text1, text2)
ratio = 1/100.0*fuzz.ratio(text1, text2)
partial_ratio = 1/100.0*fuzz.partial_ratio(text1, text2)
token_sort_ratio = 1/100.0*fuzz.token_sort_ratio(text1, text2)
token_set_ratio = 1/100.0*fuzz.token_set_ratio(text1, text2)
avg_len = 1/2*len(text1)+len(text2)
min_len = min(len(text1), len(text2))
l_ratio = 0
try:
l_distance = jellyfish.levenshtein_distance(text1, text2)
l_ratio = 1.0 - ( (0.0 + l_distance) / (0.0+avg_len) )
except UnicodeEncodeError:
pass
long_match = longest_match(text1, text2)
lng_ratio = (0.0 + long_match) / (0.0 + min_len)
score = 0
if ( ratio > 0.6 or partial_ratio > 0.6 or l_ratio > 0.6 or lng_ratio > 0.6):
score = compute_scores([ratio,partial_ratio,l_ratio,lng_ratio])
if debug:
log.debug("|fuzzymatchresult|%s|'%s'|'%s'|score=%s|ratio=%s|partial_ratio=%s|token_sort_ratio=%s|token_set_ratio=%s| l_ratio=%s|lng_ratio=%s" % (match['cand_id'], match['cand_name'], name, score, ratio, partial_ratio, token_sort_ratio, token_set_ratio, l_ratio, lng_ratio))
if (score > 0.8):
if prop['pid'] in ('state', 'chamber'):
spec[prop['pid']] = prop['v']
legislators = db.legislators.find(spec)
results = []
for leg in legislators:
if legislators.count() == 1:
match = True
score = 100
else:
match = False
if leg['last_name'] == query['query']:
score = 90
else:
distance = levenshtein_distance(leg['full_name'].lower(),
query['query'].lower())
score = 100.0 / (1 + distance)
# Note: There's a bug in Refine that causes reconciliation
# scores to be overwritten if the same legislator is returned
# for multiple queries. see:
# http://code.google.com/p/google-refine/issues/detail?id=185
results.append({"id": leg['_id'],
"name": leg['full_name'],
"score": score,
"match": match,
"type": [
{"id": "/billy/legislator",
"name": "Legislator"}]})
if prop['pid'] in ('state', 'chamber'):
spec[prop['pid']] = prop['v']
legislators = db.legislators.find(spec)
results = []
for leg in legislators:
if legislators.count() == 1:
match = True
score = 100
else:
match = False
if leg['last_name'] == query['query']:
score = 90
else:
distance = levenshtein_distance(leg['full_name'].lower(),
query['query'].lower())
score = 100.0 / (1 + distance)
# Note: There's a bug in Refine that causes reconciliation
# scores to be overwritten if the same legislator is returned
# for multiple queries. see:
# http://code.google.com/p/google-refine/issues/detail?id=185
results.append({"id": leg['_id'],
"name": leg['full_name'],
"score": score,
"match": match,
"type": [
{"id": "/billy/legislator",
"name": "Legislator"}]})
v_cl[self.v_chain_type[v]] = []
if self.v_chain_type[v] == "IGHV":
v_cl[self.v_chain_type[v]].append(v)
f, s = pSeq[:pos1[1]], pSeq[pos1[1] + 1:]
v_overlap = len(f) + len(s) + 1
for v1, v2 in v_cl.items():
for v3 in v2:
if v3 not in self.vi_pieces:
continue
v, vv = self.vi_pieces[v3]
minlen1 = min(len(f), len(v))
minlen2 = min(len(s), len(vv))
if minlen1 > 0:
mismatch1 = jellyfish.levenshtein_distance(unicode(f[-minlen1:]), unicode(v[-minlen1:]))
else:
mismatch1 = 0
if minlen2 > 0:
mismatch2 = jellyfish.levenshtein_distance(unicode(s[:minlen2]), unicode(vv[:minlen2]))
else:
mismatch2 = 0
if (minlen1 <= 3 and mismatch2 <= 1) or (minlen1 >= self.__settings.minlen1 and mismatch1 <= self.__settings.mismatch1 and minlen2 >= self.__settings.minlen2 and mismatch2 <= self.__settings.mismatch2):
vtypes[v3] = (minlen1 + minlen2 + 1, mismatch1 + mismatch2)
if pos2 != [-1, -1]:
if pos2[0] != -1:
if True: #pos2[0] + 3 < len(pSeq) and pSeq[pos2[0] + 3] == "G":
if pos2[0] > 10:
offset = pos2[0] - 10
else: