How to use the jellyfish.levenshtein_distance function in jellyfish

To help you get started, we’ve selected a few jellyfish examples based on popular ways the library is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github sunlightlabs / read_FEC / fecreader / reconciliation / fec_reconciler.py View on Github external
name2 = simple_clean(name2_name.last) + " " + unnickname(name2_name.first)
        # calculate a buncha metrics
        text1 = name1_standardized
        text2 = name2
        #print "comparing '%s' to '%s'" % (text1, text2)
        ratio = 1/100.0*fuzz.ratio(text1, text2)
        partial_ratio = 1/100.0*fuzz.partial_ratio(text1, text2)
        token_sort_ratio = 1/100.0*fuzz.token_sort_ratio(text1, text2)
        token_set_ratio = 1/100.0*fuzz.token_set_ratio(text1, text2)
        
        avg_len = 1/2*len(text1)+len(text2)
        min_len = min(len(text1), len(text2))
        
        l_ratio = 0
        try:
            l_distance = jellyfish.levenshtein_distance(text1, text2)
            l_ratio = 1.0 - ( (0.0 + l_distance) / (0.0+avg_len) )
        except UnicodeEncodeError:
            pass
            
        long_match = longest_match(text1, text2)
        lng_ratio = (0.0 + long_match) / (0.0 + min_len)
        
        score = 0
        if ( ratio > 0.6 or partial_ratio > 0.6 or l_ratio > 0.6 or lng_ratio > 0.6):
            score = compute_scores([ratio,partial_ratio,l_ratio,lng_ratio])
           
        if debug:
            log.debug("|fuzzymatchresult|%s|'%s'|'%s'|score=%s|ratio=%s|partial_ratio=%s|token_sort_ratio=%s|token_set_ratio=%s| l_ratio=%s|lng_ratio=%s" % (match['cand_id'], match['cand_name'], name, score, ratio, partial_ratio, token_sort_ratio, token_set_ratio, l_ratio, lng_ratio))
        
        
        if (score > 0.8):
github openstates / billy / billy / site / api / handlers.py View on Github external
if prop['pid'] in ('state', 'chamber'):
                spec[prop['pid']] = prop['v']

        legislators = db.legislators.find(spec)

        results = []
        for leg in legislators:
            if legislators.count() == 1:
                match = True
                score = 100
            else:
                match = False
                if leg['last_name'] == query['query']:
                    score = 90
                else:
                    distance = levenshtein_distance(leg['full_name'].lower(),
                                                    query['query'].lower())
                    score = 100.0 / (1 + distance)

            # Note: There's a bug in Refine that causes reconciliation
            # scores to be overwritten if the same legislator is returned
            # for multiple queries. see:
            # http://code.google.com/p/google-refine/issues/detail?id=185

            results.append({"id": leg['_id'],
                            "name": leg['full_name'],
                            "score": score,
                            "match": match,
                            "type": [
                                {"id": "/billy/legislator",
                                 "name": "Legislator"}]})
github openstates / openstates / billy / site / api / handlers.py View on Github external
if prop['pid'] in ('state', 'chamber'):
                spec[prop['pid']] = prop['v']

        legislators = db.legislators.find(spec)

        results = []
        for leg in legislators:
            if legislators.count() == 1:
                match = True
                score = 100
            else:
                match = False
                if leg['last_name'] == query['query']:
                    score = 90
                else:
                    distance = levenshtein_distance(leg['full_name'].lower(),
                                                    query['query'].lower())
                    score = 100.0 / (1 + distance)

            # Note: There's a bug in Refine that causes reconciliation
            # scores to be overwritten if the same legislator is returned
            # for multiple queries. see:
            # http://code.google.com/p/google-refine/issues/detail?id=185

            results.append({"id": leg['_id'],
                            "name": leg['full_name'],
                            "score": score,
                            "match": match,
                            "type": [
                                {"id": "/billy/legislator",
                                 "name": "Legislator"}]})
github mandricigor / imrep / imrep.py View on Github external
v_cl[self.v_chain_type[v]] = []
                                if self.v_chain_type[v] == "IGHV":
                                    v_cl[self.v_chain_type[v]].append(v)

                            f, s = pSeq[:pos1[1]], pSeq[pos1[1] + 1:]

                            v_overlap = len(f) + len(s) + 1
                            for v1, v2 in v_cl.items():
                                for v3 in v2:
                                    if v3 not in self.vi_pieces:
                                        continue
                                    v, vv = self.vi_pieces[v3]
                                    minlen1 = min(len(f), len(v))
                                    minlen2 = min(len(s), len(vv))
                                    if minlen1 > 0:
                                        mismatch1 = jellyfish.levenshtein_distance(unicode(f[-minlen1:]), unicode(v[-minlen1:]))
                                    else:
                                        mismatch1 = 0
                                    if minlen2 > 0:
                                        mismatch2 = jellyfish.levenshtein_distance(unicode(s[:minlen2]), unicode(vv[:minlen2]))
                                    else:
                                        mismatch2 = 0
                                    if (minlen1 <= 3 and mismatch2 <= 1) or (minlen1 >= self.__settings.minlen1 and mismatch1 <= self.__settings.mismatch1 and minlen2 >= self.__settings.minlen2 and mismatch2 <= self.__settings.mismatch2):

                                        vtypes[v3] = (minlen1 + minlen2 + 1, mismatch1 + mismatch2)

                    if pos2 != [-1, -1]:
                        if pos2[0] != -1:
                            if True: #pos2[0] + 3 < len(pSeq) and pSeq[pos2[0] + 3] == "G":
                                if pos2[0] > 10:
                                    offset = pos2[0] - 10
                                else:
github mandricigor / imrep / imrep3.py View on Github external
j_cl = {}
                            for j in jc:
                                if self.j_chain_type[j] != "IGHJ" and self.j_chain_type[j] not in j_cl:
                                    j_cl[self.j_chain_type[j]] = []
                                if self.j_chain_type[j] != "IGHJ":
                                    j_cl[self.j_chain_type[j]].append(j)
                            f, s = pSeq[:pos2[0]], pSeq[pos2[0] + 1:]
                            for j1, j2 in j_cl.items():
                                for j3 in j2:
                                    if j3 not in self.jay_pieces:
                                        continue
                                    j, jj = self.jay_pieces[j3]
                                    minlen1 = min(len(f), len(j))
                                    minlen2 = min(len(s), len(jj))
                                    if minlen2 > 0:
                                        mismatch2 = jellyfish.levenshtein_distance(unicode(s[:minlen2]), unicode(jj[:minlen2]))
                                    else:
                                        mismatch2 = 0
                                    if minlen1 > 0:
                                        mismatch1 = jellyfish.levenshtein_distance(unicode(f[-minlen1:]), unicode(j[-minlen1:]))
                                    else:
                                        mismatch1 = 0
                                    if (minlen2 == 0 and mismatch1 <= 1) or (minlen2 > 3 and mismatch2 <= 1 and minlen1 >= 2 and mismatch1 <= 2):
                                        jtypes[j3] = mismatch1 + mismatch2
                        if pos2[1] != -1:
                            if pos2[1] > 10:
                                offset = pos2[1] - 10
                            else:
                                offset = 0
                            kmrs2 = self.kmers(pSeq[offset:], 3)
                            interJ = set(kmrs2) & jkeys
                            jlist = []
github J535D165 / recordlinkage / recordlinkage / algorithms / string.py View on Github external
def levenshtein_apply(x):
    """Return the normalised Levenshtein similarity for the pair ``x``.

    ``x[0]`` and ``x[1]`` are the two values to compare. The result is
    ``1 - distance / max(len(x[0]), len(x[1]))``.

    If the computation fails and either value is null according to
    ``pandas.isnull``, ``np.nan`` is returned; any other failure is
    re-raised unchanged.
    """
    try:
        first, second = x[0], x[1]
        edits = jellyfish.levenshtein_distance(first, second)
        # NOTE(review): for two empty strings the denominator is 0 --
        # confirm the resulting value is what callers expect.
        longest = np.max([len(first), len(second)])
        return 1 - edits / longest
    except Exception as err:
        # Only null inputs are silently mapped to NaN; everything else
        # is a genuine error and propagates to the caller.
        if not (pandas.isnull(x[0]) or pandas.isnull(x[1])):
            raise err
        return np.nan
github mandricigor / imrep / imrep3.py View on Github external
v_cl[self.v_chain_type[v]] = []
                            v_cl[self.v_chain_type[v]].append(v)
                        f, s = pSeq[:pos1], pSeq[pos1 + 1:]
                        for v1, v2 in v_cl.items():
                            for v3 in v2:
                                if v3 not in self.vi_pieces:
                                    continue
                                v, vv = self.vi_pieces[v3]
                                minlen1 = min(len(f), len(v))
                                minlen2 = min(len(s), len(vv))
                                if minlen1 > 0:
                                    mismatch1 = jellyfish.levenshtein_distance(unicode(f[-minlen1:]), unicode(v[-minlen1:]))
                                else:
                                    mismatch1 = 0
                                if minlen2 > 0:
                                    mismatch2 = jellyfish.levenshtein_distance(unicode(s[:minlen2]), unicode(vv[:minlen2]))
                                else:
                                    mismatch2 = 0
                                if (minlen1 == 0 and mismatch2 <= 1) or (minlen1 > 3 and mismatch1 <= 1 and minlen2 >= 2 and mismatch2 <= 2):
                                    vtypes[v3] = mismatch1 + mismatch2
                    if pos2 != [-1, -1]:
                        if pos2[0] != -1:
                            if pos2[1] > 10:
                                offset = pos2[1] - 10
                            else:
                                offset = 0
                            kmrs2 = self.kmers(pSeq[offset:], 3)
                            interJ = set(kmrs2) & jkeys
                            jlist = []
                            for j in interJ:
                                jlist.extend(list(self.hashJ[j]))
                            if jlist:
github mandricigor / imrep / imrep2.py View on Github external
mismatch1 = jellyfish.levenshtein_distance(unicode(f[-minlen1:]), unicode(v[-minlen1:]))
                                else:
                                    mismatch1 = 0
                                if minlen2 > 0:
                                    mismatch2 = jellyfish.levenshtein_distance(unicode(s[:minlen2]), unicode(vv[:minlen2]))
                                else:
                                    mismatch2 = 0
                                if (minlen1 == 0 and mismatch2 == 0) or (minlen1 > 0 and mismatch1 <= 1 and minlen2 > 0 and mismatch2 <= 2):
                                    vtypes.add(v_t)
                        if pos2 != -1:
                            f, s = pSeq[:pos2], pSeq[pos2 + 1:]
                            for j, jj, j_t in self.jays[chain_name]:
                                minlen1 = min(len(f), len(j))
                                minlen2 = min(len(s), len(jj))
                                if minlen2 > 0:
                                    mismatch2 = jellyfish.levenshtein_distance(unicode(s[:minlen2]), unicode(jj[:minlen2]))
                                else:
                                    mismatch2 = 0
                                if minlen1 > 0:
                                    mismatch1 = jellyfish.levenshtein_distance(unicode(f[-minlen1:]), unicode(j[-minlen1:]))
                                else:
                                    mismatch1 = 0
                                if (minlen2 == 0 and mismatch1 <= 1) or (minlen2 > 0 and mismatch2 <= 1 and minlen1 > 0 and mismatch1 <= 0):
                                    jtypes.add(j_t)
                        if vtypes and jtypes:
                            found = True
                            cdr3 = pSeq[pos1: pos2 + 1]
                            if cdr3 not in self.pSeq_read_map:
                                full_cdr3.append(cdr3)
                                self.pSeq_read_map[cdr3] = {"v": vtypes, "j": jtypes, "chain_type": chain_name}
                        elif vtypes:
                            found = True
github mandricigor / imrep / imrep2.py View on Github external
pos2 = pSeq.rfind(letter)
                        if pos1 != -1 and pos2 != -1 and pos2 - pos1 < 5:
                            continue
                        vtypes = set()
                        jtypes = set()
                        if pos1 != -1:
                            f, s = pSeq[:pos1], pSeq[pos1 + 1:]
                            for v, vv, v_t in self.vis[chain_name]:
                                minlen1 = min(len(f), len(v))
                                minlen2 = min(len(s), len(vv))
                                if minlen1 > 0:
                                    mismatch1 = jellyfish.levenshtein_distance(unicode(f[-minlen1:]), unicode(v[-minlen1:]))
                                else:
                                    mismatch1 = 0
                                if minlen2 > 0:
                                    mismatch2 = jellyfish.levenshtein_distance(unicode(s[:minlen2]), unicode(vv[:minlen2]))
                                else:
                                    mismatch2 = 0
                                if (minlen1 == 0 and mismatch2 == 0) or (minlen1 > 0 and mismatch1 <= 1 and minlen2 > 0 and mismatch2 <= 2):
                                    vtypes.add(v_t)
                        if pos2 != -1:
                            f, s = pSeq[:pos2], pSeq[pos2 + 1:]
                            for j, jj, j_t in self.jays[chain_name]:
                                minlen1 = min(len(f), len(j))
                                minlen2 = min(len(s), len(jj))
                                if minlen2 > 0:
                                    mismatch2 = jellyfish.levenshtein_distance(unicode(s[:minlen2]), unicode(jj[:minlen2]))
                                else:
                                    mismatch2 = 0
                                if minlen1 > 0:
                                    mismatch1 = jellyfish.levenshtein_distance(unicode(f[-minlen1:]), unicode(j[-minlen1:]))
                                else:
github LIAAD / yake / yake / datarepresentation.py View on Github external
features_cand.append(doc_id)

        if keys != None:
            if rel:
                columns.append('rel')
                if self.unique_kw in keys or isVirtual:
                    features_cand.append(1)
                    seen.add(self.unique_kw)
                else:
                    features_cand.append(0)

            if rel_approx:
                columns.append('rel_approx')
                max_gold_ = ('', 0.)
                for gold_key in keys:
                    dist = 1.-jellyfish.levenshtein_distance(gold_key, self.unique_kw ) / max(len(gold_key), len(self.unique_kw)) # _tL
                    if max_gold_[1] < dist:
                        max_gold_ = ( gold_key, dist )
                features_cand.append(max_gold_[1])

        columns.append('kw')
        features_cand.append(self.unique_kw)
        columns.append('h')
        features_cand.append(self.H)
        columns.append('tf')
        features_cand.append(self.tf)
        columns.append('size')
        features_cand.append(self.size)
        columns.append('isVirtual')
        features_cand.append(int(isVirtual))

        for feature_name in features: