How to use the jellyfish.jaro_winkler function in jellyfish

To help you get started, we’ve selected a few jellyfish examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github AwesomeLemon / document-recognition / test_accuracy.py View on Github external
def jaro_dist(scan_res, desired):
    """Return the Jaro-Winkler score for the scanned vs. the desired text.

    Each argument is first passed through ``get_file_as_string``
    (presumably loading the file's text -- confirm against that helper),
    and the two results are compared with ``long_tolerance`` enabled.
    """
    return jellyfish.jaro_winkler(
        get_file_as_string(scan_res),
        get_file_as_string(desired),
        long_tolerance=True,
    )
github SEED-platform / seed / seed / lib / mcm / matchers.py View on Github external
# verify that the category has two elements, if not, then just
        # return _ for the first category. Need this because fuzzy_in_set uses the
        # same method
        table_name = '_'
        category = None
        if isinstance(cat, tuple):
            table_name = cat[0]
            category = cat[1]
        else:
            category = cat

        scores.append(
            (
                table_name,
                category,
                jellyfish.jaro_winkler(
                    str(s.encode('ascii', 'replace').lower()),
                    str(category.encode('ascii', 'replace').lower())
                )
            )
        )

        # sort first by the ones

    # print('all scores for {} are {}'.format(s, scores))
    scores.sort()
    scores = sorted(scores, key=cmp_to_key(sort_scores))
    # take the top n number of matches
    scores = scores[:top_n]
    # convert to hundreds
    scores = [(score[0], score[1], int(score[2] * 100)) for score in scores]
    # print('ending all categories match of {} with scores {}'.format(s, scores))
github LIAAD / yake / yake / yake.py View on Github external
def jaro(self, cand1, cand2):
    """Return jellyfish's Jaro-Winkler score for ``cand1`` and ``cand2``."""
    score = jellyfish.jaro_winkler(cand1, cand2)
    return score
github chrismattmann / tika-similarity / features.py View on Github external
def jaro_winkler_similarity(s, t):
    """Return the Jaro-Winkler similarity of ``s`` and ``t`` via jellyfish."""
    return jellyfish.jaro_winkler(s, t)
github layday / instawow / instawow / manager.py View on Github external
await self.synchronise()

        s = normalise(search_terms)
        tokens_to_defns = bucketise(
            (
                (normalise(i.name), (i.source, i.id))
                for i in self.catalogue.__root__
                if self.config.game_flavour in i.compatibility
            ),
            key=lambda v: v[0],
        )

        # TODO: weigh matches under threshold against download count
        matches = heapq.nlargest(
            limit, ((jaro_winkler(s, n), n) for n in tokens_to_defns.keys()), key=lambda v: v[0]
        )
        defns = [Defn(*d) for _, m in matches for _, d in tokens_to_defns[m]]
        results = await self.resolve(defns)
        pkgs_by_defn = {d.with_name(r.slug): r for d, r in results.items() if is_pkg(r)}
        return pkgs_by_defn
github olefriis / simplepvr / python / simplepvr / util / auto_mapper.py View on Github external
# For every <channel> element in the tree, score its display name against
# each entry of hdhr_names using Jaro-Winkler, with spaces stripped from both
# sides, and collect the per-name scores in match_scores.
for channel in element_tree.getroot().findall('channel'):
        found_match = False
        match_scores = []
        # The display name is encoded to the console encoding before comparison.
        name__text = channel.find('display-name').text.encode(sys.stdout.encoding)

        icon_url = channel.find('icon').attrib['src']

        stripped_name = name__text.replace(" ", "")

        for hdhr_name in hdhr_names:
            # Default score; kept when the comparison below raises.
            score = 0
            safe_hdhr_name = hdhr_name if is_ascii(hdhr_name) else hdhr_name.decode(sys.stdout.encoding)
            stripped_hdhr_name = safe_hdhr_name.replace(" ", "")
            try:
                score = jellyfish.jaro_winkler(stripped_name, stripped_hdhr_name )
            except UnicodeEncodeError:
                # Scoring failed on encoding: log a warning and leave score at 0.
                try:
                    safe_name_text = name__text if is_ascii(name__text) else name__text.decode(sys.stdout.encoding)
                    logger.warn(u"Unable to do score for '{0}' vs '{1}'".format(safe_name_text, safe_hdhr_name))
                except UnicodeEncodeError:
                    ## If we cannot even log our error because of encoding, log a new error that is "safe"
                    # NOTE(review): if the exception came from the assignment
                    # above rather than from the log call, safe_name_text may
                    # not be bound here -- confirm.
                    safe_hdhr_name = to_utf8(safe_hdhr_name)
                    safe_name_text = to_utf8(safe_name_text)
                    logger.warn(u"Unable to do score calculation for {0} - {1} - console encoding: {2}".format(safe_name_text, safe_hdhr_name, sys.stdout.encoding))
#                    logger.warn(name__text, " <-> ", hdhr_name, " isAscii: ", is_ascii(hdhr_name), " -- Safe version ", safe_name_text, " - ", safe_hdhr_name, " - sys encoding: ", sys.stdout.encoding)

            match_scores.append(score)

        # Best score across all hdhr_names for this channel.
        maxValue = max(match_scores)
github inspirehep / beard / beard / similarity / pairs.py View on Github external
def _use_similarity(x, y):
    """Return the Jaro-Winkler score of ``x`` and ``y``, or -1 for short input.

    When either argument has length 0 or 1 the comparison is skipped and
    ``-1.`` is returned instead.
    """
    if len(x) > 1 and len(y) > 1:
        # jaro_winkler crashes if slashes are provided.
        return jellyfish.jaro_winkler(x, y)

    return -1.
github davidsbatista / BREDS / automatic-evaluation / code-examples / compare-results.py View on Github external
f_not_found = open("not_found.txt","w")
    f_negative  = open("negative.txt","w")
    f_positive  = open("positive.txt","w")

    tuples_not_found = set()

    for t in total_uniq:
        # try a direct match
        per_extracted = t[0].decode("utf8").upper().strip()
        org_truth = ground_truth.get(per_extracted)
        found = False;
        if org_truth:
            # if there is a direct look for similar organisations
            for org in org_truth:
                score = jellyfish.jaro_winkler(org.encode("utf8"),t[1].upper())
                if score>=0.8:
                    f_positive.write(t[0]+'\t'+t[1]+'\n')
                    positive += 1
                    found = True
                    break;

            if found == False:
                negative += 1
                f_negative.write(t[0]+'\t'+t[1]+'\t\t:'+';'.join(org_truth).encode("utf8")+'\n')

        else:
            tuples_not_found.add(t)
            not_found += 1

    for t in tuples_not_found:
        f_not_found.write(t[0]+'\t'+t[1]+'\n')