How to use the jellyfish.damerau_levenshtein_distance function in jellyfish

To help you get started, we’ve selected a few jellyfish examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github LaureBerti / Learn2Clean / python-package / learn2clean / duplicate_detection / duplicate_detector.py View on Github external
for row in data.index:

            data[row] = data[row].lower()

        out = pd.DataFrame(columns=["Dup_ID1", "Dup_ID2", "Dup_1", "Dup_2"])

        if metric == "DL":  # Damerau Levenshtein Distance

            res = {_d: [] for _d in data}

            for _d in res.keys():

                for row in data.index:

                    if _d != data[row] \
                        and jf.damerau_levenshtein_distance(_d, data[row]) < \
                            ((len(_d) + len(data[row])/2)*threshold):

                        res[_d].append(data[row])

                        out.loc[len(out)] = (
                            _d.split("*")[-1], row, _d, data[row])

        elif metric == "LM":  # Levenshtein Distance

            res = {_d: [] for _d in data}

            for _d in res.keys():

                for row in data.index:

                    if _d != data[row] \
github arkhn / pagai / pagai / engine / structure / column.py View on Github external
def match_name_score(self, query):
        """Score how closely ``query`` matches this column's names.

        The candidates are ``self.table``, ``self.column`` and every
        dot-separated piece of ``self.column``.  Each candidate is compared
        to ``query`` with the Damerau-Levenshtein distance; when ``query``
        occurs as a substring of the candidate, that distance is halved.

        Returns the smallest resulting value (lower means a closer match).
        """
        candidates = [self.table, self.column]
        candidates.extend(self.column.split("."))

        # Large starting value so the first real distance always replaces it.
        best = 10e10
        for candidate in candidates:
            dist = jellyfish.damerau_levenshtein_distance(candidate, query)
            if query in candidate:
                # Substring hits are favoured by halving their distance.
                dist = dist / 2
            if dist < best:
                best = dist
        return best
github target / huntlib / huntlib / __init__.py View on Github external
Available algorithms:
        * levenshtein
        * damerau-levenshtein (DEFAULT)
        * hamming
        * jaro
        * jaro-winkler

    Return values:
        "levenshtein", "damerau-levenshtein" and "hamming" return integers
        "jaro" and "jaro-winkler" return floats in the range of 0.0 (completely
        different) to 1.0 (identical strings).
    '''
    algos = {
        "levenshtein":levenshtein_distance,
        "damerau-levenshtein":damerau_levenshtein_distance,
        "hamming":hamming_distance,
        "jaro":jaro_similarity,
        "jaro-winkler":jaro_winkler_similarity
    }

    if not method in list(algos.keys()):
        raise ValueError("Unsupported algorithm type: %s" % method)

    if str1 is None or str2 is None or not isinstance(str1, str) or not isinstance(str2, str):
        raise TypeError("Arguments must be strings.")

    distance_function = algos[method]

    # All the jellyfish distance functions expect unicode, which is the default
    # for Python3.  If we're running in Python2, we need to convert them.
    python_version = sys.version_info
github facebookincubator / python-nubia / nubia / internal / registry.py View on Github external
def are_close_enough(this, that):
            """Return True when the two strings are at most two edits apart.

            The edit count is the Damerau-Levenshtein distance between
            ``this`` and ``that``.
            """
            edit_distance = jellyfish.damerau_levenshtein_distance(this, that)
            return edit_distance <= 2
github WeblateOrg / weblate / weblate / utils / search.py View on Github external
def similarity(self, first, second):
        """Return the similarity of two strings as an integer percentage.

        The Damerau-Levenshtein distance between ``first`` and ``second`` is
        divided by the length of the longer string (at least 1, so two empty
        strings do not divide by zero) and mapped so that a distance of zero
        yields 100.

        If computing the distance raises ``MemoryError``, a fixed value of
        50 is returned instead.
        """
        try:
            try:
                edits = damerau_levenshtein_distance(first, second)
            except ValueError:
                # Fallback implementation; per the original note this is only
                # needed on Python 2 (actually jellyfish < 0.7.2).
                edits = py_damerau_levenshtein_distance(first, second)

            longest = max(len(first), len(second), 1)
            ratio = 1.0 - (float(edits) / longest)
            return int(100 * ratio)
        except MemoryError:
            # Strings too long to compare: report them as not very similar.
            return 50
github J535D165 / recordlinkage / recordlinkage / algorithms / string.py View on Github external
def damerau_levenshtein_apply(x):
        """Return a Damerau-Levenshtein similarity for the pair ``x``.

        ``x[0]`` and ``x[1]`` are the two values to compare.  The result is
        one minus the edit distance divided by the length of the longer
        value.

        If the computation fails and either value is null (according to
        ``pandas.isnull``), ``np.nan`` is returned; any other failure is
        re-raised unchanged.
        """
        try:
            left, right = x[0], x[1]
            edits = jellyfish.damerau_levenshtein_distance(left, right)
            longest = np.max([len(left), len(right)])
            return 1 - edits / longest
        except Exception as err:
            # Only missing values are tolerated; real errors must surface.
            if not (pandas.isnull(x[0]) or pandas.isnull(x[1])):
                raise err
            return np.nan