How to use textdistance - 10 common examples

To help you get started, we’ve selected a few textdistance examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github life4 / textdistance / textdistance / algorithms / compression_based.py View on Github external
probs = self._make_probs(data)
        start, end = self._get_range(data=data, probs=probs)
        output_fraction = Fraction(0, 1)
        output_denominator = 1
        while not (start <= output_fraction < end):
            output_numerator = 1 + ((start.numerator * output_denominator) // start.denominator)
            output_fraction = Fraction(output_numerator, output_denominator)
            output_denominator *= 2
        return output_fraction

    def _get_size(self, data):
        """Return the size of the compressed representation of `data`.

        The size is the ceiling of the logarithm, in base ``self.base``, of
        the numerator of the value returned by ``self._compress(data)``.

        A numerator of 0 (e.g. a compressed value of ``Fraction(0, 1)``) has
        no logarithm: ``math.log(0, base)`` raises ``ValueError``. Such input
        is reported as size 0 instead of crashing.
        """
        numerator = self._compress(data).numerator
        if numerator == 0:
            # log(0) is undefined; treat it as "nothing to encode"
            return 0
        return math.ceil(math.log(numerator, self.base))


class RLENCD(_NCDBase):
    """Run-length encoding

    https://en.wikipedia.org/wiki/Run-length_encoding
    """

    def _compress(self, data):
        """Run-length encode `data` and return the result as one string.

        Consecutive equal items are grouped. A single item is kept as is,
        a pair is written out twice, and a run of three or more is written
        as its length followed by the item.
        """
        chunks = []
        for symbol, run in groupby(data):
            run_length = sum(1 for _ in run)
            if run_length == 1:
                chunks.append(symbol)
            elif run_length == 2:
                # "2x" would not be shorter than "xx", so keep the pair
                chunks.append(2 * symbol)
            else:
                chunks.append(str(run_length) + symbol)
        return ''.join(chunks)
github life4 / textdistance / run_tests.py View on Github external
import os

try:
    import unittest2 as unittest
except ImportError:
    import unittest

import textdistance
from textdistance.libraries import prototype


# NOTE(review): `prototype.clone()` lives in textdistance.libraries and is not
# visible here; presumably it returns an independent copy of the registry of
# external libraries - confirm against that module.
libraries = prototype.clone()
# Earlier, lenient variant kept for reference: it falls back to 'yes' when the
# environment variable is not set.
# CONSTRAINTS = os.getenv('WITH_CONSTRAINTS', 'yes') == 'yes'
# NUMPY = os.getenv('WITH_NUMPY', 'yes') == 'yes'
# Strict variant in use: both variables must be set, otherwise the lookup
# raises KeyError as soon as this module is executed. Each flag is True only
# when the variable equals exactly 'yes'.
CONSTRAINTS = os.environ['WITH_CONSTRAINTS'] == 'yes'
NUMPY = os.environ['WITH_NUMPY'] == 'yes'


# The star import places whatever `tests` exports into this module's
# namespace; unittest.main() below collects test cases from the module it is
# run from. The import sits after the assignments above - presumably because
# the tests read them; confirm before moving it.
from tests import *  # noQA


# Run the collected tests only when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
github life4 / textdistance / textdistance / algorithms / simple.py View on Github external
# app
from .base import Base as _Base, BaseSimilarity as _BaseSimilarity


__all__ = [
    'Prefix', 'Postfix', 'Length', 'Identity', 'Matrix',
    'prefix', 'postfix', 'length', 'identity', 'matrix',
]

# Python 2/3 compatibility: the name `unicode` exists only on Python 2.
# On Python 3 the lookup raises NameError and `str` alone is used.
try:
    string_types = (str, unicode)
except NameError:
    string_types = (str, )


class Prefix(_BaseSimilarity):
    """prefix similarity
    """
    def __init__(self, qval=1, sim_test=None):
        self.qval = qval
        self.sim_test = sim_test or self._ident

    def __call__(self, *sequences):
        if not sequences:
            return 0
        sequences = self._get_sequences(*sequences)
        test = lambda seq: self.sim_test(*seq)  # noQA
        result = [c[0] for c in takewhile(test, zip(*sequences))]

        s = sequences[0]
        if isinstance(s, string_types):
            return ''.join(result)
github life4 / textdistance / textdistance / algorithms / base.py View on Github external
def _get_sequences(self, *sequences):
        """Split the input sequences according to ``self.qval``.

        A falsy ``qval`` (e.g. ``None``) splits every sequence with
        ``split()``, so text is compared word by word.
        ``qval == 1`` returns the sequences exactly as received, so text is
        compared letter by letter.
        Any other ``qval`` converts each sequence to q-grams with
        ``find_ngrams``.
        """
        size = self.qval
        if not size:
            # word mode
            return [seq.split() for seq in sequences]
        if size != 1:
            # q-gram mode
            return [find_ngrams(seq, size) for seq in sequences]
        # character mode: hand the arguments back unchanged
        return sequences
github life4 / textdistance / textdistance / algorithms / edit_based.py View on Github external
q_mat[i - 1, j - 1] + sim_val,
                )
                p_mat[i, j] = max(
                    d_mat[i - 1, j] - self.gap_open,
                    p_mat[i - 1, j] - self.gap_ext,
                )
                q_mat[i, j] = max(
                    d_mat[i, j - 1] - self.gap_open,
                    q_mat[i, j - 1] - self.gap_ext,
                )

        i, j = (n - 1 for n in d_mat.shape)
        return max(d_mat[i, j], p_mat[i, j], q_mat[i, j])


class StrCmp95(_BaseSimilarity):
    """strcmp95 similarity

    http://cpansearch.perl.org/src/SCW/Text-JaroWinkler-0.1/strcmp95.c
    """
    sp_mx = (
        ('A', 'E'), ('A', 'I'), ('A', 'O'), ('A', 'U'), ('B', 'V'), ('E', 'I'),
        ('E', 'O'), ('E', 'U'), ('I', 'O'), ('I', 'U'), ('O', 'U'), ('I', 'Y'),
        ('E', 'Y'), ('C', 'G'), ('E', 'F'), ('W', 'U'), ('W', 'V'), ('X', 'K'),
        ('S', 'Z'), ('X', 'S'), ('Q', 'C'), ('U', 'V'), ('M', 'N'), ('L', 'I'),
        ('Q', 'O'), ('P', 'R'), ('I', 'J'), ('2', 'Z'), ('5', 'S'), ('8', 'B'),
        ('1', 'I'), ('1', 'L'), ('0', 'O'), ('0', 'Q'), ('C', 'K'), ('G', 'J'),
    )

    def __init__(self, long_strings=False, external=True):
        self.long_strings = long_strings
        self.external = external
github life4 / textdistance / textdistance / algorithms / token_based.py View on Github external
return 1

    def __call__(self, *sequences):
        """Return the intersection count divided by the union count.

        If ``self.quick_answer`` yields anything other than ``None``, that
        value is returned immediately. Otherwise the sequences are turned
        into counters, and the size of their intersection is divided by the
        size of their union (as a float).
        """
        shortcut = self.quick_answer(*sequences)
        if shortcut is not None:
            return shortcut

        counters = self._get_counters(*sequences)
        # number of items shared by all counters
        common = self._count_counters(self._intersect_counters(*counters))
        # number of items present in any counter
        combined = self._count_counters(self._union_counters(*counters))
        return common / float(combined)


class Sorensen(_BaseSimilarity):
    """
    Compute the Sorensen distance between the two sequences.
    They should contain hashable items.
    The return value is a float between 0 and 1, where 0 means equal,
    and 1 totally different.

    https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/distance/dice.js
    """
    def __init__(self, qval=1, as_set=False):
        self.qval = qval
        self.as_set = as_set

    def maximum(self, *sequences):
        return 1
github life4 / textdistance / textdistance / algorithms / token_based.py View on Github external
try:
    from functools import reduce
except ImportError:
    pass


__all__ = [
    'Jaccard', 'Sorensen', 'Tversky',
    'Overlap', 'Cosine', 'Tanimoto', 'MongeElkan', 'Bag',

    'jaccard', 'sorensen', 'tversky', 'sorensen_dice',
    'overlap', 'cosine', 'tanimoto', 'monge_elkan', 'bag',
]


class Jaccard(_BaseSimilarity):
    """
    Compute the Jaccard similarity between the two sequences.
    They should contain hashable items.
    The return value is a float between 0 and 1, where 1 means equal,
    and 0 totally different.

    https://en.wikipedia.org/wiki/Jaccard_index
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/distance/jaccard.js
    """
    def __init__(self, qval=1, as_set=False, external=True):
        self.qval = qval
        self.as_set = as_set
        self.external = external

    def maximum(self, *sequences):
        return 1
github life4 / textdistance / textdistance / algorithms / token_based.py View on Github external
if len(sequences) == 2 or self.bias is None:
            result = intersection
            for k, s in zip(ks, sequences):
                result += k * (s - intersection)
            return float(intersection) / result

        s1, s2 = sequences
        alpha, beta = ks
        a_val = min([s1, s2])
        b_val = max([s1, s2])
        c_val = float(intersection + self.bias)
        result = alpha * beta * (a_val - b_val) + b_val * beta
        return c_val / (result + c_val)


class Overlap(_BaseSimilarity):
    """overlap coefficient

    https://en.wikipedia.org/wiki/Overlap_coefficient
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/distance/overlap.js
    """
    def __init__(self, qval=1, as_set=False, external=True):
        self.qval = qval
        self.as_set = as_set
        self.external = external

    def maximum(self, *sequences):
        return 1

    def __call__(self, *sequences):
        result = self.quick_answer(*sequences)
        if result is not None:
github life4 / textdistance / textdistance / algorithms / token_based.py View on Github external
return 1

    def __call__(self, *sequences):
        """Return the intersection count divided by the smallest sequence count.

        If ``self.quick_answer`` yields anything other than ``None``, that
        value is returned immediately. Otherwise the sequences are turned
        into counters, and the size of their intersection is divided by the
        size of the smallest counter.
        """
        shortcut = self.quick_answer(*sequences)
        if shortcut is not None:
            return shortcut

        counters = self._get_counters(*sequences)
        # number of items shared by all counters
        shared = self._count_counters(self._intersect_counters(*counters))
        # individual size of every counter
        sizes = [self._count_counters(counter) for counter in counters]

        return float(shared) / min(sizes)


class Cosine(_BaseSimilarity):
    """cosine similarity (Ochiai coefficient)

    https://en.wikipedia.org/wiki/Cosine_similarity
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/distance/cosine.js
    """
    def __init__(self, qval=1, as_set=False):
        self.qval = qval
        self.as_set = as_set

    def maximum(self, *sequences):
        return 1

    def __call__(self, *sequences):
        result = self.quick_answer(*sequences)
        if result is not None:
            return result
github life4 / textdistance / textdistance / algorithms / sequence_based.py View on Github external
try:
    import numpy
except ImportError:
    from array import array
    numpy = None


__all__ = [
    'lcsseq', 'lcsstr', 'ratcliff_obershelp',
    'LCSSeq', 'LCSStr', 'RatcliffObershelp',
]


class LCSSeq(_BaseSimilarity):
    """longest common subsequence similarity

    https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
    """
    def __init__(self, qval=1, test_func=None):
        self.qval = qval
        self.test_func = test_func or self._ident

    def _dynamic(self, seq1, seq2):
        """
        https://github.com/chrislit/abydos/blob/master/abydos/distance/_lcsseq.py
        http://www.dis.uniroma1.it/~bonifaci/algo/LCSSEQ.py
        http://rosettacode.org/wiki/Longest_common_subsequence#Dynamic_Programming_8
        """
        if numpy:
            lengths = numpy.zeros((len(seq1) + 1, len(seq2) + 1), dtype=numpy.int)