How to use the textdistance.algorithms.base.Base class in textdistance

To help you get started, we’ve selected a few textdistance examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github life4 / textdistance / textdistance / algorithms / edit_based.py View on Github external
dist = self.test_func(s1[r - 1], s2[c - 1])
                edit = prev[c - 1] + (not dist)
                cur[c] = min(edit, deletion, insertion)
        return cur[-1]

    def __call__(self, s1, s2):
        """Compute the result for the pair ``s1``, ``s2``.

        Both inputs are first passed through ``_get_sequences``. When
        ``quick_answer`` returns anything other than ``None`` that value is
        returned as is; otherwise the work is delegated to ``_cicled``.
        """
        s1, s2 = self._get_sequences(s1, s2)
        shortcut = self.quick_answer(s1, s2)
        # Only run the full computation when no shortcut value exists.
        return self._cicled(s1, s2) if shortcut is None else shortcut


class DamerauLevenshtein(_Base):
    """
    Compute the absolute Damerau-Levenshtein distance between the two sequences.
    The Damerau-Levenshtein distance is the minimum number of edit operations necessary
    for transforming one sequence into the other. The edit operations allowed are:

        * deletion:      ABC -> BC, AC, AB
        * insertion:     ABC -> ABCD, EABC, AEBC..
        * substitution:  ABC -> ABE, ADC, FBC..
        * transposition: ABC -> ACB, BAC

    https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
    """
    def __init__(self, qval=1, test_func=None, external=True):
        """Store the configuration on the instance.

        ``qval`` and ``external`` are stored unchanged. ``test_func`` is
        stored when truthy; any falsy value (including the default ``None``)
        selects ``self._ident`` instead.
        """
        self.qval = qval
        self.test_func = test_func if test_func else self._ident
        self.external = external
github life4 / textdistance / textdistance / algorithms / vector_based.py View on Github external
result = sum(e ** self.p for e in result)
        return result ** (1.0 / self.p)

    def __call__(self, s1, s2):
        """Dispatch to ``_numpy`` when the ``numpy`` name is truthy, else to ``_pure``."""
        compute = self._numpy if numpy else self._pure
        return compute(s1, s2)


class Manhattan(_Base):
    """Placeholder: calling an instance always raises ``NotImplementedError``."""
    def __call__(self, s1, s2):
        raise NotImplementedError


class Euclidean(_Base):
    def __init__(self, squared=False):
        # When truthy, ``_numpy`` returns its value without taking the square root.
        self.squared = squared

    def _numpy(self, s1, s2):
        """Compute the Euclidean distance between ``s1`` and ``s2`` with numpy.

        Both arguments are converted with ``numpy.asarray`` and subtracted
        element-wise. Returns the sum of squared differences when
        ``self.squared`` is truthy, otherwise the square root of that sum.
        """
        diff = numpy.asarray(s1) - numpy.asarray(s2)
        # ``numpy.matrix`` is no longer recommended by numpy; a plain 2-D
        # array multiplied with ``numpy.dot`` yields the same product.
        row = numpy.atleast_2d(diff)
        result = numpy.dot(row, row.T).sum()
        if self.squared:
            return result
        return numpy.sqrt(result)

    def _pure(self, s1, s2):
        # No pure-Python implementation is provided here; calling this raises.
        raise NotImplementedError

    def __call__(self, s1, s2):
github life4 / textdistance / textdistance / algorithms / edit_based.py View on Github external
self.test_func = test_func or self._ident
        self.truncate = truncate
        self.external = external

    def __call__(self, *sequences):
        """Count the aligned positions that ``test_func`` rejects.

        The inputs are passed through ``_get_sequences`` and a non-``None``
        ``quick_answer`` is returned directly. Otherwise the sequences are
        walked in lockstep -- stopping at the shortest one when
        ``self.truncate`` is truthy, padding to the longest one otherwise --
        and every position for which ``test_func`` is falsy adds one.
        """
        sequences = self._get_sequences(*sequences)

        shortcut = self.quick_answer(*sequences)
        if shortcut is not None:
            return shortcut

        if self.truncate:
            pair_up = zip
        else:
            pair_up = zip_longest

        mismatches = 0
        for elements in pair_up(*sequences):
            if not self.test_func(*elements):
                mismatches += 1
        return mismatches


class Levenshtein(_Base):
    """
    Compute the absolute Levenshtein distance between the two sequences.
    The Levenshtein distance is the minimum number of edit operations necessary
    for transforming one sequence into the other. The edit operations allowed are:

        * deletion:     ABC -> BC, AC, AB
        * insertion:    ABC -> ABCD, EABC, AEBC..
        * substitution: ABC -> ABE, ADC, FBC..

    https://en.wikipedia.org/wiki/Levenshtein_distance
    TODO: https://gist.github.com/kylebgorman/1081951/9b38b7743a3cb5167ab2c6608ac8eea7fc629dca
    """
    def __init__(self, qval=1, test_func=None, external=True):
        """Store the configuration on the instance.

        ``qval`` and ``external`` are stored unchanged. ``test_func`` is
        stored when truthy; any falsy value (including the default ``None``)
        selects ``self._ident`` instead.
        """
        self.qval = qval
        self.test_func = test_func if test_func else self._ident
        self.external = external
github life4 / textdistance / textdistance / algorithms / simple.py View on Github external
class Postfix(Prefix):
    """postfix similarity
    """
    def __call__(self, *sequences):
        """Run the parent ``__call__`` on reversed inputs and reverse its result.

        The result is joined into ``str``/``bytes`` when the first input is of
        that type and returned as a list otherwise.
        """
        first = sequences[0]
        flipped = [reversed(seq) for seq in sequences]
        common = reversed(super(Postfix, self).__call__(*flipped))
        if isinstance(first, string_types):
            return ''.join(common)
        if isinstance(first, bytes):
            return b''.join(common)
        return list(common)


class Length(_Base):
    """Length distance
    """
    def __call__(self, *sequences):
        """Return the gap between the longest and the shortest input length."""
        sizes = [len(seq) for seq in sequences]
        return max(sizes) - min(sizes)


class Identity(_BaseSimilarity):
    """Identity similarity
    """

    def maximum(self, *sequences):
        # The upper bound is 1 whatever the inputs are.
        return 1

    def __call__(self, *sequences):
        # ``_ident`` is defined outside this class; its result is coerced to int.
        return int(self._ident(*sequences))
github life4 / textdistance / textdistance / algorithms / phonetic.py View on Github external
for chars in zip(*sequences):
                if not self._ident(*chars):
                    new_sequences.append(chars)
            new_sequences = map(list, zip(*new_sequences))
            # update sequences
            ss = zip_longest(new_sequences, sequences, fillvalue=list())
            sequences = [s1 + s2[minlen:] for s1, s2 in ss]
            # update lengths
            lengths = list(map(len, sequences))

        if not lengths:
            return max_length
        return max_length - max(lengths)


class Editex(_Base):
    """
    https://anhaidgroup.github.io/py_stringmatching/v0.3.x/Editex.html
    """
    letter_groups = (
        frozenset('AEIOUY'),
        frozenset('BP'),
        frozenset('CKQ'),
        frozenset('DT'),
        frozenset('LR'),
        frozenset('MN'),
        frozenset('GJ'),
        frozenset('FPV'),
        frozenset('SXZ'),
        frozenset('CSZ'),
    )
    all_letters = frozenset('AEIOUYBPCKQDTLRMNGJFVSXZ')
github life4 / textdistance / textdistance / algorithms / compression_based.py View on Github external
# Names exported by a star-import of this module: the upper-case/CamelCase
# entries first, followed by the lowercase ones.
__all__ = [
    'ArithNCD', 'LZMANCD', 'BZ2NCD', 'RLENCD', 'BWTRLENCD', 'ZLIBNCD',
    'SqrtNCD', 'EntropyNCD',

    'bz2_ncd', 'lzma_ncd', 'arith_ncd', 'rle_ncd', 'bwtrle_ncd', 'zlib_ncd',
    'sqrt_ncd', 'entropy_ncd',
]


# ``unicode`` is a builtin only on Python 2; where the name is missing the
# NameError branch falls back to ``str`` alone.
try:
    string_types = (str, unicode)
except NameError:
    string_types = (str, )


class _NCDBase(_Base):
    """Normalized compression distance (NCD)

    https://en.wikipedia.org/wiki/Normalized_compression_distance#Normalized_compression_distance
    """
    qval = 1

    def __init__(self, qval=1):
        # Shadows the class-level ``qval`` default on this instance.
        self.qval = qval

    def maximum(self, *sequences):
        # The upper bound is the constant 1, independent of the inputs.
        return 1

    def _get_size(self, data):
        # Size is the length of whatever ``self._compress`` returns for ``data``;
        # ``_compress`` is not defined in this part of the class.
        return len(self._compress(data))

    def __call__(self, *sequences):
github life4 / textdistance / textdistance / algorithms / vector_based.py View on Github external
class Chebyshev(_Base):
    """Chebyshev distance: the largest absolute element-wise difference."""

    def _numpy(self, s1, s2):
        """Return the largest absolute difference using numpy arrays."""
        s1, s2 = numpy.asarray(s1), numpy.asarray(s2)
        # Reduce inside numpy: the builtin max() would iterate the array in
        # Python and fails on input that is not one-dimensional.
        return numpy.abs(s1 - s2).max()

    def _pure(self, s1, s2):
        """Return the largest absolute difference over ``zip(s1, s2)`` pairs."""
        return max(abs(e1 - e2) for e1, e2 in zip(s1, s2))

    def __call__(self, s1, s2):
        """Use ``_numpy`` when the ``numpy`` name is truthy, otherwise ``_pure``."""
        if numpy:
            return self._numpy(s1, s2)
        else:
            return self._pure(s1, s2)


class Minkowski(_Base):
    def __init__(self, p=1, weight=1):
        """Store the exponent ``p`` and the ``weight`` factor.

        Raises ``ValueError`` when ``p`` is smaller than 1.
        """
        if p < 1:
            raise ValueError("p must be at least 1")
        self.p, self.weight = p, weight

    def _numpy(self, s1, s2):
        """Compute the value with numpy.

        The weighted absolute differences are raised to ``self.p``, summed,
        and the ``p``-th root of that sum is returned.
        """
        gap = abs(numpy.asarray(s1) - numpy.asarray(s2))
        powered = (self.weight * gap) ** self.p
        return powered.sum() ** (1.0 / self.p)

    def _pure(self, s1, s2):
        """Compute the value without numpy, pairing elements with ``zip``.

        The weighted absolute differences are raised to ``self.p``, summed,
        and the ``p``-th root of that sum is returned.
        """
        total = sum(
            (self.weight * abs(e1 - e2)) ** self.p
            for e1, e2 in zip(s1, s2)
        )
        return total ** (1.0 / self.p)
github life4 / textdistance / textdistance / algorithms / vector_based.py View on Github external
result = (self.weight * abs(s1 - s2)) ** self.p
        return result.sum() ** (1.0 / self.p)

    def _pure(self, s1, s2):
        """Compute the value without numpy, pairing elements with ``zip``.

        The weighted absolute differences are raised to ``self.p``, summed,
        and the ``p``-th root of that sum is returned.
        """
        total = sum(
            (self.weight * abs(e1 - e2)) ** self.p
            for e1, e2 in zip(s1, s2)
        )
        return total ** (1.0 / self.p)

    def __call__(self, s1, s2):
        """Dispatch to ``_numpy`` when the ``numpy`` name is truthy, else to ``_pure``."""
        compute = self._numpy if numpy else self._pure
        return compute(s1, s2)


class Manhattan(_Base):
    """Placeholder: calling an instance always raises ``NotImplementedError``."""
    def __call__(self, s1, s2):
        raise NotImplementedError


class Euclidean(_Base):
    def __init__(self, squared=False):
        # When truthy, ``_numpy`` returns its value without taking the square root.
        self.squared = squared

    def _numpy(self, s1, s2):
        """Compute the Euclidean distance between ``s1`` and ``s2`` with numpy.

        Both arguments are converted with ``numpy.asarray`` and subtracted
        element-wise. Returns the sum of squared differences when
        ``self.squared`` is truthy, otherwise the square root of that sum.
        """
        diff = numpy.asarray(s1) - numpy.asarray(s2)
        # ``numpy.matrix`` is no longer recommended by numpy; a plain 2-D
        # array multiplied with ``numpy.dot`` yields the same product.
        row = numpy.atleast_2d(diff)
        result = numpy.dot(row, row.T).sum()
        if self.squared:
            return result
        return numpy.sqrt(result)
github life4 / textdistance / textdistance / algorithms / base.py View on Github external
def _count_counters(self, counter):
        """Count the elements held by ``counter``.

        With a truthy ``as_set`` attribute on ``self`` the number of distinct
        keys is returned; otherwise the sum of all stored counts.
        """
        distinct_only = getattr(self, 'as_set', False)
        return len(set(counter)) if distinct_only else sum(counter.values())

    def __repr__(self):
        """Render as ``ClassName(<instance __dict__>)``."""
        cls_name = type(self).__name__
        return '{name}({data})'.format(name=cls_name, data=self.__dict__)


class BaseSimilarity(Base):
    def distance(self, *sequences):
        """Return ``maximum(*sequences)`` minus ``similarity(*sequences)``."""
        ceiling = self.maximum(*sequences)
        return ceiling - self.similarity(*sequences)

    def similarity(self, *sequences):
        # Calling the instance itself produces the similarity value.
        return self(*sequences)

    def quick_answer(self, *sequences):
        if not sequences:
            return self.maximum(*sequences)
        if len(sequences) == 1:
            return self.maximum(*sequences)
        if self._ident(*sequences):
            return self.maximum(*sequences)
        if not all(sequences):
            return 0
        # try get answer from external libs
github life4 / textdistance / textdistance / algorithms / vector_based.py View on Github external
result = (q * q.T).sum()
        if self.squared:
            return result
        return numpy.sqrt(result)

    def _pure(self, s1, s2):
        # No pure-Python implementation is provided here; calling this raises.
        raise NotImplementedError

    def __call__(self, s1, s2):
        """Dispatch to ``_numpy`` when the ``numpy`` name is truthy, else to ``_pure``."""
        compute = self._numpy if numpy else self._pure
        return compute(s1, s2)


class Mahalanobis(_Base):
    """Placeholder: calling an instance always raises ``NotImplementedError``."""
    def __call__(self, s1, s2):
        raise NotImplementedError


class Correlation(_BaseSimilarity):
    def _numpy(self, *sequences):
        """Compute the correlation coefficient of the sequences with numpy.

        Each sequence is converted with ``numpy.asarray`` and centred on its
        own mean. The dot product of the centred vectors is then divided by
        the Euclidean norm of every centred vector.
        """
        sequences = [numpy.asarray(s) for s in sequences]
        ssm = [s - s.mean() for s in sequences]
        # The numerator has to use the mean-centred vectors too; the dot
        # product of the raw sequences is only correct when every mean is 0.
        result = reduce(numpy.dot, ssm)
        for sm in ssm:
            # Plain division (not ``/=``) so a centred array is never
            # modified in place while it is still needed for its own norm.
            result = result / numpy.sqrt(numpy.dot(sm, sm))
        return result

    def _pure(self, *sequences):
        # No pure-Python implementation is provided here; calling this raises.
        raise NotImplementedError