How to use the textdistance.algorithms.base.BaseSimilarity class in textdistance

To help you get started, we’ve selected a few textdistance examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github life4 / textdistance / textdistance / algorithms / simple.py View on Github external
# app
from .base import Base as _Base, BaseSimilarity as _BaseSimilarity


__all__ = [
    'Prefix', 'Postfix', 'Length', 'Identity', 'Matrix',
    'prefix', 'postfix', 'length', 'identity', 'matrix',
]

# ``unicode`` only exists on Python 2. Probe for the name and build the tuple
# of string types accordingly, keeping the ``try`` body to the single lookup
# that can fail.
try:
    unicode
except NameError:
    string_types = (str, )
else:
    string_types = (str, unicode)


class Prefix(_BaseSimilarity):
    """prefix similarity
    """
    def __init__(self, qval=1, sim_test=None):
        """Store the configuration of the prefix similarity.

        ``qval`` is kept on the instance unchanged. ``sim_test`` is the
        callable that ``__call__`` applies to each group of aligned items;
        when it is omitted (or falsy) ``self._ident`` is used instead.
        """
        self.qval = qval
        if sim_test:
            self.sim_test = sim_test
        else:
            # No custom test supplied: fall back to the default check.
            # NOTE(review): ``_ident`` is presumably inherited from the base
            # class -- confirm.
            self.sim_test = self._ident

    def __call__(self, *sequences):
        if not sequences:
            return 0
        sequences = self._get_sequences(*sequences)
        test = lambda seq: self.sim_test(*seq)  # noQA
        result = [c[0] for c in takewhile(test, zip(*sequences))]

        s = sequences[0]
        if isinstance(s, string_types):
            return ''.join(result)
github life4 / textdistance / textdistance / algorithms / edit_based.py View on Github external
q_mat[i - 1, j - 1] + sim_val,
                )
                p_mat[i, j] = max(
                    d_mat[i - 1, j] - self.gap_open,
                    p_mat[i - 1, j] - self.gap_ext,
                )
                q_mat[i, j] = max(
                    d_mat[i, j - 1] - self.gap_open,
                    q_mat[i, j - 1] - self.gap_ext,
                )

        i, j = (n - 1 for n in d_mat.shape)
        return max(d_mat[i, j], p_mat[i, j], q_mat[i, j])


class StrCmp95(_BaseSimilarity):
    """strcmp95 similarity

    http://cpansearch.perl.org/src/SCW/Text-JaroWinkler-0.1/strcmp95.c
    """
    sp_mx = (
        ('A', 'E'), ('A', 'I'), ('A', 'O'), ('A', 'U'), ('B', 'V'), ('E', 'I'),
        ('E', 'O'), ('E', 'U'), ('I', 'O'), ('I', 'U'), ('O', 'U'), ('I', 'Y'),
        ('E', 'Y'), ('C', 'G'), ('E', 'F'), ('W', 'U'), ('W', 'V'), ('X', 'K'),
        ('S', 'Z'), ('X', 'S'), ('Q', 'C'), ('U', 'V'), ('M', 'N'), ('L', 'I'),
        ('Q', 'O'), ('P', 'R'), ('I', 'J'), ('2', 'Z'), ('5', 'S'), ('8', 'B'),
        ('1', 'I'), ('1', 'L'), ('0', 'O'), ('0', 'Q'), ('C', 'K'), ('G', 'J'),
    )

    def __init__(self, long_strings=False, external=True):
        """Store the ``long_strings`` and ``external`` options on the instance."""
        self.long_strings = long_strings
        self.external = external
github life4 / textdistance / textdistance / algorithms / token_based.py View on Github external
return 1

    def __call__(self, *sequences):
        """Return the counted intersection divided by the counted union.

        ``quick_answer`` is consulted first; whenever it yields something
        other than ``None`` that value is returned as is.
        """
        shortcut = self.quick_answer(*sequences)
        if shortcut is not None:
            return shortcut

        counters = self._get_counters(*sequences)
        common = self._count_counters(self._intersect_counters(*counters))
        total = self._count_counters(self._union_counters(*counters))
        # float() forces true division even when both counts are integers
        return common / float(total)


class Sorensen(_BaseSimilarity):
    """
    Compute the Sorensen distance between the two sequences.
    They should contain hashable items.
    The return value is a float between 0 and 1, where 0 means equal,
    and 1 totally different.

    https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/distance/dice.js
    """
    def __init__(self, qval=1, as_set=False):
        """Store the ``qval`` and ``as_set`` options on the instance."""
        self.qval = qval
        self.as_set = as_set

    def maximum(self, *sequences):
        """Return 1 regardless of the sequences passed in."""
        # NOTE(review): presumably the upper bound of the 0..1 range mentioned
        # in the class docstring -- confirm against the base class.
        return 1
github life4 / textdistance / textdistance / algorithms / token_based.py View on Github external
try:
    from functools import reduce
except ImportError:
    pass


__all__ = [
    'Jaccard', 'Sorensen', 'Tversky',
    'Overlap', 'Cosine', 'Tanimoto', 'MongeElkan', 'Bag',

    'jaccard', 'sorensen', 'tversky', 'sorensen_dice',
    'overlap', 'cosine', 'tanimoto', 'monge_elkan', 'bag',
]


class Jaccard(_BaseSimilarity):
    """
    Compute the Jaccard similarity between the two sequences.
    They should contain hashable items.
    The return value is a float between 0 and 1, where 1 means equal,
    and 0 totally different.

    https://en.wikipedia.org/wiki/Jaccard_index
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/distance/jaccard.js
    """
    def __init__(self, qval=1, as_set=False, external=True):
        """Store the ``qval``, ``as_set`` and ``external`` options on the instance."""
        self.qval = qval
        self.as_set = as_set
        self.external = external

    def maximum(self, *sequences):
        """Return 1 regardless of the sequences passed in."""
        # NOTE(review): presumably the upper bound of the 0..1 range mentioned
        # in the class docstring -- confirm against the base class.
        return 1
github life4 / textdistance / textdistance / algorithms / token_based.py View on Github external
if len(sequences) == 2 or self.bias is None:
            result = intersection
            for k, s in zip(ks, sequences):
                result += k * (s - intersection)
            return float(intersection) / result

        s1, s2 = sequences
        alpha, beta = ks
        a_val = min([s1, s2])
        b_val = max([s1, s2])
        c_val = float(intersection + self.bias)
        result = alpha * beta * (a_val - b_val) + b_val * beta
        return c_val / (result + c_val)


class Overlap(_BaseSimilarity):
    """overlap coefficient

    https://en.wikipedia.org/wiki/Overlap_coefficient
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/distance/overlap.js
    """
    def __init__(self, qval=1, as_set=False, external=True):
        """Store the ``qval``, ``as_set`` and ``external`` options on the instance."""
        self.qval = qval
        self.as_set = as_set
        self.external = external

    def maximum(self, *sequences):
        """Return 1 regardless of the sequences passed in."""
        # NOTE(review): presumably the largest value the measure can report --
        # confirm against the base class.
        return 1

    def __call__(self, *sequences):
        result = self.quick_answer(*sequences)
        if result is not None:
github life4 / textdistance / textdistance / algorithms / token_based.py View on Github external
return 1

    def __call__(self, *sequences):
        """Return the counted intersection divided by the smallest sequence count.

        ``quick_answer`` is consulted first; whenever it yields something
        other than ``None`` that value is returned as is.
        """
        shortcut = self.quick_answer(*sequences)
        if shortcut is not None:
            return shortcut

        counters = self._get_counters(*sequences)
        common = self._count_counters(self._intersect_counters(*counters))
        sizes = [self._count_counters(counter) for counter in counters]
        # float() forces true division even when both counts are integers
        return float(common) / min(sizes)


class Cosine(_BaseSimilarity):
    """cosine similarity (Ochiai coefficient)

    https://en.wikipedia.org/wiki/Cosine_similarity
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/distance/cosine.js
    """
    def __init__(self, qval=1, as_set=False):
        """Store the ``qval`` and ``as_set`` options on the instance."""
        self.qval = qval
        self.as_set = as_set

    def maximum(self, *sequences):
        """Return 1 regardless of the sequences passed in."""
        # NOTE(review): presumably the largest value the measure can report --
        # confirm against the base class.
        return 1

    def __call__(self, *sequences):
        result = self.quick_answer(*sequences)
        if result is not None:
            return result
github life4 / textdistance / textdistance / algorithms / sequence_based.py View on Github external
try:
    import numpy
except ImportError:
    from array import array
    numpy = None


__all__ = [
    'lcsseq', 'lcsstr', 'ratcliff_obershelp',
    'LCSSeq', 'LCSStr', 'RatcliffObershelp',
]


class LCSSeq(_BaseSimilarity):
    """longest common subsequence similarity

    https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
    """
    def __init__(self, qval=1, test_func=None):
        """Store ``qval`` and the item comparison callable on the instance.

        When ``test_func`` is omitted (or falsy) ``self._ident`` is used
        instead.
        """
        self.qval = qval
        if test_func:
            self.test_func = test_func
        else:
            # No custom comparison supplied: fall back to the default check.
            # NOTE(review): ``_ident`` is presumably inherited from the base
            # class -- confirm.
            self.test_func = self._ident

    def _dynamic(self, seq1, seq2):
        """
        https://github.com/chrislit/abydos/blob/master/abydos/distance/_lcsseq.py
        http://www.dis.uniroma1.it/~bonifaci/algo/LCSSEQ.py
        http://rosettacode.org/wiki/Longest_common_subsequence#Dynamic_Programming_8
        """
        if numpy:
            lengths = numpy.zeros((len(seq1) + 1, len(seq2) + 1), dtype=numpy.int)
github life4 / textdistance / textdistance / algorithms / vector_based.py View on Github external
result = reduce(numpy.dot, sequences)
        for sm in ssm:
            result /= numpy.sqrt(numpy.dot(sm, sm))
        return result

    def _pure(self, *sequences):
        """Code path used by ``__call__`` when ``numpy`` is falsy; not implemented."""
        raise NotImplementedError

    def __call__(self, *sequences):
        """Delegate to ``_numpy`` when the module-level ``numpy`` name is truthy,
        otherwise to ``_pure``, and return that method's result.
        """
        backend = self._numpy if numpy else self._pure
        return backend(*sequences)


class Kulsinski(_BaseSimilarity):
    """Placeholder for the Kulsinski measure; calling an instance is not implemented."""
    def __call__(self, s1, s2):
        # No implementation is provided here: every call raises.
        raise NotImplementedError
github life4 / textdistance / textdistance / algorithms / edit_based.py View on Github external
return weight
        tmp = float(common_chars - i - 1) / (s1_len + s2_len - i * 2 + 2)
        weight += (1.0 - weight) * tmp
        return weight


class Jaro(JaroWinkler):
    """``JaroWinkler`` preconfigured with ``winklerize=False``.

    The remaining constructor options are forwarded to the parent unchanged.
    """

    def __init__(self, long_tolerance=False, qval=1, external=True):
        parent_options = dict(
            long_tolerance=long_tolerance,
            winklerize=False,  # the only setting this subclass pins
            qval=qval,
            external=external,
        )
        super(Jaro, self).__init__(**parent_options)


class NeedlemanWunsch(_BaseSimilarity):
    """
    Computes the Needleman-Wunsch measure between two strings.
    The Needleman-Wunsch generalizes the Levenshtein distance and considers global
    alignment between two strings. Specifically, it is computed by assigning
    a score to each alignment between two input strings and choosing the
    score of the best alignment, that is, the maximal score.
    An alignment between two strings is a set of correspondences between the
    characters of between them, allowing for gaps.

    https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm
    """
    positive = False

    def __init__(self, gap_cost=1.0, sim_func=None, qval=1, external=True):
        self.qval = qval
        self.gap_cost = gap_cost
github life4 / textdistance / textdistance / algorithms / bag_based.py View on Github external
from .base import BaseSimilarity as _BaseSimilarity


__all__ = ['bag']


class Bag(_BaseSimilarity):
    """Bag measure: the counted intersection of the given sequences.

    Each step is delegated to a helper that is not defined in this class
    (``_get_counters``, ``_intersect_counters``, ``_count_counters``).
    NOTE(review): presumably these are inherited from the base class and the
    result is an integer count of shared items -- confirm.
    """
    def __call__(self, *sequences):
        counters = self._get_counters(*sequences)
        common = self._intersect_counters(*counters)
        return self._count_counters(common)


bag = Bag()