# How to use textdistance - 10 common examples

## To help you get started, we’ve selected a few textdistance examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

life4 / textdistance / textdistance / algorithms / compression_based.py View on GitHub
``````probs = self._make_probs(data)
start, end = self._get_range(data=data, probs=probs)
output_fraction = Fraction(0, 1)
output_denominator = 1
while not (start <= output_fraction < end):
output_numerator = 1 + ((start.numerator * output_denominator) // start.denominator)
output_fraction = Fraction(output_numerator, output_denominator)
output_denominator *= 2
return output_fraction

def _get_size(self, data):
numerator = self._compress(data).numerator
return math.ceil(math.log(numerator, self.base))

class RLENCD(_NCDBase):
"""Run-length encoding

Compression rewrites every run of equal consecutive items:
a run longer than two becomes the run length followed by the item,
a run of exactly two becomes the item written twice,
a single item is kept unchanged.

https://en.wikipedia.org/wiki/Run-length_encoding
"""

def _compress(self, data):
"""Return the run-length encoded form of ``data`` as one string.

Items are concatenated with ``str`` values, so they are assumed to be
strings (e.g. the characters of a text).
"""
new_data = []
# groupby yields one (item, run) pair per stretch of equal neighbours
for k, g in groupby(data):
# length of the current run
n = len(list(g))
if n > 2:
# long run: count prefix + item, e.g. 'aaaa' -> '4a'
new_data.append(str(n) + k)
elif n == 1:
new_data.append(k)
else:
# n == 2: item repeated twice; presumably because a count prefix
# would not make such a short run any shorter
new_data.append(2 * k)
return ''.join(new_data)``````
life4 / textdistance / run_tests.py View on GitHub
``````import os

try:
import unittest2 as unittest
except ImportError:
import unittest

import textdistance
from textdistance.libraries import prototype

libraries = prototype.clone()

# Switches for the test run. Both variables are looked up without a default,
# so each must be set in the environment (a missing one raises KeyError), and
# only the exact value 'yes' turns a switch on.
CONSTRAINTS, NUMPY = (
    os.environ[variable] == 'yes'
    for variable in ('WITH_CONSTRAINTS', 'WITH_NUMPY')
)

from tests import *  # noQA

if __name__ == '__main__':
unittest.main()``````
life4 / textdistance / textdistance / algorithms / simple.py View on GitHub
``````# app
from .base import Base as _Base, BaseSimilarity as _BaseSimilarity

__all__ = [
'Prefix', 'Postfix', 'Length', 'Identity', 'Matrix',
'prefix', 'postfix', 'length', 'identity', 'matrix',
]

# Types treated as text in isinstance checks. `unicode` exists only on
# Python 2; where the name is missing the tuple holds `str` alone.
string_types = (str, )
try:
    string_types += (unicode, )
except NameError:
    pass

class Prefix(_BaseSimilarity):
"""prefix similarity
"""
def __init__(self, qval=1, sim_test=None):
self.qval = qval
self.sim_test = sim_test or self._ident

def __call__(self, *sequences):
if not sequences:
return 0
sequences = self._get_sequences(*sequences)
test = lambda seq: self.sim_test(*seq)  # noQA
result = [c[0] for c in takewhile(test, zip(*sequences))]

s = sequences[0]
if isinstance(s, string_types):
return ''.join(result)``````
life4 / textdistance / textdistance / algorithms / base.py View on GitHub
``````def _get_sequences(self, *sequences):
"""Prepare sequences for comparison according to ``self.qval``.

qval is falsy (e.g. None): split every text into words with ``split()``.
qval == 1: do not split; the sequences are returned unchanged, which for
text means comparing letter by letter.
any other qval: split every sequence into q-grams with ``find_ngrams``.

Returns a list with one prepared sequence per input, except for
``qval == 1`` where the original tuple of arguments is returned as is.
"""
# by words
if not self.qval:
return [s.split() for s in sequences]
# by chars
if self.qval == 1:
return sequences
# by n-grams
return [find_ngrams(s, self.qval) for s in sequences]``````
life4 / textdistance / textdistance / algorithms / edit_based.py View on GitHub
``````q_mat[i - 1, j - 1] + sim_val,
)
p_mat[i, j] = max(
d_mat[i - 1, j] - self.gap_open,
p_mat[i - 1, j] - self.gap_ext,
)
q_mat[i, j] = max(
d_mat[i, j - 1] - self.gap_open,
q_mat[i, j - 1] - self.gap_ext,
)

i, j = (n - 1 for n in d_mat.shape)
return max(d_mat[i, j], p_mat[i, j], q_mat[i, j])

class StrCmp95(_BaseSimilarity):
"""strcmp95 similarity

http://cpansearch.perl.org/src/SCW/Text-JaroWinkler-0.1/strcmp95.c
"""
sp_mx = (
('A', 'E'), ('A', 'I'), ('A', 'O'), ('A', 'U'), ('B', 'V'), ('E', 'I'),
('E', 'O'), ('E', 'U'), ('I', 'O'), ('I', 'U'), ('O', 'U'), ('I', 'Y'),
('E', 'Y'), ('C', 'G'), ('E', 'F'), ('W', 'U'), ('W', 'V'), ('X', 'K'),
('S', 'Z'), ('X', 'S'), ('Q', 'C'), ('U', 'V'), ('M', 'N'), ('L', 'I'),
('Q', 'O'), ('P', 'R'), ('I', 'J'), ('2', 'Z'), ('5', 'S'), ('8', 'B'),
('1', 'I'), ('1', 'L'), ('0', 'O'), ('0', 'Q'), ('C', 'K'), ('G', 'J'),
)

def __init__(self, long_strings=False, external=True):
self.long_strings = long_strings
self.external = external``````
life4 / textdistance / textdistance / algorithms / token_based.py View on GitHub
``````return 1

def __call__(self, *sequences):
if result is not None:
return result

sequences = self._get_counters(*sequences)               # sets
intersection = self._intersect_counters(*sequences)      # set
intersection = self._count_counters(intersection)        # int
union = self._union_counters(*sequences)                 # set
union = self._count_counters(union)                      # int
return intersection / float(union)

class Sorensen(_BaseSimilarity):
"""
Compute the Sorensen distance between the two sequences.
They should contain hashable items.
The return value is a float between 0 and 1, where 0 means equal,
and 1 totally different.

https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient
https://github.com/Yomguithereal/talisman/blob/master/src/metrics/distance/dice.js
"""
def __init__(self, qval=1, as_set=False):
self.qval = qval
self.as_set = as_set

def maximum(self, *sequences):
return 1``````
life4 / textdistance / textdistance / algorithms / token_based.py View on GitHub
``````try:
from functools import reduce
except ImportError:
pass

__all__ = [
'Jaccard', 'Sorensen', 'Tversky',
'Overlap', 'Cosine', 'Tanimoto', 'MongeElkan', 'Bag',

'jaccard', 'sorensen', 'tversky', 'sorensen_dice',
'overlap', 'cosine', 'tanimoto', 'monge_elkan', 'bag',
]

class Jaccard(_BaseSimilarity):
"""
Compute the Jaccard similarity between the two sequences.
They should contain hashable items.
The return value is a float between 0 and 1, where 1 means equal,
and 0 totally different.

https://en.wikipedia.org/wiki/Jaccard_index
https://github.com/Yomguithereal/talisman/blob/master/src/metrics/distance/jaccard.js
"""
def __init__(self, qval=1, as_set=False, external=True):
self.qval = qval
self.as_set = as_set
self.external = external

def maximum(self, *sequences):
return 1``````
life4 / textdistance / textdistance / algorithms / token_based.py View on GitHub
``````if len(sequences) == 2 or self.bias is None:
result = intersection
for k, s in zip(ks, sequences):
result += k * (s - intersection)
return float(intersection) / result

s1, s2 = sequences
alpha, beta = ks
a_val = min([s1, s2])
b_val = max([s1, s2])
c_val = float(intersection + self.bias)
result = alpha * beta * (a_val - b_val) + b_val * beta
return c_val / (result + c_val)

class Overlap(_BaseSimilarity):
"""overlap coefficient

https://en.wikipedia.org/wiki/Overlap_coefficient
https://github.com/Yomguithereal/talisman/blob/master/src/metrics/distance/overlap.js
"""
def __init__(self, qval=1, as_set=False, external=True):
self.qval = qval
self.as_set = as_set
self.external = external

def maximum(self, *sequences):
return 1

def __call__(self, *sequences):
if result is not None:``````
life4 / textdistance / textdistance / algorithms / token_based.py View on GitHub
``````return 1

def __call__(self, *sequences):
if result is not None:
return result

sequences = self._get_counters(*sequences)                  # sets
intersection = self._intersect_counters(*sequences)         # set
intersection = self._count_counters(intersection)           # int
sequences = [self._count_counters(s) for s in sequences]    # ints

return float(intersection) / min(sequences)

class Cosine(_BaseSimilarity):
"""cosine similarity (Ochiai coefficient)

https://en.wikipedia.org/wiki/Cosine_similarity
https://github.com/Yomguithereal/talisman/blob/master/src/metrics/distance/cosine.js
"""
def __init__(self, qval=1, as_set=False):
self.qval = qval
self.as_set = as_set

def maximum(self, *sequences):
return 1

def __call__(self, *sequences):
if result is not None:
return result``````
life4 / textdistance / textdistance / algorithms / sequence_based.py View on GitHub
``````try:
import numpy
except ImportError:
from array import array
numpy = None

__all__ = [
'lcsseq', 'lcsstr', 'ratcliff_obershelp',
'LCSSeq', 'LCSStr', 'RatcliffObershelp',
]

class LCSSeq(_BaseSimilarity):
"""longest common subsequence similarity

https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
"""
def __init__(self, qval=1, test_func=None):
self.qval = qval
self.test_func = test_func or self._ident

def _dynamic(self, seq1, seq2):
"""
https://github.com/chrislit/abydos/blob/master/abydos/distance/_lcsseq.py
http://www.dis.uniroma1.it/~bonifaci/algo/LCSSEQ.py
http://rosettacode.org/wiki/Longest_common_subsequence#Dynamic_Programming_8
"""
if numpy:
lengths = numpy.zeros((len(seq1) + 1, len(seq2) + 1), dtype=numpy.int)``````

## textdistance

Compute the distance between two texts.

License: MIT