Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
dist = self.test_func(s1[r - 1], s2[c - 1])
edit = prev[c - 1] + (not dist)
cur[c] = min(edit, deletion, insertion)
return cur[-1]
def __call__(self, s1, s2):
s1, s2 = self._get_sequences(s1, s2)
result = self.quick_answer(s1, s2)
if result is not None:
return result
return self._cicled(s1, s2)
class DamerauLevenshtein(_Base):
    """
    Absolute Damerau-Levenshtein distance between two sequences.

    The distance is the smallest number of edit operations needed to turn
    one sequence into the other. Allowed operations:

    * deletion:      ABC -> BC, AC, AB
    * insertion:     ABC -> ABCD, EABC, AEBC..
    * substitution:  ABC -> ABE, ADC, FBC..
    * transposition: ABC -> ACB, BAC

    https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
    """

    def __init__(self, qval=1, test_func=None, external=True):
        """Store the configuration on the instance.

        ``test_func`` falls back to ``self._ident`` when it is falsy.
        """
        self.qval = qval
        if test_func:
            self.test_func = test_func
        else:
            self.test_func = self._ident
        self.external = external
result = sum(e ** self.p for e in result)
return result ** (1.0 / self.p)
def __call__(self, s1, s2):
if numpy:
return self._numpy(s1, s2)
else:
return self._pure(s1, s2)
class Manhattan(_Base):
    """Placeholder for the Manhattan distance; calling an instance is not implemented."""

    def __call__(self, s1, s2):
        """Always raise ``NotImplementedError``."""
        raise NotImplementedError
class Euclidean(_Base):
    """Euclidean distance between two numeric sequences.

    With ``squared=True`` the sum of squared differences is returned
    without taking the square root.
    """

    def __init__(self, squared=False):
        """Remember whether the squared distance should be returned."""
        self.squared = squared

    def _numpy(self, s1, s2):
        """Compute the distance with numpy.

        The inputs are converted with ``numpy.asarray`` and must be
        subtractable from each other. The sum of squares is taken
        element-wise instead of through the deprecated ``numpy.matrix``
        class, which gives the same value for 1-D vectors and stays
        well-defined for other shapes.
        """
        diff = numpy.asarray(s1) - numpy.asarray(s2)
        result = (diff * diff).sum()
        if self.squared:
            return result
        return numpy.sqrt(result)

    def _pure(self, s1, s2):
        """Pure-Python fallback; not implemented."""
        raise NotImplementedError
def __call__(self, s1, s2):
self.test_func = test_func or self._ident
self.truncate = truncate
self.external = external
def __call__(self, *sequences):
sequences = self._get_sequences(*sequences)
result = self.quick_answer(*sequences)
if result is not None:
return result
_zip = zip if self.truncate else zip_longest
return sum([not self.test_func(*es) for es in _zip(*sequences)])
class Levenshtein(_Base):
    """
    Absolute Levenshtein distance between two sequences.

    The distance is the smallest number of edit operations needed to turn
    one sequence into the other. Allowed operations:

    * deletion:     ABC -> BC, AC, AB
    * insertion:    ABC -> ABCD, EABC, AEBC..
    * substitution: ABC -> ABE, ADC, FBC..

    https://en.wikipedia.org/wiki/Levenshtein_distance
    TODO: https://gist.github.com/kylebgorman/1081951/9b38b7743a3cb5167ab2c6608ac8eea7fc629dca
    """

    def __init__(self, qval=1, test_func=None, external=True):
        """Store the configuration on the instance.

        ``test_func`` falls back to ``self._ident`` when it is falsy.
        """
        self.qval = qval
        if test_func:
            self.test_func = test_func
        else:
            self.test_func = self._ident
        self.external = external
class Postfix(Prefix):
    """Postfix similarity.

    Every input is reversed, the parent class's ``__call__`` is applied to
    the reversed inputs, and its result is reversed back. The return type
    is chosen from the type of the first input: text is joined into a
    string, ``bytes`` is joined into bytes, anything else becomes a list.
    """

    def __call__(self, *sequences):
        first = sequences[0]
        # Use a distinct loop name: on Python 2 a list comprehension leaks
        # its loop variable, which would silently rebind the sequence that
        # the type checks below rely on.
        flipped = [reversed(seq) for seq in sequences]
        result = reversed(super(Postfix, self).__call__(*flipped))
        if isinstance(first, string_types):
            return ''.join(result)
        if isinstance(first, bytes):
            return b''.join(result)
        return list(result)
class Length(_Base):
    """Length distance: the gap between the longest and the shortest input."""

    def __call__(self, *sequences):
        """Return ``max(len) - min(len)`` over all given sequences."""
        sizes = [len(seq) for seq in sequences]
        longest = max(sizes)
        shortest = min(sizes)
        return longest - shortest
class Identity(_BaseSimilarity):
    """Identity similarity.

    ``maximum`` is always 1; calling an instance returns the result of
    ``self._ident(*sequences)`` converted to ``int``.
    """

    def maximum(self, *sequences):
        """Return the upper bound of the similarity, which is 1."""
        return 1

    def __call__(self, *sequences):
        """Return ``int(self._ident(*sequences))``."""
        verdict = self._ident(*sequences)
        return int(verdict)
for chars in zip(*sequences):
if not self._ident(*chars):
new_sequences.append(chars)
new_sequences = map(list, zip(*new_sequences))
# update sequences
ss = zip_longest(new_sequences, sequences, fillvalue=list())
sequences = [s1 + s2[minlen:] for s1, s2 in ss]
# update lengths
lengths = list(map(len, sequences))
if not lengths:
return max_length
return max_length - max(lengths)
class Editex(_Base):
    """
    https://anhaidgroup.github.io/py_stringmatching/v0.3.x/Editex.html
    """
    # Letter groups, one frozenset per group; a letter may appear in more
    # than one group (e.g. 'P', 'C', 'S', 'Z').
    letter_groups = tuple(map(frozenset, (
        'AEIOUY',
        'BP',
        'CKQ',
        'DT',
        'LR',
        'MN',
        'GJ',
        'FPV',
        'SXZ',
        'CSZ',
    )))
    # Every letter that occurs in at least one group above.
    all_letters = frozenset('AEIOUYBPCKQDTLRMNGJFVSXZ')
# Names exported by a star-import of this module.
# NOTE(review): the CamelCase entries look like classes and the snake_case
# entries like module-level callables; confirm against the definitions,
# which are not visible in this part of the file.
__all__ = [
    'ArithNCD', 'LZMANCD', 'BZ2NCD', 'RLENCD', 'BWTRLENCD', 'ZLIBNCD',
    'SqrtNCD', 'EntropyNCD',
    'bz2_ncd', 'lzma_ncd', 'arith_ncd', 'rle_ncd', 'bwtrle_ncd', 'zlib_ncd',
    'sqrt_ncd', 'entropy_ncd',
]
# Text types for isinstance checks: ``unicode`` only exists on Python 2.
try:
    unicode
except NameError:
    string_types = (str, )
else:
    string_types = (str, unicode)
class _NCDBase(_Base):
    """Normalized compression distance (NCD)

    https://en.wikipedia.org/wiki/Normalized_compression_distance#Normalized_compression_distance
    """
    # Class-level default; ``__init__`` overrides it per instance.
    qval = 1

    def __init__(self, qval=1):
        """Store ``qval`` on the instance."""
        self.qval = qval

    def maximum(self, *sequences):
        """Return the upper bound of the distance, which is 1."""
        return 1

    def _get_size(self, data):
        """Return the length of ``self._compress(data)``."""
        compressed = self._compress(data)
        return len(compressed)
def __call__(self, *sequences):
class Chebyshev(_Base):
    """Chebyshev distance: the largest absolute element-wise difference."""

    def _numpy(self, s1, s2):
        """Compute the distance on ``numpy.asarray`` views of the inputs."""
        left = numpy.asarray(s1)
        right = numpy.asarray(s2)
        gaps = abs(left - right)
        return max(gaps)

    def _pure(self, s1, s2):
        """Compute the distance with plain Python, pairing items via ``zip``."""
        gaps = [abs(a - b) for a, b in zip(s1, s2)]
        return max(gaps)

    def __call__(self, s1, s2):
        """Use ``_numpy`` when the module-level ``numpy`` is truthy, else ``_pure``."""
        if not numpy:
            return self._pure(s1, s2)
        return self._numpy(s1, s2)
class Minkowski(_Base):
    """Minkowski distance of order ``p`` with a multiplicative ``weight``.

    Each absolute difference is multiplied by ``weight`` before being
    raised to the power ``p``.
    """

    def __init__(self, p=1, weight=1):
        """Store ``p`` and ``weight``.

        Raises:
            ValueError: if ``p`` is smaller than 1.
        """
        if p < 1:
            raise ValueError("p must be at least 1")
        self.weight = weight
        self.p = p

    def _numpy(self, s1, s2):
        """Compute the distance on ``numpy.asarray`` views of the inputs."""
        left = numpy.asarray(s1)
        right = numpy.asarray(s2)
        powered = (self.weight * abs(left - right)) ** self.p
        total = powered.sum()
        return total ** (1.0 / self.p)

    def _pure(self, s1, s2):
        """Compute the distance with plain Python, pairing items via ``zip``."""
        total = 0
        for e1, e2 in zip(s1, s2):
            total += (self.weight * abs(e1 - e2)) ** self.p
        return total ** (1.0 / self.p)
result = (self.weight * abs(s1 - s2)) ** self.p
return result.sum() ** (1.0 / self.p)
def _pure(self, s1, s2):
result = (self.weight * abs(e1 - e2) for e1, e2 in zip(s1, s2))
result = sum(e ** self.p for e in result)
return result ** (1.0 / self.p)
def __call__(self, s1, s2):
if numpy:
return self._numpy(s1, s2)
else:
return self._pure(s1, s2)
class Manhattan(_Base):
    """Placeholder for the Manhattan distance; calling an instance is not implemented."""

    def __call__(self, s1, s2):
        """Always raise ``NotImplementedError``."""
        raise NotImplementedError
class Euclidean(_Base):
    """Euclidean distance between two numeric sequences.

    With ``squared=True`` the sum of squared differences is returned
    without taking the square root.
    """

    def __init__(self, squared=False):
        """Remember whether the squared distance should be returned."""
        self.squared = squared

    def _numpy(self, s1, s2):
        """Compute the distance with numpy.

        The inputs are converted with ``numpy.asarray`` and must be
        subtractable from each other. The sum of squares is taken
        element-wise instead of through the deprecated ``numpy.matrix``
        class, which gives the same value for 1-D vectors and stays
        well-defined for other shapes.
        """
        diff = numpy.asarray(s1) - numpy.asarray(s2)
        result = (diff * diff).sum()
        if self.squared:
            return result
        return numpy.sqrt(result)
def _count_counters(self, counter):
"""Return all elements count from Counter
"""
if getattr(self, 'as_set', False):
return len(set(counter))
else:
return sum(counter.values())
def __repr__(self):
return '{name}({data})'.format(
name=type(self).__name__,
data=self.__dict__,
)
class BaseSimilarity(Base):
    """Base for measures whose call result is a similarity."""

    def distance(self, *sequences):
        """Return ``maximum(*sequences)`` minus ``similarity(*sequences)``."""
        upper = self.maximum(*sequences)
        score = self.similarity(*sequences)
        return upper - score

    def similarity(self, *sequences):
        """Return the result of calling the instance on ``sequences``."""
        return self(*sequences)
def quick_answer(self, *sequences):
if not sequences:
return self.maximum(*sequences)
if len(sequences) == 1:
return self.maximum(*sequences)
if self._ident(*sequences):
return self.maximum(*sequences)
if not all(sequences):
return 0
# try get answer from external libs
result = (q * q.T).sum()
if self.squared:
return result
return numpy.sqrt(result)
def _pure(self, s1, s2):
raise NotImplementedError
def __call__(self, s1, s2):
if numpy:
return self._numpy(s1, s2)
else:
return self._pure(s1, s2)
class Mahalanobis(_Base):
    """Placeholder for the Mahalanobis distance; calling an instance is not implemented."""

    def __call__(self, s1, s2):
        """Always raise ``NotImplementedError``."""
        raise NotImplementedError
class Correlation(_BaseSimilarity):
def _numpy(self, *sequences):
sequences = [numpy.asarray(s) for s in sequences]
ssm = [s - s.mean() for s in sequences]
result = reduce(numpy.dot, sequences)
for sm in ssm:
result /= numpy.sqrt(numpy.dot(sm, sm))
return result
def _pure(self, *sequences):
raise NotImplementedError