Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# app
from .base import Base as _Base, BaseSimilarity as _BaseSimilarity
__all__ = [
'Prefix', 'Postfix', 'Length', 'Identity', 'Matrix',
'prefix', 'postfix', 'length', 'identity', 'matrix',
]
# Tuple of types treated as "text" by the isinstance checks in this module.
try:
    # Python 2: ``unicode`` is a builtin separate from ``str``.
    string_types = (str, unicode)
except NameError:
    # Python 3: the name ``unicode`` is undefined, so only ``str`` applies.
    string_types = (str, )
class Prefix(_BaseSimilarity):
"""prefix similarity
"""
def __init__(self, qval=1, sim_test=None):
self.qval = qval
self.sim_test = sim_test or self._ident
def __call__(self, *sequences):
if not sequences:
return 0
sequences = self._get_sequences(*sequences)
test = lambda seq: self.sim_test(*seq) # noQA
result = [c[0] for c in takewhile(test, zip(*sequences))]
s = sequences[0]
if isinstance(s, string_types):
return ''.join(result)
q_mat[i - 1, j - 1] + sim_val,
)
p_mat[i, j] = max(
d_mat[i - 1, j] - self.gap_open,
p_mat[i - 1, j] - self.gap_ext,
)
q_mat[i, j] = max(
d_mat[i, j - 1] - self.gap_open,
q_mat[i, j - 1] - self.gap_ext,
)
i, j = (n - 1 for n in d_mat.shape)
return max(d_mat[i, j], p_mat[i, j], q_mat[i, j])
class StrCmp95(_BaseSimilarity):
    """strcmp95 similarity

    Reference implementation:
    http://cpansearch.perl.org/src/SCW/Text-JaroWinkler-0.1/strcmp95.c
    """

    # Table of character pairs used by the algorithm.
    # NOTE(review): presumably the "similar characters" table of strcmp95.c;
    # the code consuming it is not visible here -- confirm.
    sp_mx = (
        ('A', 'E'), ('A', 'I'), ('A', 'O'), ('A', 'U'),
        ('B', 'V'), ('E', 'I'), ('E', 'O'), ('E', 'U'),
        ('I', 'O'), ('I', 'U'), ('O', 'U'), ('I', 'Y'),
        ('E', 'Y'), ('C', 'G'), ('E', 'F'), ('W', 'U'),
        ('W', 'V'), ('X', 'K'), ('S', 'Z'), ('X', 'S'),
        ('Q', 'C'), ('U', 'V'), ('M', 'N'), ('L', 'I'),
        ('Q', 'O'), ('P', 'R'), ('I', 'J'), ('2', 'Z'),
        ('5', 'S'), ('8', 'B'), ('1', 'I'), ('1', 'L'),
        ('0', 'O'), ('0', 'Q'), ('C', 'K'), ('G', 'J'),
    )

    def __init__(self, long_strings=False, external=True):
        """Store the two configuration flags on the instance unchanged."""
        self.external = external
        self.long_strings = long_strings
return 1
def __call__(self, *sequences):
    """Return the intersection count divided by the union count.

    If ``self.quick_answer`` yields anything other than ``None`` that value
    is returned as-is and no counting is performed.
    """
    shortcut = self.quick_answer(*sequences)
    if shortcut is not None:
        return shortcut

    counters = self._get_counters(*sequences)
    # Size of what all counters share ...
    shared_size = self._count_counters(self._intersect_counters(*counters))
    # ... over the size of everything they contain together.
    combined_size = self._count_counters(self._union_counters(*counters))
    # float() forces true division even for integer counts.
    return shared_size / float(combined_size)
class Sorensen(_BaseSimilarity):
    """
    Sorensen-Dice measure between sequences of hashable items.

    The original description reads: the return value is a float between
    0 and 1, where 0 means equal, and 1 totally different.
    NOTE(review): that wording describes a distance, yet the class derives
    from ``_BaseSimilarity``; the scoring method is not visible here, so
    confirm which direction the score runs.

    https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/distance/dice.js
    """

    def __init__(self, qval=1, as_set=False):
        """Store ``qval`` and ``as_set`` on the instance unchanged."""
        self.as_set = as_set
        self.qval = qval

    def maximum(self, *sequences):
        """Return the upper bound of the score, which is always 1."""
        return 1
try:
from functools import reduce
except ImportError:
pass
__all__ = [
'Jaccard', 'Sorensen', 'Tversky',
'Overlap', 'Cosine', 'Tanimoto', 'MongeElkan', 'Bag',
'jaccard', 'sorensen', 'tversky', 'sorensen_dice',
'overlap', 'cosine', 'tanimoto', 'monge_elkan', 'bag',
]
class Jaccard(_BaseSimilarity):
    """
    Jaccard similarity between sequences of hashable items.

    As described by the original documentation, the result is a float
    between 0 and 1, where 1 means equal and 0 totally different.

    https://en.wikipedia.org/wiki/Jaccard_index
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/distance/jaccard.js
    """

    def __init__(self, qval=1, as_set=False, external=True):
        """Store ``qval``, ``as_set`` and ``external`` on the instance."""
        self.external = external
        self.as_set = as_set
        self.qval = qval

    def maximum(self, *sequences):
        """Return the upper bound of the score, which is always 1."""
        return 1
if len(sequences) == 2 or self.bias is None:
result = intersection
for k, s in zip(ks, sequences):
result += k * (s - intersection)
return float(intersection) / result
s1, s2 = sequences
alpha, beta = ks
a_val = min([s1, s2])
b_val = max([s1, s2])
c_val = float(intersection + self.bias)
result = alpha * beta * (a_val - b_val) + b_val * beta
return c_val / (result + c_val)
class Overlap(_BaseSimilarity):
    """overlap coefficient

    https://en.wikipedia.org/wiki/Overlap_coefficient
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/distance/overlap.js
    """

    def __init__(self, qval=1, as_set=False, external=True):
        """Store ``qval``, ``as_set`` and ``external`` on the instance."""
        self.qval = qval
        self.as_set = as_set
        self.external = external

    def maximum(self, *sequences):
        """Return the upper bound of the score, which is always 1."""
        return 1

    # A first, truncated ``__call__`` used to precede this one. It returned
    # the constant 1 instead of the quick answer and was shadowed by this
    # definition anyway, so only the effective implementation is kept.
    def __call__(self, *sequences):
        """Return the intersection count divided by the smallest input count.

        If ``self.quick_answer`` yields anything other than ``None`` that
        value is returned directly.
        """
        result = self.quick_answer(*sequences)
        if result is not None:
            return result
        sequences = self._get_counters(*sequences)
        intersection = self._intersect_counters(*sequences)
        intersection = self._count_counters(intersection)
        # Reduce each counter to its size so the smallest can be selected.
        sequences = [self._count_counters(s) for s in sequences]
        # float() forces true division even for integer counts.
        return float(intersection) / min(sequences)
class Cosine(_BaseSimilarity):
"""cosine similarity (Ochiai coefficient)
https://en.wikipedia.org/wiki/Cosine_similarity
https://github.com/Yomguithereal/talisman/blob/master/src/metrics/distance/cosine.js
"""
def __init__(self, qval=1, as_set=False):
self.qval = qval
self.as_set = as_set
def maximum(self, *sequences):
return 1
def __call__(self, *sequences):
result = self.quick_answer(*sequences)
if result is not None:
return result
try:
import numpy
except ImportError:
from array import array
numpy = None
__all__ = [
'lcsseq', 'lcsstr', 'ratcliff_obershelp',
'LCSSeq', 'LCSStr', 'RatcliffObershelp',
]
class LCSSeq(_BaseSimilarity):
"""longest common subsequence similarity
https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
"""
def __init__(self, qval=1, test_func=None):
self.qval = qval
self.test_func = test_func or self._ident
def _dynamic(self, seq1, seq2):
"""
https://github.com/chrislit/abydos/blob/master/abydos/distance/_lcsseq.py
http://www.dis.uniroma1.it/~bonifaci/algo/LCSSEQ.py
http://rosettacode.org/wiki/Longest_common_subsequence#Dynamic_Programming_8
"""
if numpy:
lengths = numpy.zeros((len(seq1) + 1, len(seq2) + 1), dtype=numpy.int)
result = reduce(numpy.dot, sequences)
for sm in ssm:
result /= numpy.sqrt(numpy.dot(sm, sm))
return result
def _pure(self, *sequences):
    """Fallback used when NumPy is unavailable; not implemented."""
    raise NotImplementedError
def __call__(self, *sequences):
    """Delegate to ``_numpy`` when the module-level ``numpy`` is truthy,
    otherwise to ``_pure``, and return whatever the chosen method returns.
    """
    backend = self._numpy if numpy else self._pure
    return backend(*sequences)
class Kulsinski(_BaseSimilarity):
    """Placeholder measure: calling an instance always raises
    ``NotImplementedError``.
    """

    def __call__(self, s1, s2):
        raise NotImplementedError
return weight
tmp = float(common_chars - i - 1) / (s1_len + s2_len - i * 2 + 2)
weight += (1.0 - weight) * tmp
return weight
class Jaro(JaroWinkler):
    """``JaroWinkler`` with ``winklerize`` pinned to ``False``."""

    def __init__(self, long_tolerance=False, qval=1, external=True):
        """Forward the caller's options to the parent initializer unchanged;
        only ``winklerize`` is fixed here.
        """
        options = dict(
            long_tolerance=long_tolerance,
            winklerize=False,
            qval=qval,
            external=external,
        )
        # Two-argument super() keeps the class usable on Python 2 as well.
        super(Jaro, self).__init__(**options)
class NeedlemanWunsch(_BaseSimilarity):
"""
Computes the Needleman-Wunsch measure between two strings.
The Needleman-Wunsch generalizes the Levenshtein distance and considers global
alignment between two strings. Specifically, it is computed by assigning
a score to each alignment between two input strings and choosing the
score of the best alignment, that is, the maximal score.
An alignment between two strings is a set of correspondences between the
characters of between them, allowing for gaps.
https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm
"""
positive = False
def __init__(self, gap_cost=1.0, sim_func=None, qval=1, external=True):
self.qval = qval
self.gap_cost = gap_cost
from .base import BaseSimilarity as _BaseSimilarity
__all__ = ['bag']
class Bag(_BaseSimilarity):
    """Bag distance

    As written, a call returns the count of the elements that the counters
    of all input sequences have in common.
    """

    def __call__(self, *sequences):
        counters = self._get_counters(*sequences)
        shared = self._intersect_counters(*counters)
        return self._count_counters(shared)
bag = Bag()