def localitySensitiveHashing(self, colName, blockSize=6, method="levenshtein", threshold=0.81):
    """
    colName: the column to be clustered
    blockSize: size of each block used for blocking
    method: string similarity measure used to compare values
    threshold: minimum normalized similarity for two values to be treated as the same
        (illustrated after this snippet)
    """
    self._tf._assert_cols_in_df(columns_provided=[colName], columns_df=self._tf._df.columns)
    rdd = self._tf._df.select(["id", colName]).rdd.map(list)
    methodDict = {
        # Edit based
        "hamming": textdistance.hamming.normalized_similarity,
        "mlipns": textdistance.mlipns.normalized_similarity,
        "levenshtein": textdistance.levenshtein.normalized_similarity,
        # Token based
        "jaccard": textdistance.jaccard.normalized_similarity,
        "overlap": textdistance.overlap.normalized_similarity,
        "cosine": textdistance.cosine.normalized_similarity,
        # Sequence based
        "lcsseq": textdistance.lcsseq.normalized_similarity,
        "lcsstr": textdistance.lcsstr.normalized_similarity,
        # Phonetic based
        "mra": textdistance.mra.normalized_similarity,
    }
    try:
        sim = methodDict[method]
    except KeyError:
        print("Warning: %s is not a valid method, falling back to levenshtein." % method)
        sim = methodDict["levenshtein"]
    def other_distance(s1, s2):
        # Normalized similarities between s1 and s2 under the remaining textdistance measures.
        return [textdistance.hamming.normalized_similarity(s1, s2),
                textdistance.mlipns.normalized_similarity(s1, s2),
                textdistance.damerau_levenshtein.normalized_similarity(s1, s2),
                textdistance.strcmp95.normalized_similarity(s1, s2),
                textdistance.needleman_wunsch.normalized_similarity(s1, s2),
                textdistance.gotoh.normalized_similarity(s1, s2),
                textdistance.smith_waterman.normalized_similarity(s1, s2),
                textdistance.ratcliff_obershelp.normalized_similarity(s1, s2)]
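A minimal illustration (not part of the original snippet) of how the method and threshold parameters relate: the normalized similarities above all return values in [0, 1], and the threshold is the cutoff above which two strings are considered the same. The strings here are made-up examples.

import textdistance

sim = textdistance.levenshtein.normalized_similarity
sim("brooklyn", "brooklin")   # 0.875 -> above threshold=0.81, the values would be merged
sim("brooklyn", "manhattan")  # ~0.11 -> below the threshold, kept as distinct values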