How to use the datasketch.MinHashLSH class in datasketch

To help you get started, we’ve selected a few datasketch.MinHashLSH examples based on popular ways it is used in public projects.


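Before diving into the project snippets below, here is a minimal, self-contained sketch of the core MinHashLSH workflow: build a MinHash for each set, insert the MinHashes into the index under a key, then query with another MinHash to get back the keys of likely-similar sets. The set contents and key names here are purely illustrative.

from datasketch import MinHash, MinHashLSH

# Two small example token sets; in practice these would be shingles or q-grams.
set1 = {'minhash', 'lsh', 'jaccard', 'similarity', 'estimation'}
set2 = {'minhash', 'lsh', 'jaccard', 'similarity', 'search'}

m1 = MinHash(num_perm=128)
m2 = MinHash(num_perm=128)
for d in set1:
    m1.update(d.encode('utf8'))
for d in set2:
    m2.update(d.encode('utf8'))

# Index tuned for a Jaccard threshold of 0.5, using 128 permutation functions.
lsh = MinHashLSH(threshold=0.5, num_perm=128)
lsh.insert('set1', m1)

# Returns the keys of indexed sets whose estimated Jaccard similarity
# with m2 is likely to exceed the threshold.
print(lsh.query(m2))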
github TeamHG-Memex / undercrawler / scripts / crawl_stats.py
def print_stats(
        f, show=None, skip_unique=False, max_int_value=5, duration_limit=None,
        print_duplicates=False, print_urls=False, limit=None):
    stats = Counter()
    if not skip_unique:
        lsh = MinHashLSH(threshold=0.9, num_perm=128)
        too_common = get_too_common_shingles(f, limit=1000)
    urls = {}
    min_timestamp = max_timestamp = None
    for i, item in enumerate(item_reader(f, limit=limit)):
        if print_urls:
            print(item['url'])
        content_type = item.get('content_type', 'missing')
        stats.update([
            'content_type: ' + content_type,
            'content_type[0]: ' + content_type.split('/')[0]])
        if min_timestamp is None:
            min_timestamp = item['timestamp']
        max_timestamp = item['timestamp']
        if duration_limit and \
                (max_timestamp - min_timestamp) / 1000 > duration_limit:
            break
github fake-name / wlnupdates / util / db_organize.py
	altnid_sid_dict  = dict([(tmp[0], tmp[1]) for tmp in altn])
	altnid_name_dict = dict([(tmp[0], tmp[2]) for tmp in altn])
	sid_sname_dict   = dict([(tmp[1], tmp[3]) for tmp in altn])

	sid_altnid_dict = {}
	for nid, sid in altnid_sid_dict.items():
		sid_altnid_dict.setdefault(sid, [])
		sid_altnid_dict[sid].append(nid)


	print("Have %s altnames for %s series" % (len(altnid_sid_dict), len(sid_altnid_dict)))

	perms = 512
	gram_sz = 3
	minhashes = {}
	lsh = MinHashLSH(threshold=SIMILARITY_RATIO, num_perm=perms)

	print("Building lsh minhash data structure")
	with ProcessPoolExecutor(max_workers=8) as ex:
		print("Submitting jobs")
		futures = [(key, ex.submit(minhash_str, content, perms, gram_sz))
				for
					key, content
				in
					altnid_name_dict.items()
				if
					len(content) >= 5
			]

		print("Consuming futures")
		for key, future in tqdm.tqdm(futures):
			minhash = future.result()
github usc-isi-i2 / rltk / rltk / blocking / _minhash_lsh.py
        # Create minhashes
        minhashes = {}
        for rid in records:
            m = MinHash(num_perm=self._num_perm)
            for d in records[rid]:
                qgrams = set(self.nt.basic(d, 2))
                for gram in qgrams:
                    m.update(gram.encode('utf-8'))
            minhashes[rid] = m
        
        # Create LSH instance and add min hashes
        if self._bands == MinHashLSHRecordDeduplication.BANDS and self._rows == MinHashLSHRecordDeduplication.ROWS:
            lsh = MinHashLSH(threshold=self._threshold,num_perm=self._num_perm)
        else:
            lsh = MinHashLSH(num_perm=self._num_perm, params=(self._bands, self._rows))
            
        max_blocks = []
        for rid in records:
            lsh.insert(rid, minhashes[rid])
            max_blocks.append(rid)
        
        # Generate blocks
        while max_blocks:
            key = max_blocks[0]
            bucket = lsh.query(minhashes[key])
            for rid in bucket:
                if rid in max_blocks:
                    max_blocks.remove(rid)
                indexer["b"+str(self._block_index)].append(rid)
            self._block_index += 1
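As the rltk snippet above shows, MinHashLSH can be constructed in two ways: from a Jaccard threshold, letting datasketch derive the band/row parameters, or with an explicit params=(bands, rows) tuple, where bands * rows must not exceed num_perm. A brief sketch of both constructors (the numbers are illustrative):

from datasketch import MinHashLSH

# Let the library pick the banding for a target Jaccard threshold of 0.7.
lsh_auto = MinHashLSH(threshold=0.7, num_perm=128)

# Or pin the banding explicitly: 32 bands of 4 rows each (32 * 4 <= 128).
lsh_manual = MinHashLSH(num_perm=128, params=(32, 4))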
github TeamHG-Memex / undercrawler / scripts / analyze_possible_duplicates.py
def analyze_file(name, f, verbose=False):
    urls = []
    Doc = namedtuple('Doc', ['item', 'min_hash'])
    documents = {} # key -> Doc
    lsh = MinHashLSH(threshold=0.9, num_perm=128)
    too_common = get_too_common_shingles(f, name, limit=300)
    for i, item in enumerate(item_reader(f, name)):
        urls.append(item['url'])
        min_hash = get_min_hash(item['extracted_text'], too_common)
        key = 'item_{}'.format(i)
        item = {'url': item['url']}
        documents[key] = Doc(item, min_hash)
        if key in lsh:
            lsh.remove(key)
        lsh.insert(key, min_hash)
    paths = [''.join([p.netloc, p.path]) for p in map(urlsplit, urls)]
    duplicates = get_duplicates(lsh, documents, verbose=verbose)
    print(name.ljust(40), '\t'.join(map(str, [
        len(urls), len(set(urls)), len(set(paths)),
        n_unique(documents, duplicates),
        ])))
github cangyeone / GeophysicsResearch / WaveReconize / fgpoint / mysimhash_t.py
m1 = MinHash(num_perm=128)
m2 = MinHash(num_perm=128)
m3 = MinHash(num_perm=128)

for d in data1:
    m1.update(d.encode('utf8'))
for d in data2:
    m2.update(d.encode('utf8'))
for d in data3:
    m3.update(d.encode('utf8'))
print(m1.hashvalues)
print(m2.hashvalues)
print(m3.hashvalues)
import numpy as np
print(np.shape(m1.hashvalues))
# Create a MinHashLSH index optimized for Jaccard threshold 0.5,
# that accepts MinHash objects with 128 permutation functions
lsh = MinHashLSH(threshold=0.5, num_perm=128)

# Insert m2 and m3 into the index
lsh.insert("m2", m2)
lsh.insert("m3", m3)

# Check for membership using the key
print("m2" in lsh)
print("m3" in lsh)

# Using m1 as the query, retrieve the keys of the qualifying datasets
result = lsh.query(m1)
print("Candidates with Jaccard similarity > 0.5", result)

# Remove key from lsh
lsh.remove("m2")
github clhchtcjj / BiNE / model / lsh.py
def construct_lsh(obj_dict):
    lsh_0 = MinHashLSH(threshold=0, num_perm=128, params=None)
    lsh_5 = MinHashLSH(threshold=0.6, num_perm=128, params=None)
    # forest = MinHashLSHForest(num_perm=128)
    keys = list(obj_dict.keys())
    values = list(obj_dict.values())
    ms = []
    for i in range(len(keys)):
        temp = MinHash(num_perm=128)
        for d in values[i]:
            temp.update(d.encode('utf8'))
        ms.append(temp)
        lsh_0.insert(keys[i], temp)
        lsh_5.insert(keys[i], temp)
    return lsh_0,lsh_5, keys, ms
github ekzhu / datasketch / benchmark / lsh_benchmark.py
def benchmark_lsh(num_perm, threshold, index_data, query_data):
    print("Building LSH index")
    lsh = MinHashLSH(threshold, num_perm)
    for key, minhash in zip(index_data.keys, index_data.minhashes[num_perm]):
        lsh.insert(key, minhash)
    print("Querying")
    times = []
    results = []
    for qs, minhash in zip(query_data.sets, query_data.minhashes[num_perm]):
        start = time.perf_counter()
        result = lsh.query(minhash)
        duration = time.perf_counter() - start
        times.append(duration)
        results.append(sorted([[key, _compute_jaccard(qs, index_data.sets[key])]
                               for key in result],
                              key=lambda x : x[1], reverse=True))
    return times, results
github TeamHG-Memex / MaybeDont / maybedont / predict.py
        for each page.
        :param storage_config: configuration for a redis backend to persist
        minhashes in. Using this backend makes DupePredictor instances
        persistent across restarts. The configuration format is:
        storage_config={'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}}.
        See https://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale
        """
        self.jaccard_threshold = jaccard_threshold
        self.num_perm = num_perm
        self.storage_config = storage_config
        if storage_config:
            self.lsh = MinHashLSH(
                threshold=self.jaccard_threshold, num_perm=self.num_perm,
                storage_config=self.storage_config)
        else:
            self.lsh = MinHashLSH(
                threshold=self.jaccard_threshold, num_perm=self.num_perm)
        self.too_common_shingles = set()
        if texts_sample:
            self.too_common_shingles = get_too_common_shingles(texts_sample)

        self.seen_urls = {}  # url: URLMeta
        self.urls_by_path = defaultdict(set)  # path: {url}
        self.urls_by_path_q = defaultdict(set)  # (path, q): {url}
        self.urls_by_path_qwp = defaultdict(set)  # (path, param, q): {url}
        self.params_by_path = defaultdict(set)  # path: {param}
        self.param_values = defaultdict(set)  # (path, param): {value}

        # Duplicate hypotheses:
        # (1) All items with same path are duplicates. Key is (path,)
        self.path_dupstats = defaultdict(DupStat)
        # (2) All items with same path that differ only in given param are
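The docstring above describes datasketch's storage_config option, which lets the LSH index keep its hash tables in Redis so the index persists across process restarts. A minimal sketch of that constructor call, assuming a Redis server is reachable at the host and port shown in the docstring and the redis Python package is installed:

from datasketch import MinHashLSH

# Store the LSH hash tables in Redis instead of in-process memory.
lsh = MinHashLSH(
    threshold=0.9, num_perm=128,
    storage_config={'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}},
)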
github TeamHG-Memex / undercrawler / scripts / analyze_possible_duplicates.py
def learn_duplicates(name, f, verbose=False):
    print(name)
    logging.basicConfig(level=logging.DEBUG)
    texts_sample = [
        item['extracted_text'] for item in item_reader(f, name, limit=300)]
    dupe_predictor = DupePredictor(texts_sample)

    lsh = MinHashLSH(threshold=0.9, num_perm=128)  # separate from dupe_predictor
    too_common_shingles = dupe_predictor.too_common_shingles
    threshold = 0.98
    y_pred, y_true = [], []
    def _report_pr():
        tp = sum(p > threshold and d for p, d in zip(y_pred, y_true))
        fp = sum(p > threshold and not d for p, d in zip(y_pred, y_true))
        fn = sum(p < threshold and d for p, d in zip(y_pred, y_true))
        n_dup = tp + fn
        print('precision: %.3f, recall %.3f at %.2f threshold '
                '(%d duplicates)' % (
            tp / (tp + fp) if tp else 0.,
            tp / n_dup if n_dup else 0., threshold, n_dup))
    for i, item in enumerate(item_reader(f, name)):
        dupe_prob = dupe_predictor.get_dupe_prob(item['url'])
        y_pred.append(dupe_prob)
        min_hash = get_min_hash(item['extracted_text'], too_common_shingles)