How to use datasketch - 10 common examples

To help you get started, we’ve selected a few datasketch examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes — no build needed — and fix issues immediately.

github ekzhu / datasketch / test / aio / test_lsh_mongo.py View on Github external
async def test__H_mongo(self):
        """
        Verify that _H produces hash keys of one consistent byte length
        when the concatenated hash values all have the same size.
        """
        generator = WeightedMinHashGenerator(100, sample_size=128)
        for _ in range(2, generator.sample_size + 1, 16):
            weighted_hash = generator.minhash(np.random.randint(1, 99999999, 100))
            async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                       num_perm=128) as lsh:
                await lsh.insert("m", weighted_hash)
                # Fetch every hashtable's key set concurrently.
                pending = [table.keys() for table in lsh.hashtables]
                key_sets = await asyncio.gather(*pending)
                lengths = [len(keys) for keys in key_sets]
                self.assertTrue(all(length == lengths[0] for length in lengths))
github ekzhu / datasketch / test / aio / test_lsh_mongo.py View on Github external
# NOTE(review): fragment — the enclosing test method's header and the
# definitions of mg, m1 and lsh were cut off by the page extraction.
m2 = mg.minhash(np.random.uniform(1, 10, 10))
            await lsh.insert("a", m1)
            await lsh.insert("b", m2)
            # Both inserted keys must appear among the buckets of every hashtable.
            for t in lsh.hashtables:
                self.assertTrue(await t.size() >= 1)
                items = []
                for H in await t.keys():
                    items.extend(await t.get(H))
                self.assertTrue("a" in items)
                self.assertTrue("b" in items)
            self.assertTrue(await lsh.has_key("a"))
            self.assertTrue(await lsh.has_key("b"))
            # Every band key recorded for "a" must map back to "a" in its table.
            for i, H in enumerate(await lsh.keys.get("a")):
                self.assertTrue("a" in await lsh.hashtables[i].get(H))

            # A generator with a different sample size (5 vs. the LSH's
            # configuration) must be rejected on insert.
            mg = WeightedMinHashGenerator(10, 5)
            m3 = mg.minhash(np.random.uniform(1, 10, 10))
            with self.assertRaises(ValueError):
                await lsh.insert("c", m3)
github ekzhu / datasketch / test / aio / test_lsh_mongo.py View on Github external
async def test_insert_mongo(self):
        """Insert two weighted MinHashes and check they land in every hashtable."""
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5, num_perm=4) as lsh:
            generator = WeightedMinHashGenerator(10, 4)
            first = generator.minhash(np.random.uniform(1, 10, 10))
            second = generator.minhash(np.random.uniform(1, 10, 10))
            await lsh.insert("a", first)
            await lsh.insert("b", second)
            for table in lsh.hashtables:
                # Each hashtable must have received at least one bucket.
                self.assertTrue(await table.size() >= 1)
                stored_keys = []
                for bucket in await table.keys():
                    stored_keys.extend(await table.get(bucket))
                self.assertTrue("a" in stored_keys)
                self.assertTrue("b" in stored_keys)
            self.assertTrue(await lsh.has_key("a"))
            self.assertTrue(await lsh.has_key("b"))
            # Every band key recorded for "a" must map back to "a" in its table.
            for index, bucket in enumerate(await lsh.keys.get("a")):
                self.assertTrue("a" in await lsh.hashtables[index].get(bucket))
github ekzhu / datasketch / test / aio / test_lsh_mongo.py View on Github external
async def test__H_mongo(self):
        """
        Check that _H emits byte strings of a single consistent length
        for a fixed concatenated hash value size.
        """
        mg = WeightedMinHashGenerator(100, sample_size=128)
        for _step in range(2, mg.sample_size + 1, 16):
            sample = mg.minhash(np.random.randint(1, 99999999, 100))
            async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                       num_perm=128) as lsh:
                await lsh.insert("m", sample)
                # Gather every table's keys, then compare their counts.
                key_counts = [
                    len(keys)
                    for keys in await asyncio.gather(
                        *(table.keys() for table in lsh.hashtables))
                ]
                self.assertTrue(all(key_counts[0] == count for count in key_counts))
github ekzhu / datasketch / test / aio / test_lsh_mongo.py View on Github external
async def test_insertion_session_mongo(self):
        """Bulk-insert keys through an insertion session and verify storage."""
        def batched(iterable, size):
            # Yield successive fixed-size tuples until the iterator is drained.
            source = iter(iterable)
            while True:
                piece = tuple(islice(source, size))
                if piece == ():
                    break
                yield piece

        random_words = batched(
            (random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
        # Random 4-letter words plus a fixed set of known keys to assert on.
        seq = frozenset(chain(
            (''.join(letters) for letters in random_words),
            ('aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg', 'kkd', 'yow', 'ppi', 'eer')))
        objs = [MinHash(16) for _ in range(len(seq))]
        for word, sketch in zip(seq, objs):
            for letter in word:
                sketch.update(letter.encode('utf-8'))

        data = list(zip(seq, objs))

        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5, num_perm=16) as lsh:
            async with lsh.insertion_session(batch_size=1000) as session:
                pending = [session.insert(key, sketch, check_duplication=False)
                           for key, sketch in data]
                await asyncio.gather(*pending)

            for table in lsh.hashtables:
                self.assertTrue(await table.size() >= 1)
                stored = []
                for bucket in await table.keys():
                    stored.extend(await table.get(bucket))
                self.assertTrue('aahhb' in stored)
                self.assertTrue('kld' in stored)
            self.assertTrue(await lsh.has_key('aahhb'))
            self.assertTrue(await lsh.has_key('kld'))
            # Every band key recorded for 'aahh' must map back to 'aahh'.
            for index, bucket in enumerate(await lsh.keys.get('aahh')):
                self.assertTrue('aahh' in await lsh.hashtables[index].get(bucket))
github ekzhu / datasketch / test / aio / test_lsh_mongo.py View on Github external
async def test_init_mongo(self):
        """A fresh LSH is empty, and custom weights shift the band/row split."""
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.8) as lsh:
            self.assertTrue(await lsh.is_empty())
            default_b, default_r = lsh.b, lsh.r

        # NOTE(review): weights presumably trade false positives against
        # false negatives — confirm against the AsyncMinHashLSH docs.
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.8,
                                   weights=(0.2, 0.8)) as lsh:
            weighted_b, weighted_r = lsh.b, lsh.r
        self.assertTrue(default_b < weighted_b)
        self.assertTrue(default_r > weighted_r)
github ekzhu / datasketch / test / aio / test_lsh_mongo.py View on Github external
async def test_init_mongo(self):
        """Custom weights yield more bands and fewer rows than the defaults."""
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.8) as lsh:
            self.assertTrue(await lsh.is_empty())
            plain = (lsh.b, lsh.r)

        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.8,
                                   weights=(0.2, 0.8)) as lsh:
            weighted = (lsh.b, lsh.r)
        self.assertTrue(plain[0] < weighted[0])
        self.assertTrue(plain[1] > weighted[1])
github google / timesketch / timesketch / lib / analyzers / phishy_domains.py View on Github external
from it and then creates a MinHash object from every
        remaining character in the domain.

        If a domain starts with www., the www. prefix is stripped
        from the domain before the MinHash is calculated.

        Args:
          domain: string with a full domain, eg. www.google.com

        Returns:
            A minhash (instance of datasketch.minhash.MinHash)
        """
        # Drop the last dot-separated label (the TLD) and keep the rest,
        # e.g. 'www.google.com' -> 'www.google'.
        domain_items = domain.split('.')
        domain_part = '.'.join(domain_items[:-1])

        # Feed the hash one character at a time, so the sketch reflects the
        # set of characters in the domain (minus the TLD).
        minhash = MinHash(similarity.DEFAULT_PERMUTATIONS)
        for char in domain_part:
            minhash.update(char.encode('utf8'))

        return minhash
github TeamHG-Memex / undercrawler / scripts / crawl_stats.py View on Github external
def print_stats(
        f, show=None, skip_unique=False, max_int_value=5, duration_limit=None,
        print_duplicates=False, print_urls=False, limit=None):
    """Accumulate and print statistics for crawled items read from *f*.

    Args:
        f: open stream of crawl items, consumed via item_reader.
        show: unused in the visible portion — presumably selects which
            stats to display; confirm in the rest of the function.
        skip_unique: when True, skip building the LSH duplicate detector.
        max_int_value: unused in the visible portion.
        duration_limit: stop reading once the crawl window exceeds this
            many seconds.
        print_duplicates: unused in the visible portion.
        print_urls: when True, echo every item URL as it is read.
        limit: maximum number of items to read (passed to item_reader).

    NOTE(review): this function continues beyond the extracted fragment;
    behavior after the read loop is not documented here.
    """
    stats = Counter()
    if not skip_unique:
        # Near-duplicate detection: 0.9 Jaccard threshold over 128 permutations.
        lsh = MinHashLSH(threshold=0.9, num_perm=128)
        too_common = get_too_common_shingles(f, limit=1000)
    urls = {}
    min_timestamp = max_timestamp = None
    for i, item in enumerate(item_reader(f, limit=limit)):
        if print_urls:
            print(item['url'])
        content_type = item.get('content_type', 'missing')
        # Count both the full content type and its major type ('text/html' -> 'text').
        stats.update([
            'content_type: ' + content_type,
            'content_type[0]: ' + content_type.split('/')[0]])
        if min_timestamp is None:
            min_timestamp = item['timestamp']
        max_timestamp = item['timestamp']
        # The span is divided by 1000 — timestamps presumably in milliseconds.
        if duration_limit and \
                (max_timestamp - min_timestamp) / 1000 > duration_limit:
            break
github fake-name / wlnupdates / util / db_organize.py View on Github external
# NOTE(review): fragment — the enclosing function header and the definition
# of `altn` were cut off by the page extraction; `altn` appears to hold rows
# of (altname_id, series_id, altname, series_name) — confirm against caller.
altnid_sid_dict  = dict([(tmp[0], tmp[1]) for tmp in altn])
	altnid_name_dict = dict([(tmp[0], tmp[2]) for tmp in altn])
	sid_sname_dict   = dict([(tmp[1], tmp[3]) for tmp in altn])

	# Invert altname-id -> series-id into series-id -> [altname ids].
	sid_altnid_dict = {}
	for nid, sid in altnid_sid_dict.items():
		sid_altnid_dict.setdefault(sid, [])
		sid_altnid_dict[sid].append(nid)


	print("Have %s altnames for %s series" % (len(altnid_sid_dict), len(sid_altnid_dict)))

	# MinHash parameters: 512 permutations over 3-grams of each name.
	perms = 512
	gram_sz = 3
	minhashes = {}
	lsh = MinHashLSH(threshold=SIMILARITY_RATIO, num_perm=perms)

	print("Building lsh minhash data structure")
	with ProcessPoolExecutor(max_workers=8) as ex:
		print("Submitting jobs")
		# Submit one minhash_str job per altname of at least 5 characters.
		futures = [(key, ex.submit(minhash_str, content, perms, gram_sz))
				for
					key, content
				in
					altnid_name_dict.items()
				if
					len(content) >= 5
			]

		print("Consuming futures")
		# NOTE(review): the loop body continues past the extracted fragment.
		for key, future in tqdm.tqdm(futures):
			minhash = future.result()