How to use the datasketch.lsh.MinHashLSH function in datasketch

To help you get started, we’ve selected a few datasketch examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github ekzhu / datasketch / datasketch / lshensemble.py View on Github external
raise ValueError("num_part must be at least 1")
        if m < 2 or m > num_perm:
            raise ValueError("m must be in the range of [2, num_perm]")
        if any(w < 0.0 or w > 1.0 for w in weights):
            raise ValueError("Weight must be in [0.0, 1.0]")
        if sum(weights) != 1.0:
            raise ValueError("Weights must sum to 1.0")
        self.threshold = threshold
        self.h = num_perm
        self.m = m
        rs = self._init_optimal_params(weights)
        # Initialize multiple LSH indexes for each partition
        storage_config = {'type': 'dict'} if not storage_config else storage_config
        basename = storage_config.get('basename', _random_name(11))
        self.indexes = [
            dict((r, MinHashLSH(
                num_perm=self.h,
                params=(int(self.h/r), r),
                storage_config=self._get_storage_config(
                    basename, storage_config, partition, r),
                prepickle=prepickle)) for r in rs)
            for partition in range(0, num_part)]
        self.lowers = [None for _ in self.indexes]
        self.uppers = [None for _ in self.indexes]
github ekzhu / datasketch / examples / lsh_examples.py View on Github external
def eg1():
    m1 = MinHash(num_perm=128)
    m2 = MinHash(num_perm=128)
    m3 = MinHash(num_perm=128)
    for d in set1:
        m1.update(d.encode('utf8'))
    for d in set2:
        m2.update(d.encode('utf8'))
    for d in set3:
        m3.update(d.encode('utf8'))

    # Create LSH index
    lsh = MinHashLSH(threshold=0.5, num_perm=128)
    lsh.insert("m2", m2)
    lsh.insert("m3", m3)
    result = lsh.query(m1)
    print("Approximate neighbours with Jaccard similarity > 0.5", result)
github google / timesketch / timesketch / lib / similarity.py View on Github external
1.0. The initialized MinHash LSH will be optimized for the
            threshold by minizing the false positive and false negative.

    Returns:
        A tuple with an LSH (instance of datasketch.lsh.LSH) and a
        dictionary with event ID as key and minhash as value.
    """
    if delimiters is None:
        delimiters = DEFAULT_DELIMITERS
    if num_perm is None:
        num_perm = DEFAULT_PERMUTATIONS
    if threshold is None:
        threshold = DEFAULT_THRESHOLD

    minhashes = {}
    lsh = MinHashLSH(threshold, num_perm)

    with lsh.insertion_session() as lsh_session:
        for event in events:
            # Insert minhash in LSH index.
            key = (event.event_id, event.event_type, event.index_name)
            minhash = minhash_from_text(
                event.source[field], num_perm, delimiters)
            minhashes[key] = minhash
            lsh_session.insert(key, minhash)

    return lsh, minhashes
github ekzhu / datasketch / examples / lsh_examples.py View on Github external
def eg2():
    mg = WeightedMinHashGenerator(10, 5)
    m1 = mg.minhash(v1)
    m2 = mg.minhash(v2)
    m3 = mg.minhash(v3)
    print("Estimated Jaccard m1, m2", m1.jaccard(m2))
    print("Estimated Jaccard m1, m3", m1.jaccard(m3))
    # Create LSH index
    lsh = MinHashLSH(threshold=0.1, num_perm=5)
    lsh.insert("m2", m2)
    lsh.insert("m3", m3)
    result = lsh.query(m1)
    print("Approximate neighbours with weighted Jaccard similarity > 0.1", result)