How to use the datasketch.LeanMinHash class in datasketch

To help you get started, we’ve selected a few datasketch examples, based on popular ways the library is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github Carbonara-Project / Guanciale / guanciale / matching.py View on Github external
#flow.append(hex(instr.offset)+"  OUT:" + calleds_dict[instr.addr])
                    flowhash.update("OUT:" + calleds_dict[instr.addr])
                    self.targets[instr.addr] = "OUT:" + calleds_dict[instr.addr]
            else:
                if instr.jumpout:
                    #flow.append(hex(instr.offset)+"  OUT:" + calleds_dict[instr.addr])
                    flowhash.update("OUT:" + calleds_dict[instr.addr])
                    self.targets[instr.addr] = "OUT:" + calleds_dict[instr.addr]
                else:
                    off = addrs_dict[instr.offset]
                    tgt = addrs_dict[instr.addr]
                    #flow.append("%x (%d)   JMP:%s   - %x (%d)" % (instr.offset, off, str(tgt - off), instr.addr, tgt))
                    flowhash.update("JMP:" + str(tgt - off))
                    self.targets[instr.addr] = "JMP:" + str(tgt - off)
        
        lean_flowhash = datasketch.LeanMinHash(flowhash)
        flowhash_buf = bytearray(lean_flowhash.bytesize())
        lean_flowhash.serialize(flowhash_buf)
        
        self.flowhash = str(flowhash_buf)
        '''
        for f in flow:
github findopendata / findopendata / apiserver / main.py View on Github external
resp = requests.post(lshserver_endpoint+"/query",
                json={"seed": query["seed"], "minhash": query["minhash"]})
        resp.raise_for_status()
    except requests.exceptions.HTTPError as err:
        app.logger.error("Error in querying the LSH server: {}".format(err))
        cnxpool.putconn(cnx)
        abort(500)
    column_ids = [column_id for column_id in resp.json()
            if column_id != str(query_id)]
    if len(column_ids) == 0:
        # Return empty result.
        cnxpool.putconn(cnx)
        return jsonify([])
    # Create the final query results.
    results = []
    query_minhash = LeanMinHash(seed=query["seed"], hashvalues=query["minhash"])
    # Obtain the column sketches of the results.
    with cnx.cursor(cursor_factory=RealDictCursor) as cursor:
        _execute_get_column_sketches(cursor, tuple(column_ids),
                original_hosts=original_host_filter)
        for column in cursor:
            # Skip columns from query table.
            if column["package_file_id"] == query["package_file_id"]:
                continue
            # Compute the similarities for each column in the result.
            jaccard = query_minhash.jaccard(LeanMinHash(
                    seed=column["seed"], hashvalues=column["minhash"]))
            containment = _containment(jaccard, column["distinct_count"],
                    query["distinct_count"])
            column.pop("seed")
            column.pop("minhash")
            column["jaccard"] = jaccard
github findopendata / findopendata / apiserver / main.py View on Github external
# Return empty result.
        cnxpool.putconn(cnx)
        return jsonify([])
    # Create the final query results.
    results = []
    query_minhash = LeanMinHash(seed=query["seed"], hashvalues=query["minhash"])
    # Obtain the column sketches of the results.
    with cnx.cursor(cursor_factory=RealDictCursor) as cursor:
        _execute_get_column_sketches(cursor, tuple(column_ids),
                original_hosts=original_host_filter)
        for column in cursor:
            # Skip columns from query table.
            if column["package_file_id"] == query["package_file_id"]:
                continue
            # Compute the similarities for each column in the result.
            jaccard = query_minhash.jaccard(LeanMinHash(
                    seed=column["seed"], hashvalues=column["minhash"]))
            containment = _containment(jaccard, column["distinct_count"],
                    query["distinct_count"])
            column.pop("seed")
            column.pop("minhash")
            column["jaccard"] = jaccard
            column["containment"] = containment
            if len(results) < limit:
                heapq.heappush(results,
                        (containment, column["id"], dict(column)))
            else:
                heapq.heappushpop(results,
                        (containment, column["id"], dict(column)))
    # Done with SQL.
    cnxpool.putconn(cnx)
    results = [column for _, _, column in heapq.nlargest(limit, results)]
github Carbonara-Project / Guanciale / guanciale / matching.py View on Github external
stmts[i].dst.value = addrs[stmts[i].dst.value]
                
                v = str(stmts[i]) + "\n"
                ins += v
                ngram = last + v
                shingled[ngram] = shingled.get(ngram, 0) +1
                last = v
            
            #self.vex_code += "+++ Instr #%d +++\n%s\n" % (c, ins)
        
        for ngram in shingled:
            for c in range(shingled[ngram]):
                vexhash.update("[%d]\n%s" % (c, ngram))
                #self.shingled_code += "[%d]\n%s" % (c, ngram)
        
        lean_vexhash = datasketch.LeanMinHash(vexhash)
        vexhash_buf = bytearray(lean_vexhash.bytesize())
        lean_vexhash.serialize(vexhash_buf)
        
        self.vexhash = str(vexhash_buf)