How to use the datasketch.hyperloglog.HyperLogLog class in datasketch

To help you get started, we’ve selected a few datasketch examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github ekzhu / datasketch / benchmark / similarity_benchmark.py View on Github external
def _run_hyperloglog(A, B, data, seed, p):
    """Build one HyperLogLog per index range and compare them.

    Args:
        A: ``(start, end)`` pair of indices into ``data`` for the first sketch.
        B: ``(start, end)`` pair of indices into ``data`` for the second sketch.
        data: Indexable collection of the values to hash.
        seed: Seed forwarded to the murmur3 hasher on every call.
        p: Precision parameter forwarded to ``HyperLogLog``.

    Returns:
        Whatever ``_hyperloglog_jaccard`` returns for the two sketches.
    """
    (a_lo, a_hi), (b_lo, b_hi) = A, B
    murmur = pyhash.murmur3_32()
    sketch_a = HyperLogLog(p=p, hashobj=Hash)
    sketch_b = HyperLogLog(p=p, hashobj=Hash)
    # Fill the first sketch completely before touching the second one.
    jobs = ((sketch_a, a_lo, a_hi), (sketch_b, b_lo, b_hi))
    for sketch, lo, hi in jobs:
        for idx in xrange(lo, hi):
            sketch.update(murmur(data[idx], seed=seed))
    return _hyperloglog_jaccard(sketch_a, sketch_b)
github ekzhu / datasketch / benchmark / inclusion_benchmark.py View on Github external
def _run_hyperloglog(A, B, data, seed, p):
    """Build one HyperLogLog per index range and pass both on for comparison.

    Args:
        A: ``(start, end)`` pair of indices into ``data`` for the first sketch.
        B: ``(start, end)`` pair of indices into ``data`` for the second sketch.
        data: Indexable collection of the values to hash.
        seed: Seed forwarded to the murmur3 hasher on every call.
        p: Precision parameter forwarded to ``HyperLogLog``.

    Returns:
        Whatever ``_hyperloglog_inclusion`` returns for the two sketches.
    """
    (a_lo, a_hi), (b_lo, b_hi) = A, B
    murmur = pyhash.murmur3_32()
    sketch_a = HyperLogLog(p=p, hashobj=Hash)
    sketch_b = HyperLogLog(p=p, hashobj=Hash)
    # Fill the first sketch completely before touching the second one.
    jobs = ((sketch_a, a_lo, a_hi), (sketch_b, b_lo, b_hi))
    for sketch, lo, hi in jobs:
        for idx in xrange(lo, hi):
            sketch.update(murmur(data[idx], seed=seed))
    return _hyperloglog_inclusion(sketch_a, sketch_b)
github ekzhu / datasketch / benchmark / similarity_benchmark.py View on Github external
def _run_hyperloglog(A, B, data, seed, p):
    """Build one HyperLogLog per index range and compare them.

    Args:
        A: ``(start, end)`` pair of indices into ``data`` for the first sketch.
        B: ``(start, end)`` pair of indices into ``data`` for the second sketch.
        data: Indexable collection of the values to hash.
        seed: Seed forwarded to the murmur3 hasher on every call.
        p: Precision parameter forwarded to ``HyperLogLog``.

    Returns:
        Whatever ``_hyperloglog_jaccard`` returns for the two sketches.
    """
    (a_lo, a_hi), (b_lo, b_hi) = A, B
    murmur = pyhash.murmur3_32()
    sketch_a = HyperLogLog(p=p, hashobj=Hash)
    sketch_b = HyperLogLog(p=p, hashobj=Hash)
    # Fill the first sketch completely before touching the second one.
    jobs = ((sketch_a, a_lo, a_hi), (sketch_b, b_lo, b_hi))
    for sketch, lo, hi in jobs:
        for idx in xrange(lo, hi):
            sketch.update(murmur(data[idx], seed=seed))
    return _hyperloglog_jaccard(sketch_a, sketch_b)
github ekzhu / datasketch / examples / hyperloglog_examples.py View on Github external
def eg1():
    """Print the HyperLogLog count for ``data1`` next to its exact distinct count."""
    sketch = HyperLogLog()
    for item in data1:
        # Each item is encoded to UTF-8 bytes before being fed to the sketch.
        sketch.update(item.encode('utf8'))
    estimate = sketch.count()
    print("Estimated cardinality is", estimate)

    # Exact distinct count for comparison.
    distinct = set(data1)
    print("Actual cardinality is", len(distinct))
github ekzhu / datasketch / benchmark / hyperloglog_benchmark.py View on Github external
def run_perf(card, p):
    """Time how long it takes to feed ``card`` values into a HyperLogLog.

    Args:
        card: Number of values to insert. Each integer in ``range(card)`` is
            converted with ``int_bytes`` and passed to ``update``.
        p: Precision parameter forwarded to ``HyperLogLog(p=p)``.

    Returns:
        The time, in seconds, spent in the update loop.
    """
    # time.clock() was removed in Python 3.8. Prefer perf_counter() and only
    # fall back to clock() on interpreters that predate it (Python 2).
    # NOTE: perf_counter() measures wall-clock time, whereas clock() measured
    # CPU time on Unix; for this single-threaded loop the two are comparable.
    timer = getattr(time, "perf_counter", None) or time.clock
    h = HyperLogLog(p=p)
    logging.info("HyperLogLog using p = %d " % p)
    start = timer()
    for i in range(card):
        h.update(int_bytes(i))
    duration = timer() - start
    logging.info("Digested %d hashes in %.4f sec" % (card, duration))
    return duration
github ekzhu / datasketch / benchmark / inclusion_benchmark.py View on Github external
def _run_hyperloglog(A, B, data, seed, p):
    """Build one HyperLogLog per index range and pass both on for comparison.

    Args:
        A: ``(start, end)`` pair of indices into ``data`` for the first sketch.
        B: ``(start, end)`` pair of indices into ``data`` for the second sketch.
        data: Indexable collection of the values to hash.
        seed: Seed forwarded to the murmur3 hasher on every call.
        p: Precision parameter forwarded to ``HyperLogLog``.

    Returns:
        Whatever ``_hyperloglog_inclusion`` returns for the two sketches.
    """
    (a_lo, a_hi), (b_lo, b_hi) = A, B
    murmur = pyhash.murmur3_32()
    sketch_a = HyperLogLog(p=p, hashobj=Hash)
    sketch_b = HyperLogLog(p=p, hashobj=Hash)
    # Fill the first sketch completely before touching the second one.
    jobs = ((sketch_a, a_lo, a_hi), (sketch_b, b_lo, b_hi))
    for sketch, lo, hi in jobs:
        for idx in xrange(lo, hi):
            sketch.update(murmur(data[idx], seed=seed))
    return _hyperloglog_inclusion(sketch_a, sketch_b)
github ekzhu / datasketch / datasketch / hyperloglog.py View on Github external
size = struct.calcsize('B')
        try:
            p = struct.unpack_from('B', buf, 0)[0]
        except TypeError:
            p = struct.unpack_from('B', buffer(buf), 0)[0]
        self.__init__(p=p)
        offset = size
        try:
            self.reg = np.array(struct.unpack_from('%dB' % self.m,
                buf, offset), dtype=np.int8)
        except TypeError:
            self.reg = np.array(struct.unpack_from('%dB' % self.m,
                buffer(buf), offset), dtype=np.int8)


class HyperLogLogPlusPlus(HyperLogLog):
    '''
    HyperLogLog++ is an enhanced HyperLogLog from Google.
    Main changes from the original HyperLogLog:

    1. Use 64 bits instead of 32 bits for hash function
    2. A new small-cardinality estimation scheme
    3. Sparse representation (not implemented here)

    Args:
        p (int, optional): The precision parameter. It is ignored if
            the `reg` is given.
        reg (numpy.array, optional): The internal state.
            This argument is for initializing the HyperLogLog from
            an existing one.
        hashfunc (optional): The hash function used by this MinHash.
github ekzhu / datasketch / benchmark / hyperloglog_benchmark.py View on Github external
def run_acc(size, seed, p):
    """Measure the relative error of a HyperLogLog count on random input.

    Seeds the module-level ``random`` generator with ``seed``, draws ``size``
    integers from ``[1, size]``, and feeds each one (after ``int_bytes``) to
    both a HyperLogLog and an exact ``set``.

    Args:
        size: Number of draws, and also the upper bound of the drawn values.
        seed: Seed passed to ``random.seed``.
        p: Precision parameter forwarded to ``HyperLogLog(p=p)``.

    Returns:
        ``abs(exact - estimate) / exact`` where ``exact`` is the number of
        distinct values seen and ``estimate`` is ``HyperLogLog.count()``.
    """
    logging.info("HyperLogLog using p = %d " % p)
    sketch = HyperLogLog(p=p)
    seen = set()
    random.seed(seed)
    for _ in range(size):
        encoded = int_bytes(random.randint(1, size))
        sketch.update(encoded)
        seen.add(encoded)
    exact = float(len(seen))
    return abs(exact - sketch.count()) / exact