How to use the datatable.sum function in datatable

To help you get started, we’ve selected a few datatable examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github h2oai / datatable / tests / test-groups.py View on Github external
def test_groupby_large_random_integers(seed):
    random.seed(seed)
    ngrps1 = random.choice([1, 1, 2, 2, 2, 3, 4, 5])
    n0 = 1 << random.choice([1, 1, 2, 2, 2, 3, 3, 3, 4, 5, 6, 7])
    chunks = ([random.sample(range(n0), random.randint(1, n0))] +
              [random.sample([0] * 100 + list(range(256)),
                             random.randint(1, 20))
               for i in range(ngrps1)])
    n = int(random.expovariate(0.0001)) + 10
    sample = [sum(random.choice(chunks[i]) << (8 * i)
                  for i in range(len(chunks)))
              for _ in range(n)]
    nuniques = len(set(sample))
    f0 = dt.Frame(sample)
    assert f0.nunique1() == nuniques
    f1 = dt.rbind(*([f0] * random.randint(2, 20)))
    assert f1.nunique1() == nuniques
github h2oai / datatable / tests / test-groups.py View on Github external
def test_groupby_with_filter1():
    f0 = dt.Frame(KEY=[1, 2, 1, 2, 1, 2], X=[-10, 2, 3, 0, 1, -7])
    f1 = f0[f.X > 0, sum(f.X), f.KEY]
    assert f1.to_list() == [[1, 2], [4, 2]]
github h2oai / datatable / tests / test_reduce.py View on Github external
def test_sum_empty_frame():
    DT = dt.Frame([[]] * 4, names=list("ABCD"),
                  stypes=(dt.bool8, dt.int32, dt.float32, dt.float64))
    assert DT.shape == (0, 4)
    RZ = DT[:, sum(f[:])]
    frame_integrity_check(RZ)
    assert RZ.shape == (1, 4)
    assert RZ.names == ("A", "B", "C", "D")
    assert RZ.stypes == (dt.int64, dt.int64, dt.float32, dt.float64)
    assert RZ.to_list() == [[0], [0], [0], [0]]
    assert str(RZ)
github h2oai / datatable / tests / test-groups.py View on Github external
def test_group_empty_frame4():
    DT = dt.Frame(A=[], stype=dt.float32)
    D2 = DT[:, sum(f.A), by(f.A)]
    frame_integrity_check(D2)
    assert D2.shape == (0, 2)
    assert D2.stypes == (dt.float32, dt.float32)
github h2oai / db-benchmark / pydatatable / groupby-pydatatable.py View on Github external
chk = ans[:, sum(f.v1)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
del ans

question = "sum v1 mean v3 by id3" # q3
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v1": sum(f.v1), "v3": mean(f.v3)}, by(f.id3)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v1), sum(f.v3)]]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
del ans
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v1": sum(f.v1), "v3": mean(f.v3)}, by(f.id3)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v1), sum(f.v3)]]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
del ans
github h2oai / driverlessai-recipes / transformers / numeric / count_negative_values_transformer.py View on Github external
def transform(self, X: dt.Frame):
        return X[:, dt.sum([(dt.f[x] < 0) for x in range(X.ncols)])]
github WinVector / data_algebra / data_algebra / datatable_model.py View on Github external
    "sum": lambda expr: datatable.sum(expr_to_dt_expr(expr.args[0])),
    "+": lambda expr: expr_to_dt_expr(expr.args[0]) + expr_to_dt_expr(expr.args[1]),
github h2oai / driverlessai-recipes / transformers / numeric / sum.py View on Github external
def transform(self, X: dt.Frame):
        return X[:, dt.sum([dt.f[x] for x in range(X.ncols)])]
github h2oai / db-benchmark / pydatatable / groupby-pydatatable.py View on Github external
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.range_v1_v2)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
del ans
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"range_v1_v2": max(f.v1)-min(f.v2)}, by(f.id3)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.range_v1_v2)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
del ans

question = "largest two v3 by id6" # q8
gc.collect()
t_start = timeit.default_timer()
ans = x[:2, {"largest2_v3": f.v3}, by(f.id6), sort(-f.v3)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.largest2_v3)]
chkt = timeit.default_timer() - t_start