How to use the datatable.join function in datatable

To help you get started, we’ve selected a few examples of datatable.join, based on popular ways the function is used in public projects.


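Before the project examples, here is a minimal, self-contained sketch of the join API (the frame and column names are illustrative). The frame passed to join() must be keyed via Frame.key; the result is a left outer join on the key column(s), so rows without a match get None in the joined columns.

import datatable as dt
from datatable import join

# Lookup frame: must be keyed before it can be joined
prices = dt.Frame(item=["a", "b", "c"], price=[1.0, 2.5, 0.5])
prices.key = "item"

orders = dt.Frame(item=["a", "c", "a", "d"], qty=[3, 1, 2, 5])

# Left outer join on the key column "item"; the unmatched row ("d")
# gets None in the joined "price" column
result = orders[:, :, join(prices)]
print(result)
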
github h2oai / driverlessai-recipes / data / airlines_joined_data_flights_in_out_regression.py
        X[:, date_col] = dt.f['Year'] * 10000 + dt.f['Month'] * 100 + dt.f['DayofMonth']
        cols_to_keep = ['Date']

        # add number of flights in/out for each airport per given interval
        timeslice_mins = 60
        for name, new_col, col, group in [
            ("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
            ("in", "CRSArrTime_mod", "CRSArrTime", "Dest")
        ]:
            X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
            group_cols = [date_col, group, new_col]
            new_name = 'flights_%s' % name
            flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
            flights.key = group_cols
            cols_to_keep.append(new_name)
            X = X[:, :, dt.join(flights)]

        # Fill NaNs with 0s
        X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0
        cols_to_keep.extend([
            'DepDelay',
            'Year',
            'Month',
            'DayofMonth',
            'DayOfWeek',
            'CRSDepTime',
            'UniqueCarrier',
            'FlightNum',
            'TailNum',
            'CRSElapsedTime',
            'Origin',
            'Dest',
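
The pattern in the snippet above — aggregate with by(), key the aggregate, then join it back onto the original frame — is a common way to attach per-group counts to every row. A minimal sketch of the same idea (frame and column names are illustrative):

import datatable as dt
from datatable import by, count, join

DT = dt.Frame(airport=["JFK", "JFK", "SFO"], hour=[7, 7, 9])

# Count rows per (airport, hour), key the result on the grouping columns,
# then join the counts back so every row carries its group's count
counts = DT[:, {"n_flights": count()}, by("airport", "hour")]
counts.key = ["airport", "hour"]
DT = DT[:, :, join(counts)]
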
github h2oai / datatable / tests / test_join.py
            keys = list(set(dt.Frame(keys, stype=st).to_list()[0]))
        else:
            keys = list(set(keys))
    else:
        l = int(random.expovariate(0.05)) + 1
        keys = list(set(random_string(l) for _ in range(nkeys)))
    nkeys = len(keys)

    dkey = dt.Frame(KEY=keys, VAL=range(nkeys), stypes={"KEY": st})
    dkey.key = "KEY"
    keys, vals = dkey.to_list()
    main = [random.choice(keys) for i in range(ndata)]
    dmain = dt.Frame(KEY=main, stype=st)
    res = [vals[keys.index(main[i])] for i in range(ndata)]

    djoined = dmain[:, :, join(dkey)]
    frame_integrity_check(djoined)
    assert djoined.shape == (ndata, 2)
    assert djoined.names == ("KEY", "VAL")
    assert djoined.to_list() == [main, res]
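
This test builds a keyed lookup frame (dkey), joins it onto a frame of randomly sampled keys, and checks that the joined VAL column matches a plain Python lookup, i.e. that the join reproduces vals[keys.index(k)] for every row.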
github h2oai / datatable / tests / test_join.py
def test_join_missing_levels():
    d0 = dt.Frame(A=[1, 2, 3])
    d1 = dt.Frame(A=[1, 2], K=[True, False])
    d1.key = "A"
    res = d0[:, :, join(d1)]
    frame_integrity_check(res)
    assert res.to_list() == [[1, 2, 3], [True, False, None]]
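
As the assertion shows, keys present in the left frame but absent from the keyed frame (here, A == 3) produce None in the joined columns rather than raising an error.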
github h2oai / datatable / tests / random_attack.py
def join_self(self):
        ncols = self.ncols
        if self.nkeys:
            self.df = self.df[:, :, join(self.df)]
        else:
            with pytest.raises(ValueError, match="The join frame is not keyed"):
                self.df = self.df[:, :, join(self.df)]
            return False

        s = slice(self.nkeys, ncols)
        join_data = copy.deepcopy(self.data[s])
        join_types = self.types[s].copy()
        join_names = self.names[s].copy()

        self.data += join_data
        self.types += join_types
        self.names += join_names
        self.nkeys = 0
        self.dedup_names()
        return True
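
This fuzz test also documents the failure mode: calling join() with a frame that has no key raises ValueError with the message "The join frame is not keyed".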
github h2oai / datatable / tests / ijby / test-assign-expr.py
def test_assign_from_joined_frame():
    DT = dt.Frame(A=range(5))
    JDT = dt.Frame(A=[1, 2, 3], B=['a', 'b', 'c'])
    JDT.key = 'A'
    DT[:, "Z", join(JDT)] = g.B
    assert_equals(DT, dt.Frame(A=range(5), Z=[None, 'a', 'b', 'c', None]))
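
In expressions that involve a join, f refers to columns of the left frame and g to columns of the joined frame; the assignment above copies B from the keyed frame JDT into a new column Z of DT, leaving rows whose key has no match (A == 0 and A == 4) as None.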
github h2oai / driverlessai-recipes / recipes / ieee_fraud_2019.py
                print("MyIEEEGroupBysTransformers name  {} {}".format(
                    self._output_feature_names, self._feature_desc
                )
                )

                return X[:, dt.f["col_cnt"] / dt.f["daily_cnt"]]

            elif self.group_type == 'hour':

                X = dt.Frame(X[:, self.group_col])
                X[:, 'date'] = dt.Frame(ieee_datetime.dt.strftime('%Y%m%d_%H').values)

                # Compute hourly counts
                hourly_cnt = X[:, {"hourly_cnt": dt.count()}, dt.by("date")]
                hourly_cnt.key = ["date"]
                X = X[:, :, dt.join(hourly_cnt)]

                # Compute counts per (date, group column)
                col_cnt = X[:, {"col_cnt": dt.count()}, dt.by(*["date", self.group_col])]
                col_cnt.key = ["date", self.group_col]
                X = X[:, :, dt.join(col_cnt)]

                self._output_feature_names = ["IEEEGroupBys_{}_{}".format(self.group_col, self.group_type)]
                self._feature_desc = ["IEEEGroupBys_{}_{}".format(self.group_col, self.group_type)]

                print('=' * 50)
                print("MyIEEEGroupBysTransformers name  {} {}".format(
                    self._output_feature_names, self._feature_desc
                )
                )

                return X[:, dt.f["col_cnt"] / dt.f["hourly_cnt"]]
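
Note that hourly_cnt is keyed on "date" alone while col_cnt is keyed on the pair ("date", group column), so the second join matches on both columns; the expression returned at the end, col_cnt / hourly_cnt, is each group's share of rows within its hour.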
github h2oai / db-benchmark / pydatatable / join-pydatatable.py
y.key = 'id1'
ans = x[:, :, join(y)][isfinite(f.v2), :] # , on='id1'
tmp = ans.copy(deep=True) ## ensure join results materialized #141
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v1), sum(f.v2)]]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
del ans, y, tmp
gc.collect()
y = small.copy(deep=True)
t_start = timeit.default_timer()
y.key = 'id1'
ans = x[:, :, join(y)][isfinite(f.v2), :] # , on='id1'
tmp = ans.copy(deep=True)
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v1), sum(f.v2)]]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
del ans, y, tmp

question = "medium inner on int" # q2
gc.collect()
y = medium.copy(deep=True)
t_start = timeit.default_timer()
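
Because datatable's join is a left outer join, unmatched rows carry NA in the joined columns; the isfinite(f.v2) filter applied right after join(y) drops those rows, which is how this benchmark emulates an inner join.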
github h2oai / driverlessai-recipes / models / timeseries / historic_mean.py
def predict(self, X, **kwargs):
        if self.tgc is None or not all([x in X.names for x in self.tgc]):
            return np.ones(X.shape[0]) * self.nan_value

        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))

        # Datatable code
        if len(tgc_wo_time) > 0:
            # Join the average per group to the input dataframe
            self.group_means.key = tgc_wo_time
            # Predictions for unknown tgc will be None in DT
            yhat_dt = X[:, :, dt.join(self.group_means)][:, "yhat"]
            # In DT missing values after the join are None
            # Need to cast to float64 to replace None or np.nan
            yhat_dt.replace(None, np.float64(self.nan_value))

            return yhat_dt.to_numpy()[:, 0]
        else:
            # if no groups are available then just return the target average
            return np.full((X.shape[0], 1), self.nan_value)
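
A common follow-up to a join, shown above with yhat_dt.replace(None, ...), is filling the None values produced for unmatched keys. A minimal sketch (names are illustrative):

import datatable as dt
from datatable import join

means = dt.Frame(group=["a", "b"], yhat=[1.5, 2.0])
means.key = "group"

X = dt.Frame(group=["a", "b", "c"])
pred = X[:, :, join(means)][:, "yhat"]
pred.replace(None, 0.0)   # the unmatched group "c" becomes 0.0 instead of None
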
github h2oai / datatable / datatable / graph / __init__.py
def make_datatable(dt, rows, select, groupby=None, join=None, sort=None,
                   engine=None, mode=None, replacement=None):
    """
    Implementation of the `Frame.__call__()` method.

    This is the "main" function in the module; it is responsible for
    evaluating various transformations when they are applied to a target
    Frame.
    """
    if isinstance(groupby, datatable.join):
        join = groupby
        groupby = None
    update_mode = mode == "update"
    delete_mode = mode == "delete"
    jframe = join.joinframe if join else None
    with f.bind_datatable(dt), g.bind_datatable(jframe):
        ee = make_engine(engine, dt, jframe)
        ee.rowindex = dt.internal.rowindex
        rowsnode = make_rowfilter(rows, ee)
        grbynode = make_groupby(groupby, ee)
        colsnode = make_columnset(select, ee, update_mode)
        sortnode = make_sort(sort, ee)

        if join:
            join.execute(ee)
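
This last snippet is internal machinery from an older datatable release rather than user-facing code: it shows how a join() object passed in the groupby position is detected and rerouted (isinstance(groupby, datatable.join)), and how the join frame is bound to the g namespace before the row filter, groupby, column set, and sort nodes are evaluated.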