How to use the datatable.f expression in datatable

To help you get started, we've selected a few examples of datatable's f expression, based on popular ways it is used in public projects. Inside the DT[i, j, ...] call, f (the "frame proxy") stands for the frame being operated on: f.A refers to column A of that frame, and dt.f[0] to its first column by position.
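
Before diving into the project examples, here is a minimal, self-contained sketch of the three most common uses of f: filtering rows, selecting columns, and assigning into a subset. It is not taken from any of the projects below; the frame and column names are illustrative.

import datatable as dt
from datatable import f

DT = dt.Frame(A=[1, 2, 3, 4], B=["w", "x", "y", "z"])   # illustrative data

subset = DT[f.A > 2, :]     # keep rows where column A exceeds 2
col = DT[:, f.B]            # select column B
DT[f.A > 2, "B"] = "*"      # assign into a subset, as in the tests below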


From h2oai/datatable, tests/munging/test_replace.py (view on GitHub). This test assigns through f-expression selectors: in rows where column B is below 100, column A is overwritten in place while the frame's stypes are preserved.
def test_replace_str64():
    Y = dt.Frame([["BLSD", "RY", "IO OUSEVOUY", "@"], [3, 4, 1, 2]],
                 names=["A", "B"], stypes=["str64", "int32"])
    Y[f.B < 100, f.A] = "*"
    frame_integrity_check(Y)
    assert Y.stypes == (dt.str64, dt.int32)
    assert Y.to_list() == [["*"] * 4, [3, 4, 1, 2]]
From h2oai/datatable, tests/munging/test_dt_rows.py (view on GitHub). Here two columns are compared directly, f.A <= f.B, to build a row filter.
def test_rows_less_than_or_equal(df1):
    dt1 = df1[f.A <= f.B, :]
    frame_integrity_check(dt1)
    assert dt1.names == df1.names
    assert dt1.to_list() == [[0, 1, 3, 4, None, 9], [3, 2, 3, 4, None, 9]]
From h2oai/driverlessai-recipes, models/algorithms/knearestneighbour.py (view on GitHub). The snippet begins partway through the recipe's fit method (in the label-encoding branch of an if/else); it imputes each column's missing values with the column mean and uses dt.isna(dt.f[col]) to assert that none remain.
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
        else:
            model = KNeighborsRegressor(n_neighbors=self.params['n_neighbors'], metric=self.params['metric'],
                                        weights=self.params['weights'], n_jobs=self.params['n_jobs'])
        self.means = dict()
        self.standard_scaler = StandardScaler()
        for col in X.names:
            XX = X[:, col]
            self.means[col] = XX.mean1()
            if self.means[col] is None:
                self.means[col] = 0
            XX.replace(None, self.means[col])
            X[:, col] = XX
            assert X[dt.isna(dt.f[col]), col].nrows == 0
        X = X.to_numpy()
        X = self.standard_scaler.fit_transform(X)
        feature_model.fit(X, y)
        model.fit(X, y)
        importances = np.array(abs(feature_model.coef_))

        self.set_model_properties(model=model,
                                  features=orig_cols,
                                  importances=importances.tolist(),  # abs(model.coef_[0])
                                  iterations=0)
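
A condensed sketch of the imputation pattern used in the recipe above (the data is illustrative; Frame.replace() mutates the frame in place):

import datatable as dt

X = dt.Frame(x1=[1.0, None, 3.0], x2=[None, 2.0, 4.0])   # illustrative data

for col in X.names:
    XX = X[:, col]
    m = XX.mean1()                               # scalar mean of a 1-column frame
    XX.replace(None, m if m is not None else 0)  # fill NAs with the mean
    X[:, col] = XX
    assert X[dt.isna(dt.f[col]), col].nrows == 0   # no NAs remain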
From h2oai/db-benchmark, pydatatable/read-pydatatable.py (view on GitHub). This benchmark times dt.fread on the same file twice and checksums each result with sum(f.v3).
cache = "TRUE"

wc_lines = subprocess.run(['wc','-l',data_name], stdout=subprocess.PIPE).stdout.decode('utf-8').split(" ", 1)[0]
in_rows = int(wc_lines)-1

print("reading...")

question = "all rows" #1
gc.collect()
t_start = timeit.default_timer()
ans = dt.fread(data_name, show_progress=False)
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.v3)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt)
del ans
gc.collect()
t_start = timeit.default_timer()
ans = dt.fread(data_name, show_progress=False)
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.v3)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt)
del ans
gc.collect()
t_start = timeit.default_timer()
From h2oai/driverlessai-recipes, transformers/augmentation/ipaddress_features.py (view on GitHub). This transformer addresses the input's first column positionally, as dt.f[0], when building a computed column.
def transform(self, X: dt.Frame):

        try:
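            # dt.f[0] refers to the first column of X by position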
            return X[:, {"x": (dt.isna(dt.f[0])) & None | self.get_ip_property(self.parse_ipaddress(dt.f[0]))}]
            # return X.to_pandas().astype(str).iloc[:, 0].apply(lambda x: self.get_ip_property(self.parse_ipaddress(x)))

        except ValueError:
            return np.zeros(X.shape[0])
From h2oai/driverlessai-recipes, transformers/targetencoding/leaky_mean_target_encoder.py (view on GitHub). This encoder appends the target as a temporary column, aggregates dt.f[target] per group with dt.by, and keys the resulting frame for a later join.
def fit_transform(self, X: dt.Frame, y: np.array = None):
        target = '__internal_target__'
        X[:, target] = dt.Frame(y)
        target_is_numeric = X[:, target][:, [bool, int, float]].shape[1] > 0
        if target_is_numeric:
            self._group_means = X[:, dt.mean(dt.f[target]), dt.by(*self.input_feature_names)]
        else:
            X[:, target] = dt.Frame(LabelEncoder().fit_transform(X[:, target].to_pandas().iloc[:, 0].values).ravel())
            self._group_means = X[:, dt.median(dt.f[target]), dt.by(*self.input_feature_names)]
        del X[:, target]
        self._group_means.key = self.input_feature_names
        return self.transform(X)
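
A stripped-down sketch of the grouped-mean-plus-keyed-join pattern above (frame and column names are illustrative):

import datatable as dt
from datatable import f, by, join

X = dt.Frame(g=["a", "a", "b"], target=[1.0, 3.0, 5.0])   # illustrative

means = X[:, dt.mean(f.target), by("g")]    # per-group mean via f inside by()
means.names = {"target": "target_mean"}     # rename the aggregate column
means.key = "g"                             # key it so it can be joined
encoded = X[:, :, join(means)]              # attach each row's group mean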
From WZBSocialScienceCenter/tmtoolkit, tmtoolkit/preprocess/_tmpreproc.py (view on GitHub). The snippet starts inside the method's docstring; the method splits a token table into one frame per document by filtering on dt.f.doc.
:return: this instance
        """
        if not USE_DT:
            raise RuntimeError('this function requires the package "datatable" to be installed')

        import datatable as dt

        if not isinstance(tokendf, dt.Frame):
            raise ValueError('`tokendf` must be a datatable Frame object')

        if {'doc', 'position', 'token'} & set(pd_dt_colnames(tokendf)) != {'doc', 'position', 'token'}:
            raise ValueError('`tokendf` must contain the columns "doc", "position" and "token"')

        # convert big dataframe to dict of document token dicts to be used in load_tokens
        tokens = {}
        for dl in dt.unique(tokendf[:, dt.f.doc]).to_list()[0]:
            doc_df = tokendf[dt.f.doc == dl, :]
            colnames = pd_dt_colnames(doc_df)
            colnames.pop(colnames.index('doc'))
            tokens[dl] = doc_df[:, colnames]

        return self.load_tokens(tokens)
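
The split-by-unique-value loop above, distilled (frame and column names are illustrative):

import datatable as dt
from datatable import f

DT = dt.Frame(doc=["d1", "d1", "d2"], token=["a", "b", "c"])   # illustrative

parts = {}
for doc_id in dt.unique(DT[:, f.doc]).to_list()[0]:
    parts[doc_id] = DT[f.doc == doc_id, ["token"]]   # one frame per document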
From h2oai/db-benchmark, pydatatable/join-pydatatable.py (view on GitHub). The snippet starts partway through the script (x, small and medium are frames loaded earlier); it keys the right-hand frame, joins it onto x, and keeps matched rows by filtering with isfinite(f.v2).
big = dt.fread(src_jn_y[2])

print(x.nrows, flush=True)
print(small.nrows, flush=True)
print(medium.nrows, flush=True)
print(big.nrows, flush=True)

task_init = timeit.default_timer()
print("joining...", flush=True)

question = "small inner on int" # q1
gc.collect()
y = small.copy(deep=True)
t_start = timeit.default_timer()
y.key = 'id1'
ans = x[:, :, join(y)][isfinite(f.v2), :] # , on='id1'
tmp = ans.copy(deep=True) ## ensure join results materialized #141
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v1), sum(f.v2)]]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
del ans, y, tmp
gc.collect()
y = small.copy(deep=True)
t_start = timeit.default_timer()
y.key = 'id1'
ans = x[:, :, join(y)][isfinite(f.v2), :] # , on='id1'
tmp = ans.copy(deep=True)
print(ans.shape, flush=True)
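
The join idiom used by this benchmark, condensed (data is illustrative; isfinite comes from datatable.math and drops the rows the left join left unmatched):

import datatable as dt
from datatable import f, join
from datatable.math import isfinite

x = dt.Frame(id1=[1, 2, 3], v1=[0.1, 0.2, 0.3])   # illustrative
y = dt.Frame(id1=[1, 2], v2=[10.0, 20.0])         # illustrative

y.key = "id1"                    # the right frame must be keyed
ans = x[:, :, join(y)]           # left join on the key column id1
inner = ans[isfinite(f.v2), :]   # unmatched rows have NA v2; drop them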
From h2oai/db-benchmark, pydatatable/groupby-pydatatable.py (view on GitHub). The snippet also starts mid-script; it computes named per-group aggregates, such as sum(f.v1) and mean(f.v3), grouped with by(f.id3).
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v1), sum(f.v3)]]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
del ans
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v1": sum(f.v1), "v3": mean(f.v3)}, by(f.id3)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v1), sum(f.v3)]]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
del ans

question = "mean v1:v3 by id4" # q4
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v1": mean(f.v1), "v2": mean(f.v2), "v3": mean(f.v3)}, by(f.id4)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v1), sum(f.v2), sum(f.v3)]]
chkt = timeit.default_timer() - t_start
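
The grouped aggregation idiom used throughout this benchmark, condensed (data is illustrative):

import datatable as dt
from datatable import f, by

x = dt.Frame(id3=["g1", "g1", "g2"], v1=[1, 2, 3], v3=[0.5, 1.5, 2.5])   # illustrative

# a dict in j names the aggregates; by(f.id3) groups on an f-expression
ans = x[:, {"v1": dt.sum(f.v1), "v3": dt.mean(f.v3)}, by(f.id3)]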