How to use datatable - 10 common examples

To help you get started, we’ve selected a few datatable examples based on popular ways the library is used in public projects.

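All of the examples below lean on the same handful of datatable building blocks: dt.Frame for holding data, f-expressions for referring to columns, and the DT[i, j, ...] selector syntax for filtering, selecting, and assigning. As a quick orientation before the excerpts, here is a minimal, self-contained sketch of those basics; the column names and values are invented purely for illustration.

    import datatable as dt
    from datatable import f

    # Toy data, made up for illustration only
    DT = dt.Frame(Year=[2007, 2007, 2008],
                  Month=[1, 2, 1],
                  DayofMonth=[15, 3, 28])

    # Derive a new column from existing ones using f-expressions
    DT[:, "Date"] = f.Year * 10000 + f.Month * 100 + f.DayofMonth

    # Select a subset of rows and columns
    recent = DT[f.Year == 2008, ["Date", "Month"]]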

Example 1: h2oai / driverlessai-recipes / data / airlines_joined_data_flights_in_out_regression.py (View on GitHub)
        X[:, date_col] = dt.f['Year'] * 10000 + dt.f['Month'] * 100 + dt.f['DayofMonth']
        cols_to_keep = ['Date']

        # add number of flights in/out for each airport per given interval
        timeslice_mins = 60
        for name, new_col, col, group in [
            ("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
            ("in", "CRSArrTime_mod", "CRSArrTime", "Dest")
        ]:
            X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
            group_cols = [date_col, group, new_col]
            new_name = 'flights_%s' % name
            flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
            flights.key = group_cols
            cols_to_keep.append(new_name)
            X = X[:, :, dt.join(flights)]

        # Fill NaNs with 0s
        X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0
        cols_to_keep.extend([
            'DepDelay',
            'Year',
            'Month',
            'DayofMonth',
            'DayOfWeek',
            'CRSDepTime',
            'UniqueCarrier',
            'FlightNum',
            'TailNum',
            'CRSElapsedTime',
            'Origin',
            'Dest',
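
The pattern above (count rows per group, key the aggregate frame on the grouping columns, then join it back onto the original frame) is the standard datatable recipe for attaching per-group counts as a new column. Here is a stripped-down, self-contained sketch of the same idea, with made-up data and column names:

    import datatable as dt
    from datatable import f, by, count, join

    # Toy flight-like data, made up for illustration only
    DT = dt.Frame(Date=[20080101, 20080101, 20080101, 20080102],
                  Origin=["SFO", "SFO", "JFK", "SFO"])

    # Count rows per (Date, Origin) group
    counts = DT[:, {"flights_out": count()}, by("Date", "Origin")]

    # Key the aggregate by the grouping columns so it can be joined
    counts.key = ["Date", "Origin"]

    # Natural join: each row of DT picks up its group's count
    DT = DT[:, :, join(counts)]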
Example 2: h2oai / datatable / datatable / graph / context.py (View on GitHub)
        node.generate_c()
        out = _header
        out += "// Extern declarations\n"
        out += self._extern_declarations
        out += "\n\n"
        out += "// Global variables\n"
        out += self._global_declarations
        out += "\n"
        out += "\n\n\n"
        for fnbody in self._functions.values():
            out += fnbody
            out += "\n\n"
        return out


sz_sint, sz_int, sz_lint, sz_llint, sz_sizet = core.get_integer_sizes()
t16 = ("int" if sz_int == 2 else
       "short int" if sz_sint == 2 else "")
t32 = ("int" if sz_int == 4 else
       "long int" if sz_lint == 4 else "")
t64 = ("int" if sz_int == 8 else
       "long int" if sz_lint == 8 else
       "long long int" if sz_llint == 8 else "")
tsz = (t32 if sz_sizet == 4 else
       t64 if sz_sizet == 8 else "")
if not (t16 and t32 and t64 and tsz):
    raise RuntimeError("Invalid integer sizes: short int(%d), int(%d), "
                       "long int(%d), long long int(%d), size_t(%d)"
                       % (sz_sint, sz_int, sz_lint, sz_llint, sz_sizet))

decl_sizes = "\n".join(["typedef signed char int8_t;",
                        "typedef %s int16_t;" % t16,
Example 3: h2oai / datatable / datatable / graph / context.py (View on GitHub)
"typedef %s int16_t;" % t16,
                        "typedef %s int32_t;" % t32,
                        "typedef %s int64_t;" % t64,
                        "typedef unsigned char uint8_t;",
                        "typedef unsigned %s uint16_t;" % t16,
                        "typedef unsigned %s uint32_t;" % t32,
                        "typedef unsigned %s uint64_t;" % t64,
                        "typedef unsigned %s size_t;" % tsz])

(ptr_dt_malloc,
 ptr_dt_realloc,
 ptr_dt_free,
 # ptr_rowindex_from_filterfn32,
 ptr_dt_column_data,
 ptr_dt_unpack_slicerowindex,
 ptr_dt_unpack_arrayrowindex) = core.get_internal_function_ptrs()


_header = """
/**
 * This code is auto-generated by context.py
 **/
// Integer types
%s
#define NULL ((void*)0)

// External functions
typedef void* (*ptr_0)(size_t);
typedef void* (*ptr_1)(void*, size_t);
typedef void (*ptr_2)(void*);
typedef void* (*ptr_3)(void*, int64_t, int);
typedef void* (*ptr_4)(void*, int64_t);
Example 4: h2oai / driverlessai-recipes / data / airlines_joined_data_flights_in_out.py (View on GitHub)
("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
            ("in", "CRSArrTime_mod", "CRSArrTime", "Dest")
        ]:
            X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
            group_cols = [date_col, group, new_col]
            new_name = 'flights_%s_per_%d_min' % (name, timeslice_mins)
            flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
            flights.key = group_cols
            cols_to_keep.append(new_name)
            X = X[:, :, dt.join(flights)]

        # select flights leaving from SFO only
        X = X[dt.f['Origin'] == 'SFO', :]

        # Fill NaNs in DepDelay column
        X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0

        # create binary target column
        depdelay_threshold_mins = 15
        target = 'DepDelay%dm' % depdelay_threshold_mins
        X[:, target] = dt.f['DepDelay'] > depdelay_threshold_mins
        cols_to_keep.extend([
            target,
            'Year',
            'Month',
            'DayofMonth',
            'DayOfWeek',
            'CRSDepTime',
            'UniqueCarrier',
            'FlightNum',
            'TailNum',
            'CRSElapsedTime',
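
On top of the join pattern, this example shows three more everyday operations: filtering rows with a boolean f-expression, writing into a filtered subset to fill missing values, and deriving a boolean target column from a threshold. A small, self-contained sketch of that update-in-place style, again with invented data:

    import datatable as dt
    from datatable import f

    # Toy data, made up for illustration only
    DT = dt.Frame(Origin=["SFO", "JFK", "SFO"],
                  DepDelay=[3, None, 40])

    # Keep only rows matching a condition
    DT = DT[f.Origin == "SFO", :]

    # Fill missing values in a single column
    DT[dt.isna(f.DepDelay), "DepDelay"] = 0

    # Create a boolean target column from a threshold
    DT[:, "DepDelay15m"] = f.DepDelay > 15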
Example 5: h2oai / driverlessai-recipes / data / airlines_joined_data_flights_in_out_regression.py (View on GitHub)
        # add date
        date_col = 'Date'
        X[:, date_col] = dt.f['Year'] * 10000 + dt.f['Month'] * 100 + dt.f['DayofMonth']
        cols_to_keep = ['Date']

        # add number of flights in/out for each airport per given interval
        timeslice_mins = 60
        for name, new_col, col, group in [
            ("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
            ("in", "CRSArrTime_mod", "CRSArrTime", "Dest")
        ]:
            X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
            group_cols = [date_col, group, new_col]
            new_name = 'flights_%s' % name
            flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
            flights.key = group_cols
            cols_to_keep.append(new_name)
            X = X[:, :, dt.join(flights)]

        # Fill NaNs with 0s
        X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0
        cols_to_keep.extend([
            'DepDelay',
            'Year',
            'Month',
            'DayofMonth',
            'DayOfWeek',
            'CRSDepTime',
            'UniqueCarrier',
            'FlightNum',
            'TailNum',
Example 6: h2oai / datatable / tests / models / test_ftrl.py (View on GitHub)
    ft = Ftrl(alpha = 0.1, nepochs = 10000, model_type = "binomial")
    df_train_odd = dt.Frame([[1, 3, 7, 5, 9]])
    df_target_odd = dt.Frame([["odd", "odd", "odd", "odd", "odd"]])
    ft.fit(df_train_odd, df_target_odd)
    assert_equals(ft.labels, dt.Frame([["odd"], [0]], names = ["label", "id"]))

    df_train_wrong = dt.Frame([[2, 4, None, 6]])
    df_target_wrong = dt.Frame([["even", "even", "none", "even"]])
    with pytest.raises(ValueError) as e:
        ft.fit(df_train_wrong, df_target_wrong)
    assert ("Got two new labels in the target column, however, positive "
            "label is already set"
            == str(e.value))

    df_train_even_odd = dt.Frame([[2, 1, 8, 3]])
    df_target_even_odd = dt.Frame([["even", "odd", "even", "odd"]])
    ft.fit(df_train_even_odd, df_target_even_odd)
    assert_equals(ft.labels, dt.Frame([["even", "odd"], [1, 0]], names = ["label", "id"]))

    p = ft.predict(df_train_odd)
    p_dict = p.to_dict()
    delta_odd = [abs(i - j) for i, j in zip(p_dict["odd"], [1, 1, 1, 1, 1])]
    delta_even = [abs(i - j) for i, j in zip(p_dict["even"], [0, 0, 0, 0, 0])]
    assert ft.model_type_trained == "binomial"
    assert max(delta_odd) < epsilon
    assert max(delta_even) < epsilon

    p = ft.predict(df_train_even_odd)
    p_dict = p.to_dict()
    delta_even = [abs(i - j) for i, j in zip(p_dict["even"], [1, 0, 1, 0])]
    delta_odd = [abs(i - j) for i, j in zip(p_dict["odd"], [0, 1, 0, 1])]
    assert ft.model_type_trained == "binomial"
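
Besides being tests, the FTRL snippets above document the basic workflow of datatable's built-in Ftrl model: construct it with hyperparameters, call fit() on a training frame and a target frame, then call predict() to get one probability column per label. A compact sketch of that flow, with toy data and arbitrary hyperparameter values:

    import datatable as dt
    from datatable.models import Ftrl

    # Toy binomial problem: classify integers as odd or even
    df_train = dt.Frame([[1, 2, 3, 4, 5, 6]])
    df_target = dt.Frame([["odd", "even", "odd", "even", "odd", "even"]])

    ft = Ftrl(alpha=0.1, nepochs=1000, model_type="binomial")
    ft.fit(df_train, df_target)

    p = ft.predict(df_train)
    print(ft.labels)       # frame mapping each label to an id
    print(p.to_dict())     # per-label probability lists, e.g. {"even": [...], "odd": [...]}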
Example 7: h2oai / datatable / tests / models / test_ftrl.py (View on GitHub)
def test_ftrl_fit_predict_nones():
    ft = Ftrl()
    ft.fit(None, None)
    df_target = ft.predict(None)
    assert df_target == None
Example 8: h2oai / datatable / tests / models / test_ftrl.py (View on GitHub)
def test_ftrl_wrong_validation_target_type():
    nepochs = 1234
    nepochs_validation = 56
    nbins = 78
    ft = Ftrl(alpha = 0.5, nbins = nbins, nepochs = nepochs)
    r = range(ft.nbins)
    df_X = dt.Frame(r)
    df_y = dt.Frame(r)
    df_X_val = df_X
    df_y_val = dt.Frame(["Some string data" for _ in r])

    with pytest.raises(TypeError) as e:
        res = ft.fit(df_X, df_y, df_X_val, df_y_val,
                     nepochs_validation = 0)
    assert ("Training and validation target columns must have the same ltype, "
            "got: `integer` and `string`" == str(e.value))
Example 9: h2oai / datatable / tests / models / test_ftrl.py (View on GitHub)
@pytest.mark.parametrize('target',
                         [[True, False],
                         ["yes", "no"],
                         [20, 10],
                         [0.5, -0.5]])
def test_ftrl_fit_predict_bool_binomial(target):
    ft = Ftrl(alpha = 0.1, nepochs = 10000, model_type = "binomial")
    df_train = dt.Frame([True, False])
    df_target = dt.Frame(target)
    ft.fit(df_train, df_target)
    df_res = ft.predict(df_train)
    assert ft.labels[:, 0].to_list() == [sorted(target)]
    assert ft.model_type_trained == "binomial"
    assert df_res[0, 1] <= 1
    assert df_res[0, 1] >= 1 - epsilon
    assert df_res[1, 1] >= 0
    assert df_res[1, 1] < epsilon
    assert df_res[0, 0] >= 0
    assert df_res[0, 0] < epsilon
    assert df_res[1, 0] <= 1
    assert df_res[1, 0] >= 1 - epsilon
Example 10: h2oai / datatable / tests / models / test_ftrl.py (View on GitHub)
def test_ftrl_early_stopping_multinomial():
    nepochs = 2000
    ft = Ftrl(alpha = 0.2, nepochs = nepochs, double_precision = True)
    labels = ["blue", "green", "red"]

    df_train = dt.Frame(["cucumber", None, "shift", "sky", "day", "orange",
                         "ocean"])
    df_target = dt.Frame(["green", "red", "red", "blue", "green", None,
                          "blue"])
    res = ft.fit(df_train, df_target, df_train[:4, :], df_target[:4, :],
                 nepochs_validation = 1, validation_error = 1e-3)
    frame_integrity_check(ft.model)
    p = ft.predict(df_train)
    frame_integrity_check(p)
    p_none = 1/p.ncols
    p_dict = p.to_dict()
    p_list = p.to_list()
    sum_p = [sum(row) for row in zip(*p_list)]
    delta_sum = [abs(i - j) for i, j in zip(sum_p, [1] * 5)]
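
The validation-related tests above also show how early stopping is wired up: pass a validation frame and validation target to fit(), set nepochs_validation to control how many epochs pass between validation checks, and set validation_error as the minimum improvement required to keep training. Roughly, following the pattern in the test above:

    import datatable as dt
    from datatable.models import Ftrl

    df_train = dt.Frame(["cucumber", "shift", "sky", "day", "orange", "ocean"])
    df_target = dt.Frame(["green", "red", "blue", "green", "red", "blue"])

    ft = Ftrl(alpha=0.2, nepochs=2000, double_precision=True)

    # Stop early once the validation error stops improving by at least 1e-3
    res = ft.fit(df_train, df_target,
                 df_train[:4, :], df_target[:4, :],
                 nepochs_validation=1, validation_error=1e-3)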