How to use Kaggler - common examples

To help you get started, we've selected a few Kaggler examples based on popular ways the library is used in public projects.
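
The snippets below are taken from the project's test suite, so they rely on imports and module-level constants that this page does not show. A minimal sketch of that shared setup, with import paths and constant values assumed rather than copied from the repo:

import logging

import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split

# Import paths are assumptions based on the repo layout.
from kaggler.preprocessing import TargetEncoder, EmbeddingEncoder
from kaggler.model import AutoLGB, AutoXGB

# Constant values are placeholders; the tests only show the names.
TARGET_COL = 'target'
N_FOLD = 5
N_OBS, N_FEATURE, N_IMP_FEATURE = 10000, 100, 10
RANDOM_SEED = SEED = 42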


jeongyoonlee/Kaggler · tests/test_encoders.py (view on GitHub)
def test_TargetEncoder(generate_data):
    df = generate_data()
    feature_cols = [x for x in df.columns if x != TARGET_COL]
    cat_cols = [x for x in feature_cols if df[x].nunique() < 100]

    te = TargetEncoder()
    X_cat = te.fit_transform(df[cat_cols], df[TARGET_COL])
    print('Without CV:\n{}'.format(X_cat.head()))

    assert X_cat.shape[1] == len(cat_cols)

    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=RANDOM_SEED)
    te = TargetEncoder(cv=cv)
    X_cat = te.fit_transform(df[cat_cols], df[TARGET_COL])
    print('With CV (fit_transform()):\n{}'.format(X_cat.head()))

    assert X_cat.shape[1] == len(cat_cols)

    te = TargetEncoder(cv=cv)
    te.fit(df[cat_cols], df[TARGET_COL])
    X_cat = te.transform(df[cat_cols])
    print('With CV (fit() and transform() separately):\n{}'.format(X_cat.head()))

    assert X_cat.shape[1] == len(cat_cols)
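
The same encoder works in a real train/test split: fit on the training frame (out-of-fold when cv is given) and transform the held-out frame with the fitted per-category statistics. A sketch, with df_trn and df_tst as assumed train and test DataFrames:

cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=RANDOM_SEED)
te = TargetEncoder(cv=cv)

# Out-of-fold target encodings for the training rows...
X_trn_cat = te.fit_transform(df_trn[cat_cols], df_trn[TARGET_COL])
# ...and encodings for the test rows using statistics fit on train only.
X_tst_cat = te.transform(df_tst[cat_cols])
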
jeongyoonlee/Kaggler · tests/test_automl.py (view on GitHub)
def test_automl():
    X, y = make_regression(n_samples=N_OBS,
                           n_features=N_FEATURE,
                           n_informative=N_IMP_FEATURE,
                           random_state=RANDOM_SEED)
    X = pd.DataFrame(X, columns=['x{}'.format(i) for i in range(X.shape[1])])
    y = pd.Series(y)
    logging.info('X: {}, y: {}'.format(X.shape, y.shape))

    X_trn, X_tst, y_trn, y_tst = train_test_split(X, y, test_size=.2, random_state=RANDOM_SEED)

    model = AutoLGB(objective='regression', metric='l1')
    model.tune(X_trn, y_trn)
    model.fit(X_trn, y_trn)
    p = model.predict(X_tst)
    r = (np.random.rand(X_tst.shape[0]) * (y_trn.max() - y_trn.min()) + y_trn.min())
    logging.info('MAE (LGB): {:.4f}'.format(mae(y_tst, p)))
    assert mae(y_tst, p) < mae(y_tst, r)

    model = AutoXGB(objective='reg:linear', metric='rmse')
    model.tune(X_trn, y_trn)
    model.fit(X_trn, y_trn)
    p = model.predict(X_tst)
    r = (np.random.rand(X_tst.shape[0]) * (y_trn.max() - y_trn.min()) + y_trn.min())
    logging.info('MAE (XGB): {:.4f}'.format(mae(y_tst, p)))
    assert mae(y_tst, p) < mae(y_tst, r)
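
Note that r is a uniform random baseline drawn over the training target's range, so the assertions only verify that each tuned model beats random guessing on mean absolute error. After tune(), the chosen configuration lives on the model object; the attribute names below are taken from the automl.py excerpt at the end of this page:

print(model.params)    # tuned hyperparameters
print(model.n_best)    # best number of estimators found during tuning
print(model.features)  # feature subset the model was tuned on
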
jeongyoonlee/Kaggler · tests/test_encoders.py (view on GitHub)
def test_EmbeddingEncoder(generate_data):
    # The source snippet starts mid-function; this header is assumed to
    # mirror test_TargetEncoder in the same file.
    df = generate_data()
    feature_cols = [x for x in df.columns if x != TARGET_COL]
    cat_cols = [x for x in feature_cols if df[x].nunique() < 100]
    num_cols = [x for x in feature_cols if x not in cat_cols]

    print('Test with the regression target')
    ee = EmbeddingEncoder(cat_cols=cat_cols,
                          num_cols=num_cols,
                          random_state=RANDOM_SEED)

    X_emb = ee.fit_transform(X=df[feature_cols], y=df[TARGET_COL])
    assert X_emb.shape[1] == sum(ee.n_emb)

    print('Test with the binary classification target')
    df[TARGET_COL] = (df[TARGET_COL] > df[TARGET_COL].mean()).astype(int)

    ee = EmbeddingEncoder(cat_cols=cat_cols,
                          num_cols=num_cols,
                          random_state=RANDOM_SEED)

    X_emb = ee.fit_transform(X=df[feature_cols], y=df[TARGET_COL])
    assert X_emb.shape[1] == sum(ee.n_emb)

    print('Test with the binary classification target with cross validation')
    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=RANDOM_SEED)
    ee = EmbeddingEncoder(cat_cols=cat_cols,
                          num_cols=num_cols,
                          cv=cv,
                          random_state=RANDOM_SEED)

    X_emb = ee.fit_transform(X=df[feature_cols], y=df[TARGET_COL])
    assert X_emb.shape[1] == sum(ee.n_emb)
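
EmbeddingEncoder returns one embedding block per categorical column, with the widths recorded in ee.n_emb (hence the shape assertion). To feed the embeddings to a downstream model alongside the raw numeric features, one option is a horizontal stack; a sketch assuming X_emb is a NumPy array:

X_full = np.hstack([X_emb, df[num_cols].values])
assert X_full.shape[1] == sum(ee.n_emb) + len(num_cols)
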
jeongyoonlee/kaggler-template · src/train_predict_krs1.py (view on GitHub)
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est=100, batch_size=1024, retrain=True):

    # Strip a double extension (e.g. '.csv.gz') to get the model name.
    model_name = os.path.splitext(os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    dims = X.shape[1]
    logging.info('{} dims'.format(dims))

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p = np.zeros_like(y)
    p_tst = np.zeros((X_tst.shape[0],))
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        logging.info('Training model #{}'.format(i))
        clf = nn_model(dims)
        clf.fit_generator(generator=batch_generator(X[i_trn],
                                                    y[i_trn],
                                                    batch_size,
                                                    True),
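
(load_data, nn_model, and batch_generator are helpers defined elsewhere in the template repo.) The loop is standard out-of-fold stacking: p collects validation-fold predictions so that every training row is predicted by a model that never saw it, while p_tst accumulates test predictions averaged over the folds. The tail of the loop is truncated above, but it plausibly ends with something like this sketch (not the repo's exact code):

        p[i_val] = clf.predict(X[i_val]).flatten()      # out-of-fold predictions
        p_tst += clf.predict(X_tst).flatten() / N_FOLD  # fold-averaged test predictions
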
jeongyoonlee/Kaggler · kaggler/model/automl.py (view on GitHub)
                             # Snippet begins mid-call: these are the trailing
                             # arguments of the hyperopt search (presumably fmin)
                             # over self.space.
                             algo=tpe.suggest, max_evals=n_eval, verbose=1,
                             rstate=self.random_state)

        hyperparams = space_eval(self.space, best)
        return hyperparams, trials

    def fit(self, X, y):
        self.model = XGBModel(n_estimators=self.n_best, **self.params)
        self.model.fit(X=X[self.features], y=y, eval_metric='mae', verbose=False)
        return self

    def predict(self, X):
        return self.model.predict(X[self.features])


class AutoLGB(BaseAutoML):

    params = {
        "bagging_freq": 1,
        "verbosity": -1,
        "seed": RANDOM_SEED,
        "num_threads": -1,
    }

    space = {
        "learning_rate": hp.loguniform("learning_rate", np.log(0.01), np.log(0.3)),
        "num_leaves": hp.choice("num_leaves", [15, 31, 63, 127, 255]),
        "max_depth": hp.choice("max_depth", [-1, 4, 6, 8, 10]),
        "feature_fraction": hp.quniform("feature_fraction", .5, .9, 0.1),
        "bagging_fraction": hp.quniform("bagging_fraction", .5, .9, 0.1),
        "min_child_samples": hp.choice('min_child_samples', [10, 25, 100]),
        "lambda_l1": hp.choice('lambda_l1', [0, .1, 1, 10]),