How to use the kaggler.data_io.load_data function in Kaggler

To help you get started, we’ve selected a few examples of kaggler.data_io.load_data, based on popular ways the function is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes — no build needed — and fix issues immediately.

Source: GitHub, jeongyoonlee/kaggler-template, src/train_predict_krs1.py (view on GitHub, external link)
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est=100, batch_size=1024, retrain=True):

    model_name = os.path.splitext(os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    dims = X.shape[1]
    logging.info('{} dims'.format(dims))

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p = np.zeros_like(y)
    p_tst = np.zeros((X_tst.shape[0],))
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        logging.info('Training model #{}'.format(i))
        clf = nn_model(dims)
        clf.fit_generator(generator=batch_generator(X[i_trn],
                                                    y[i_trn],
                                                    batch_size,
Source: GitHub, jeongyoonlee/kaggler-template, src/train_predict_krs1.py (view on GitHub, external link)
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est=100, batch_size=1024, retrain=True):

    model_name = os.path.splitext(os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    dims = X.shape[1]
    logging.info('{} dims'.format(dims))

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p = np.zeros_like(y)
    p_tst = np.zeros((X_tst.shape[0],))
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        logging.info('Training model #{}'.format(i))
        clf = nn_model(dims)
        clf.fit_generator(generator=batch_generator(X[i_trn],
                                                    y[i_trn],
                                                    batch_size,
                                                    True),
Source: GitHub, jeongyoonlee/kaggler-template, src/train_predict_lgb1.py (view on GitHub, external link)
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est=100, n_leaf=200, lrate=.1, n_min=8, subcol=.3, subrow=.8,
                  subrow_freq=100, n_stop=100, retrain=True):

    model_name = os.path.splitext(os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    params = {'random_state': SEED,
              'n_jobs': -1,
              'objective': 'binary',
              'boosting': 'gbdt',
              'learning_rate': lrate,
              'num_leaves': n_leaf,
              'feature_fraction': subcol,
              'bagging_fraction': subrow,
              'bagging_freq': subrow_freq,
              'verbosity': -1,
              'min_child_samples': n_min,
Source: GitHub, jeongyoonlee/kaggler-template, src/train_predict_xgb1.py (view on GitHub, external link)
params = {'objective': "reg:linear",
              'max_depth': depth,
              'eta': lrate,
              'subsample': subrow,
              'colsample_bytree': subcol,
              'colsample_bylevel': sublev,
              'min_child_weight': weight,
              'silent': 1,
              'nthread': 10,
              'seed': SEED}

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    y = np.log(y + offset)

    X_tst, _ = load_data(test_file)
    xgtst = xgb.DMatrix(X_tst)

    logging.info('Loading CV Ids')
    cv = KFold(len(y), n_folds=n_fold, shuffle=True, random_state=SEED)

    p_val = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    for i, (i_trn, i_val) in enumerate(cv, 1):
        xgtrn = xgb.DMatrix(X[i_trn], label=y[i_trn])
        xgval = xgb.DMatrix(X[i_val], label=y[i_val])

        logging.info('Training model #{}'.format(i))
        watchlist = [(xgtrn, 'train'), (xgval, 'val')]

        if i == 1:
            logging.info('Training with early stopping')
Source: GitHub, jeongyoonlee/kaggler-template, src/train_predict_xgb1.py (view on GitHub, external link)
filename='{}.log'.format(model_name))

    # set xgb parameters
    params = {'objective': "reg:linear",
              'max_depth': depth,
              'eta': lrate,
              'subsample': subrow,
              'colsample_bytree': subcol,
              'colsample_bylevel': sublev,
              'min_child_weight': weight,
              'silent': 1,
              'nthread': 10,
              'seed': SEED}

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    y = np.log(y + offset)

    X_tst, _ = load_data(test_file)
    xgtst = xgb.DMatrix(X_tst)

    logging.info('Loading CV Ids')
    cv = KFold(len(y), n_folds=n_fold, shuffle=True, random_state=SEED)

    p_val = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    for i, (i_trn, i_val) in enumerate(cv, 1):
        xgtrn = xgb.DMatrix(X[i_trn], label=y[i_trn])
        xgval = xgb.DMatrix(X[i_val], label=y[i_val])

        logging.info('Training model #{}'.format(i))
        watchlist = [(xgtrn, 'train'), (xgval, 'val')]
Source: GitHub, jeongyoonlee/kaggler-template, src/train_predict_lgb1.py (view on GitHub, external link)
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est=100, n_leaf=200, lrate=.1, n_min=8, subcol=.3, subrow=.8,
                  subrow_freq=100, n_stop=100, retrain=True):

    model_name = os.path.splitext(os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    params = {'random_state': SEED,
              'n_jobs': -1,
              'objective': 'binary',
              'boosting': 'gbdt',
              'learning_rate': lrate,
              'num_leaves': n_leaf,
              'feature_fraction': subcol,
              'bagging_fraction': subrow,
              'bagging_freq': subrow_freq,
              'verbosity': -1,
              'min_child_samples': n_min,
              'metric': 'auc'}