How to use the `lightgbm.Dataset` function in LightGBM

To help you get started, we’ve selected a few lightgbm examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github C-rawler / DCIC-2019-Credit-intelligence-score-2th-Place / lgb_mse.py View on Github external
"verbosity": -1}  # NOTE(review): closing line of a `param` dict that begins above this excerpt

# "seed": 8888
# folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=8888)
# idx = y_train.argsort()
# y_lab = np.repeat(list(range(50000 // 20)), 20)
# y_lab = np.asarray(sorted(list(zip(idx, y_lab))))[:, -1].astype(np.int32)
# splits = folds.split(X_train, y_lab)

# 5-fold cross-validation over the training matrix.
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
splits = folds.split(X_train, y_train)

for fold_, (trn_idx, val_idx) in enumerate(splits):
    print("fold n°{}".format(fold_ + 1))
    # Wrap this fold's train/validation slices as LightGBM Datasets.
    trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
    val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])

    num_round = 20000
    # Train with early stopping on the validation fold; progress logged every 100 rounds.
    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=100,
                    early_stopping_rounds=100)
    # Out-of-fold prediction at the best iteration; test predictions are
    # averaged across the 5 folds.
    oof_lgb[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration)
    predictions_lgb += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits


# Reported score is 1 / (1 + MAE). The (y_pred, y_true) argument order is
# swapped relative to sklearn's convention, but MAE is symmetric so the
# result is unchanged.
print("MAE CV score: {:<8.8f}".format(1/(mean_absolute_error(oof_lgb, y_train)+1)))
print(predictions_lgb)

# Persist out-of-fold and test predictions to disk.
np.save('val.mse_lgb.npy',oof_lgb)
np.save('test.mse_lgb.npy',predictions_lgb)
github MetaLearners / NIPS-2018-AutoML-Challenge / src / boosting.py View on Github external
def suggest_learning_rate(self, X, y, max_boost_round):
        """Grid-search a learning rate and boosting-round count.

        Holds out 20% of (X, y) for validation, then for each candidate rate
        trains incrementally in 500-round stages (continuing the same booster
        via ``init_model``) and scores each stage with a lightly regularized
        AUC: raw AUC minus ``0.1 * rate`` plus ``0.001 * stage`` (favouring
        smaller rates and, mildly, longer training).

        Returns:
            (best_lr, best_boost_round) — the rate and total round count of
            the highest-scoring cell.
        """
        candidate_rates = [0.01, 0.02, 0.03, 0.04, 0.05]

        X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

        params = self.setParams(self.default_hyper_param)

        n_stages = max_boost_round // 500
        scores = np.zeros([len(candidate_rates), n_stages])
        for row, rate in enumerate(candidate_rates):
            print ('learning rate: %.2f' % rate)
            params['learning_rate'] = rate
            train_data = lgb.Dataset(X_train, y_train, free_raw_data=False)
            booster = None
            for stage in range(n_stages):
                # Continue training the same booster for another 500 rounds.
                booster = lgb.train(params, train_data, num_boost_round=500,
                                    init_model=booster, keep_training_booster=True)
                # Score with regularization terms applied.
                raw_auc = roc_auc_score(y_valid, booster.predict(X_valid))
                scores[row, stage] = raw_auc - rate * 0.1 + stage * 0.001

        print (scores)
        # argmax over the flattened grid; recover (rate, stage) from the flat index.
        flat_best = np.argmax(scores)
        best_lr = candidate_rates[flat_best // n_stages]
        best_boost_round = (flat_best % n_stages + 1) * 500
        return best_lr, best_boost_round
github microsoft / LightGBM / tests / python_package_test / test_engine.py View on Github external
# NOTE(review): excerpt begins mid-way through a test method and the original
# indentation was mangled by extraction; code is left byte-identical.
params = {'objective': 'multiclass',
                  'metric': 'auc_mu',
                  'verbose': -1,
                  'num_classes': 2,
                  'seed': 0}
        results_auc_mu = {}
        # Train a 2-class "multiclass" model and record its auc_mu metric.
        lgb.train(params, lgb_X, num_boost_round=10, valid_sets=[lgb_X], evals_result=results_auc_mu)
        params = {'objective': 'binary',
                  'metric': 'auc',
                  'verbose': -1,
                  'seed': 0}
        results_auc = {}
        # Train the equivalent binary model and record plain auc.
        lgb.train(params, lgb_X, num_boost_round=10, valid_sets=[lgb_X], evals_result=results_auc)
        # With exactly two classes, auc_mu should coincide with binary AUC.
        np.testing.assert_allclose(results_auc_mu['training']['auc_mu'], results_auc['training']['auc'])
        # test the case where all predictions are equal
        # min_data_in_leaf=20 on only 10 rows presumably prevents any split,
        # forcing constant predictions — confirm against the full test.
        lgb_X = lgb.Dataset(X[:10], label=y_new[:10])
        params = {'objective': 'multiclass',
                  'metric': 'auc_mu',
                  'verbose': -1,
                  'num_classes': 2,
                  'min_data_in_leaf': 20,
                  'seed': 0}
        results_auc_mu = {}
        lgb.train(params, lgb_X, num_boost_round=10, valid_sets=[lgb_X], evals_result=results_auc_mu)
        # Constant predictions should yield auc_mu == 0.5.
        self.assertAlmostEqual(results_auc_mu['training']['auc_mu'][-1], 0.5)
        # should give 1 when accuracy = 1
        X = X[:10, :]
        y = y[:10]
        lgb_X = lgb.Dataset(X, label=y)
        params = {'objective': 'multiclass',
                  'metric': 'auc_mu',
                  'num_classes': 10,
github castorini / anserini / src / main / python / ecir2019_ccrf / rerank.py View on Github external
# NOTE(review): excerpt begins inside a classifier-dispatch function; the
# opening `if classifier == ...` branch is above this chunk. Code left
# byte-identical; indentation was mangled by extraction.
clf = sklearn.svm.SVC(kernel='linear', class_weight='balanced', probability=True, random_state=848)
        clf.fit(X_train, y_train)
        # Probability of the positive class (column 1 of predict_proba).
        y_test = clf.predict_proba(X_test)[:,1]
        return y_test
    elif classifier == 'lgb':
        # Fixed LightGBM binary-classification configuration.
        param = {
            'num_leaves':15,
            'num_iterations':100,
            'max_depth': 5,
            'objective':'binary',
            'is_unbalance': True,
            'metric': ['auc', 'binary_logloss'],
            'verbose': -1,
            'seed': 848
        }
        train_data = lgb.Dataset(X_train, label=y_train)
        clf = lgb.train(param, train_data)
        # For the 'binary' objective, predict() returns positive-class probabilities.
        y_test = clf.predict(X_test)
        return y_test
github electricbrainio / hypermax / research / atpe_research_2 / process_results.py View on Github external
# NOTE(review): excerpt begins deep inside a loop; `result`, `vector`,
# `vectors`, `targets`, `key`, and the various lookup dicts come from the
# enclosing scope that is not visible here. Code left byte-identical.
for atpeParamFeature in atpeParamFeatures:
                    # Only encode features that are present with a usable value.
                    if atpeParamFeature in result and result[atpeParamFeature] is not None and result[atpeParamFeature] != '':
                        if atpeParamFeature in atpeParameterValues:
                            # Categorical feature: one-hot encode against its known values.
                            for value in atpeParameterValues[atpeParamFeature]:
                                vector.append(1.0 if result[atpeParamFeature] == value else 0)
                        else:
                            # Numeric feature: use the raw value.
                            vector.append(float(result[atpeParamFeature]))
                    else:
                        vector.append(-3) # We use -3 because none of our atpe parameters ever take this value
                vectors.append(vector)

                # Class-prediction targets are encoded as indices into allTargets;
                # everything else is treated as a numeric regression target.
                if key in classPredictorKeys:
                    targets.append(allTargets.index(result[key]))
                else:
                    targets.append(float(result[key]))
        return lightgbm.Dataset(numpy.array(vectors), label=numpy.array(targets), feature_name=names)
github MetaLearners / NIPS-2018-AutoML-Challenge / src / automl.py View on Github external
# NOTE(review): excerpt begins mid-method; `X`, `y`, `ratio`, `time_bedget`
# (sic) and `self` attributes come from the enclosing scope. Indentation was
# mangled by extraction; code left byte-identical.
params = {
            'task': 'train',
            'boosting_type': 'goss', 
            'objective': 'binary', 
            'metric': 'auc', 
            'num_leaves': 31, 
            'learning_rate': 0.01, 
            'feature_fraction': 1.0, 
            'min_data_in_leaf': 5, 
            'top_rate': 0.1, 
            'other_rate': 0.05, 
            #'num_threads': 20, 
            'verbose': -1
        }
        data = lgb.Dataset(X, y)
        
        # Time a single training run of `base_round_num` boosting rounds.
        train_start = time.time()
        clf = lgb.train(params, data, num_boost_round=self.base_round_num)
        train_end = time.time()
        
        # Extrapolate: k batches are estimated at k * (one-batch time) * ratio.
        estimated_train_time = (np.arange(self.max_round) + 1) * (train_end - train_start) * ratio
        # Indices of batch counts whose estimated time still fits the budget.
        idx = np.arange(self.max_round)[estimated_train_time <= time_bedget]

        if (idx.shape[0] == 0):
            # Nothing fits the budget: fall back to a single base batch.
            self.suggested_boost_round = self.base_round_num
            self.suggested_train_time = estimated_train_time[0]
        else:
            # Largest batch count that fits the budget.
            self.suggested_boost_round = (idx[-1] + 1) * self.base_round_num
            self.suggested_train_time = estimated_train_time[idx[-1]]

        return self.suggested_boost_round
github NVIDIA / gbm-bench / new_utils.py View on Github external
def prepare(self):
        """Build the LightGBM training Dataset from ``self.data``.

        Stores the result on ``self.dtrain``. ``free_raw_data=False`` keeps
        the original training arrays attached to the Dataset instead of
        letting LightGBM release them after construction.
        """
        features = self.data.X_train
        labels = self.data.y_train
        self.dtrain = lgb.Dataset(features, labels, free_raw_data=False)
github Cocoxili / DCASE2018Task2 / stacking_level2.py View on Github external
# NOTE(review): excerpt begins inside a params dict whose opening brace is
# above this chunk; indentation was mangled by extraction. Code left
# byte-identical.
'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'max_depth': 5,
    'num_leaves': 31,
    'learning_rate': 0.025,
    'feature_fraction': 0.85,
    'lambda_l2': 1.5,
    'num_class': n_categories,
}

# K-fold loop: `kf` yields (train_fold, validate) index arrays per fold.
for i, (train_fold, validate) in enumerate(kf):
    print('Fold {}/{}'.format(i + 1, 5))
    X_train, X_validate, label_train, label_validate = \
                X.iloc[train_fold, :], X.iloc[validate, :], train_label[train_fold], train_label[validate]
    # Per-sample weights are sliced with the same fold indices as the features.
    lgb_train = lgb.Dataset(X_train, label_train, feature_name=feature_names, weight=weights_train[train_fold])
    lgb_valid = lgb.Dataset(X_validate, label_validate, feature_name=feature_names, weight=weights_train[validate])
    # NOTE(review): lgb_test is built here but not used in this excerpt —
    # presumably consumed below; confirm against the full file.
    lgb_test = lgb.Dataset(X_test, feature_name=feature_names,weight=weights_test)

    # Train with early stopping on the validation fold.
    bst = lgb.train(
        params_lgb,
        lgb_train,
        num_boost_round=2000,
        valid_sets=[lgb_train, lgb_valid],
        early_stopping_rounds=100,
        verbose_eval=50,
    )

    # Record the early-stopped best iteration for each fold.
    best_trees.append(bst.best_iteration)
    #ax = lgb.plot_importance(bst, max_num_features=10, grid=False, height=0.8, figsize=(16, 8))
    #plt.show()
github ArkadiyD / KaggleLib / kagglelib / model_tuning / cross_validation.py View on Github external
# NOTE(review): excerpt begins inside a CV fold loop; `f`, `train_index`,
# `val_index`, `train_x`, `train_y`, `params`, `categorical`, `verbose`, and
# `stopping_rounds` come from the enclosing scope. Code left byte-identical.
if f == 0 and verbose:
            
            # Log the hyper-parameters once, on the first fold only.
            print ("Training with params : ")
            print (params)
        
        y_train = train_y[train_index]
        y_val = train_y[val_index]
    
        # DataFrames are sliced with .ix, raw arrays by position.
        # NOTE(review): DataFrame.ix is removed in modern pandas; .iloc/.loc
        # would be required to run this against a current pandas release.
        if isinstance(train_x, pd.DataFrame):
            X_train = train_x.ix[train_index]
            X_val = train_x.ix[val_index]       
        else:
            X_train = train_x[train_index]
            X_val = train_x[val_index]

        train_data = lgbm.Dataset(X_train, label = y_train, feature_name = 'auto', categorical_feature = categorical)
        valid_data = lgbm.Dataset(X_val, label = y_val, feature_name = 'auto', categorical_feature = categorical)
            
                     
        # These params may arrive as floats (e.g. from a sampler); LightGBM
        # expects integers for them.
        params['max_depth'] = int(params['max_depth'])
        params['num_leaves'] = int(params['num_leaves'])
        params['bagging_freq'] = int(params['bagging_freq'])
        params['max_bin'] = int(params['max_bin'])
        params['min_data_in_leaf'] = int(params['min_data_in_leaf'])

        # Negative stopping_rounds means "no early stopping": train a fixed
        # number of rounds taken from params; otherwise train up to 1000
        # rounds with early stopping on the validation set.
        if stopping_rounds < 0:
            params['num_boost_round'] = int(params['num_boost_round'])
            model = lgbm.train(params, train_data)
            preds_val = model.predict(X_val)

        else:
            model = lgbm.train(params, train_data, num_boost_round = 1000, valid_sets = valid_data, verbose_eval = verbose, early_stopping_rounds = stopping_rounds)
github IBM / adversarial-robustness-toolbox / examples / get_started_lightgbm.py View on Github external
# Step 1a: Flatten dataset
# NOTE(review): excerpt begins mid-script; `x_train`, `y_train`,
# `min_pixel_value`, `max_pixel_value`, and `LightGBMClassifier` are defined
# above this chunk. Code left byte-identical.

# Use only the first five test samples.
x_test = x_test[0:5]
y_test = y_test[0:5]

nb_samples_train = x_train.shape[0]
nb_samples_test = x_test.shape[0]
# Flatten 28x28 images into 784-dimensional feature vectors.
x_train = x_train.reshape((nb_samples_train, 28 * 28))
x_test = x_test.reshape((nb_samples_test, 28 * 28))

# Step 2: Create the model

params = {'objective': 'multiclass', 'metric': 'multi_logloss', 'num_class': 10}
# Labels are one-hot encoded; argmax recovers the integer class ids LightGBM expects.
train_set = lgb.Dataset(x_train, label=np.argmax(y_train, axis=1))
test_set = lgb.Dataset(x_test, label=np.argmax(y_test, axis=1))
model = lgb.train(params=params, train_set=train_set, num_boost_round=100, valid_sets=[test_set])

# Step 3: Create the ART classifier

# clip_values declares the valid input feature range to the ART wrapper.
classifier = LightGBMClassifier(model=model, clip_values=(min_pixel_value, max_pixel_value))

# Step 4: Train the ART classifier

# The model has already been trained in step 2

# Step 5: Evaluate the ART classifier on benign test examples

predictions = classifier.predict(x_test)
# Fraction of test samples whose argmax prediction matches the argmax label.
accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test)
print('Accuracy on benign test examples: {}%'.format(accuracy * 100))