How to use the `lightgbm.train` function in LightGBM

To help you get started, we’ve selected a few lightgbm examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Source: github.com/dmitryikh/leaves — testdata/lg_dart_breast_cancer.py (view on GitHub)
"""Train a LightGBM DART binary classifier on the breast-cancer dataset and
export reference artifacts (text model, predictions, test features, JSON dump)
for cross-checking against other gradient-boosting implementations."""
import json

import lightgbm as lgb
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Binary classification task: labels are 0/1, features are numeric.
X, y = datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

n_estimators = 10
d_train = lgb.Dataset(X_train, label=y_train)
params = {
    'boosting_type': 'dart',  # DART: boosting with tree dropout
    'objective': 'binary',
}
clf = lgb.train(params, d_train, n_estimators)
y_pred = clf.predict(X_test)  # positive-class probabilities for 'binary'

clf.save_model('lg_dart_breast_cancer.model')  # save the model in txt format
np.savetxt('lg_dart_breast_cancer_true_predictions.txt', y_pred)
np.savetxt('breast_cancer_test.tsv', X_test, delimiter='\t')
d = clf.dump_model()  # full model structure as a plain dict
with open('lg_dart_breast_cancer.json', 'w') as fout:
    json.dump(d, fout, indent=1)
Source: github.com/aduhovnik/sdsj2018-automl — model.py (view on GitHub)
def get_params_to_params_estimation(x: pd.DataFrame, y: pd.Series, config: Config):
    """Pick the best candidate parameter set via a quick hold-out evaluation.

    Every candidate produced by ``get_sensitive_and_impervious_params`` is
    trained for ``COLD_TOUCH_ITERATIONS`` rounds on a fixed 67/33 split and
    scored on the hold-out part; the best-scoring candidate is returned.
    """
    candidates = get_sensitive_and_impervious_params(config)

    x_fit, x_hold, y_fit, y_hold = train_test_split(x, y, test_size=0.33, random_state=42)

    hold_scores = []
    for candidate in candidates:
        booster = lgb.train(candidate, lgb.Dataset(x_fit, label=y_fit), COLD_TOUCH_ITERATIONS)
        hold_scores.append(get_score(y_hold, booster.predict(x_hold), config))

    # Classification mode ('c...') maximizes the score; otherwise minimize.
    pick = np.argmax if config['mode'][0] == 'c' else np.argmin
    best_params = candidates[pick(hold_scores)]
    log('Estimated params: {}'.format(best_params))
    return best_params
Source: github.com/jiuxianghedonglu/Context-Aware-Multi-Modal-Transportation-Recommendation — code/gbdt.py (view on GitHub)
# NOTE(review): excerpt starts mid-function — the enclosing def and the opening
# of the `lgb_paras` dict are above this view, and the first line lost its
# leading indentation in the paste.
'lambda_l2': 10,
        'num_class': 12,  # multiclass target with 12 classes
        'seed': 2019,
        'feature_fraction': 0.8,  # column subsampling per tree
        'bagging_fraction': 0.8,  # row subsampling
        'bagging_freq': 4,        # re-bag every 4 iterations
    }
    # Columns LightGBM should treat as categorical (mode ids and time fields).
    cate_cols = ['max_dist_mode', 'min_dist_mode', 'max_price_mode',
                 'min_price_mode', 'max_eta_mode', 'min_eta_mode', 'first_mode', 'weekday', 'hour']
    scores = []        # per-fold weighted F1 on the validation split
    result_proba = []  # per-fold class-probability predictions on test_x
    # kfold, train_x, train_y, test_x and eval_f come from outside this excerpt.
    for tr_idx, val_idx in kfold.split(train_x, train_y):
        tr_x, tr_y, val_x, val_y = train_x.iloc[tr_idx], train_y[tr_idx], train_x.iloc[val_idx], train_y[val_idx]
        train_set = lgb.Dataset(tr_x, tr_y, categorical_feature=cate_cols)
        val_set = lgb.Dataset(val_x, val_y, categorical_feature=cate_cols)
        # Large round cap; early stopping on the fold's validation set decides
        # the effective number of trees (best_iteration).
        lgb_model = lgb.train(lgb_paras, train_set,
                              valid_sets=[val_set], early_stopping_rounds=50, num_boost_round=40000, verbose_eval=50, feval=eval_f)
        # Hard class labels from the probability matrix for fold scoring.
        val_pred = np.argmax(lgb_model.predict(
            val_x, num_iteration=lgb_model.best_iteration), axis=1)
        val_score = f1_score(val_y, val_pred, average='weighted')
        result_proba.append(lgb_model.predict(
            test_x, num_iteration=lgb_model.best_iteration))
        scores.append(val_score)
    print('cv f1-score: ', np.mean(scores))
    # Soft-voting ensemble: average fold probabilities, then take the argmax.
    pred_test = np.argmax(np.mean(result_proba, axis=0), axis=1)
    return pred_test
Source: github.com/castorini/anserini — src/main/python/ecir2019_ccrf/rerank.py (view on GitHub)
# NOTE(review): excerpt starts mid-function — an earlier `classifier` branch
# (sklearn-style estimator) sits above this view, and the first line lost its
# leading indentation in the paste.
clf.fit(X_train, y_train)
        # Probability of the positive class (column 1 of predict_proba).
        y_test = clf.predict_proba(X_test)[:,1]
        return y_test
    elif classifier == 'lgb':
        # Fixed LightGBM configuration for binary relevance scoring.
        param = {
            'num_leaves':15,
            'num_iterations':100,  # number of boosting rounds
            'max_depth': 5,
            'objective':'binary',
            'is_unbalance': True,  # reweight classes to handle imbalance
            'metric': ['auc', 'binary_logloss'],
            'verbose': -1,         # silence LightGBM logging
            'seed': 848
        }
        train_data = lgb.Dataset(X_train, label=y_train)
        clf = lgb.train(param, train_data)
        # Booster.predict returns positive-class probabilities for 'binary'.
        y_test = clf.predict(X_test)
        return y_test
Source: github.com/closest-git/LiteMORT — python-package/case_ashrae_divide.py (view on GitHub)
# NOTE(review): excerpt starts mid-function — site_id, kf, params, models,
# cv_scores, isMORT and friends are defined above this view, and the first
# line lost its leading indentation in the paste.
score = 0

    # K-fold CV over one site's rows; out-of-fold predictions are collected
    # into y_pred_train_site so a site-level CV RMSE can be printed at the end.
    for fold, (train_index, valid_index) in enumerate(kf.split(X_train_site, y_train_site)):
        X_train, X_valid = X_train_site.loc[train_index, all_features], X_train_site.loc[valid_index, all_features]
        y_train, y_valid = y_train_site.iloc[train_index], y_train_site.iloc[valid_index]

        if isMORT:
            # 667 presumably a LiteMORT debug verbosity level — TODO confirm.
            params['verbose'] = 667 if site_id==0 and fold == 0 else 1
            merge_datas = []
            model = LiteMORT(params,merge_infos=merge_infos)
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], categorical_feature=categorical_features)
        else:
            dtrain = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
            dvalid = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical_features)
            watchlist = [dtrain, dvalid]
            model = lgb.train(params, train_set=dtrain, num_boost_round=num_rounds, valid_sets=watchlist, verbose_eval=verbose_eval,
                                  early_stopping_rounds=early_stop)
        models[site_id].append(model)
        y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
        y_pred_train_site[valid_index] = y_pred_valid

        rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
        print("Site Id:", site_id, ", Fold:", fold + 1, ", RMSE:", rmse)
        # Running average of fold RMSEs (cv is the fold count).
        score += rmse / cv
        # Free fold data before the next iteration to keep peak memory down.
        del  X_train, X_valid,y_train, y_valid
        gc.collect()
        #input("......")

    cv_scores["site_id"].append(site_id)
    cv_scores["cv_score"].append(score)

    print("\nSite Id:", site_id, ", CV RMSE:", np.sqrt(mean_squared_error(y_train_site, y_pred_train_site)), "\n")
Source: github.com/Genpeng/datagrand-text-classification — src/01_lgb-300d-embedding/lgb_char_300d_tuning.py (view on GitHub)
# NOTE(review): excerpt starts mid-function — the enclosing def, the opening of
# the `params` dict, and lgb_train/lgb_val/X_train/y_train come from above this
# view; the first line lost its leading indentation in the paste.
'num_class': num_classes,
        'metric': 'multi_logloss',
        'num_leaves': 15,
        'max_depth': 4,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,  # column subsampling per tree
        # 'bagging_fraction': 0.8,
        # 'bagging_freq': 5,
        'verbose': 0
    }
    # Round cap; early stopping on lgb_val decides the effective tree count.
    num_boost_round = 2000
    # One feature per embedding dimension: embed_0 .. embed_{EMBEDDING_SIZE-1}.
    feature_names = ['embed_' + str(col) for col in range(EMBEDDING_SIZE)]

    print("Start training...")
    start_time = time.time()
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=num_boost_round,
                    valid_sets=lgb_val,
                    feature_name=feature_names,
                    early_stopping_rounds=30)
    print("Training finished! ^_^")
    print("Total seconds: %ds" % (time.time() - start_time))

    # Calculate the f1 score and accuracy of training and validation set
    probs_train = gbm.predict(X_train, num_iteration=gbm.best_iteration)
    preds_train = np.argmax(probs_train, axis=1)  # class probabilities -> labels
    score_train = f1_score(y_train, preds_train, average='weighted')
    acc_train = accuracy_score(y_train, preds_train)
    print("The f1 score of training set after %d epochs is: %f" % (gbm.best_iteration, score_train))
    print("The accuracy of training set after %d epochs is: %f" % (gbm.best_iteration, acc_train))
Source: github.com/drivendataorg/zamba — zamba/models/cnnensemble/src/second_stage_lgb.py (view on GitHub)
# NOTE(review): excerpt starts mid-function — model_name and fold come from the
# enclosing scope, and the first line lost its leading indentation in the paste.
X, y, video_ids = load_train_data(model_name, fold)

    # y appears to be one-hot encoded; argmax recovers integer class indices
    # for LightGBM's multiclass objective — TODO confirm against load_train_data.
    y_cat = np.argmax(y, axis=1)
    print(X.shape, y.shape)
    print(np.unique(y_cat))

    train_data = lgb.Dataset(X, label=y_cat)

    param = {'num_leaves': 50,
             'objective': 'multiclass',
             'max_depth': 5,
             'learning_rate': .1,
             'max_bin': 200,
             'num_class': NB_CAT,  # NB_CAT classes (module-level constant)
             'metric': ['multi_logloss']}
    # Fixed 100 rounds, no validation set: this trains on the full fold data.
    model = lgb.train(param, train_data,  num_boost_round=100)
    pickle.dump(model, open(Path(__file__).parent.parent / f"output/lgb_{model_name}_{fold}_full.pkl", "wb"))
Source: github.com/lopuhin/kaggle-kuzushiji-2019 — kuzushiji/classify/level2.py (view on GitHub)
# NOTE(review): excerpt starts mid-function — train_features, valid_features,
# lr and num_boost_round come from the enclosing scope, and the first line lost
# its leading indentation in the paste.
train_data = lgb.Dataset(train_features, train_y)
    # reference= lets the validation set reuse the training set's bin mappings.
    valid_data = lgb.Dataset(valid_features, valid_y, reference=train_data)
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'learning_rate': lr,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'feature_fraction': 0.9,
        'min_data_in_leaf': 20,
        'num_leaves': 41,
        'scale_pos_weight': 1.2,  # mild up-weighting of the positive class
        'lambda_l2': 1,
    }
    print(params)
    # Returns the trained Booster; early stopping on valid_data picks the
    # effective number of rounds.
    return lgb.train(
        params=params,
        train_set=train_data,
        num_boost_round=num_boost_round,
        early_stopping_rounds=20,
        valid_sets=[valid_data],
        verbose_eval=10,
    )
Source: github.com/ptl2r/ptl2r.github.io — org/archive/ltr_tree/lambdamart/lambdaMART.py (view on GitHub)
# NOTE(review): excerpt starts mid-method — do_validation, train_set, the
# file_* paths, para_dict/eval_dict, data_id, save_dir and fold_k all come from
# earlier in the enclosing method; the first line lost its leading indentation
# in the paste.
if do_validation:
            # Ranking data: svmlight features/labels plus per-query group sizes.
            x_valid, y_valid = load_svmlight_file(file_vali_data)
            group_valid = np.loadtxt(file_vali_group)
            valid_set = Dataset(data=x_valid, label=y_valid, group=group_valid)

        x_test, y_test = load_svmlight_file(file_test_data)
        group_test = np.loadtxt(file_test_group)
        #test_set = Dataset(data=x_test, label=y_test, group=group_test)

        params = self.get_paras_LightGBM(para_dict=para_dict, eval_dict=eval_dict)

        if do_validation:
            # With validation: early stopping (100 rounds patience) on valid_set.
            gbm = lgb.train(params=params, train_set=train_set, valid_sets=[valid_set], verbose_eval=10, early_stopping_rounds=100)
        else:
            # Without validation: fixed 100 boosting rounds.
            gbm = lgb.train(params=params, train_set=train_set, verbose_eval=10, num_boost_round=100)


        # Yahoo L2R sets use a single model file; other sets are per-fold.
        if data_id in YAHOO_L2R:
            model_file = save_dir+'model.txt'
        else:
            model_file = save_dir+'_'.join(['fold', str(fold_k), 'model'])+'.txt'

        gbm.save_model(model_file)

        y_pred = gbm.predict(x_test)  # fold-wise prediction

        return y_test, group_test, y_pred