params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
          'objective': 'binary:logistic', 'eval_metric': ['auc']}
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
            as_pandas=True, metrics='error')
assert 'eval_metric' in params
assert 'auc' not in cv.columns[0]
assert 'error' in cv.columns[0]

cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
            as_pandas=True, metrics=['error'])
assert 'eval_metric' in params
assert 'auc' not in cv.columns[0]
assert 'error' in cv.columns[0]

params = list(params.items())
cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
            as_pandas=True, metrics=['error'])
assert isinstance(params, list)
assert 'auc' not in cv.columns[0]
assert 'error' in cv.columns[0]
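The asserts above check that the `metrics` argument to `xgb.cv` takes precedence over the `eval_metric` already present in `params`, and that the caller's dict is not mutated. A minimal, self-contained sketch of the same behaviour, assuming xgboost and NumPy are available; the synthetic data stands in for whatever `dm` was built from:

import numpy as np
import xgboost as xgb

# illustrative data only
X = np.random.rand(100, 5)
y = np.random.randint(2, size=100)
dm = xgb.DMatrix(X, label=y)

params = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic',
          'eval_metric': ['auc']}
cv = xgb.cv(params, dm, num_boost_round=5, nfold=5,
            as_pandas=True, metrics='error')
print(cv.columns.tolist())    # columns report 'error', not 'auc'
print(params['eval_metric'])  # the caller's dict is untouched: ['auc']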
def modelfit(self, performCV=True, useTrainCV=False, TrainCVFolds=5,
             early_stopping_rounds=20, show_progress=True, printTopN='all'):
    if useTrainCV:
        xgb_param = self.alg.get_xgb_params()
        if self.num_class > 2:
            xgb_param['num_class'] = self.num_class
        # Use xgb.cv to find the optimal number of boosting rounds,
        # then update the estimator accordingly.
        cvresult = xgb.cv(xgb_param, self.xgtrain,
                          num_boost_round=self.alg.get_params()['n_estimators'],
                          nfold=self.cv_folds, metrics=self.scoring_metric_xgb,
                          early_stopping_rounds=early_stopping_rounds,
                          show_progress=show_progress)
        self.alg.set_params(n_estimators=cvresult.shape[0])
        print(self.alg.get_params())

    obj = self.alg.fit(self.data_train[self.predictors], self.data_train[self.target],
                       eval_metric=self.eval_metric)

    # Plot feature importance
    # self.set_feature_importance()
    self.feature_imp = pd.Series(self.alg.get_booster().get_fscore()).sort_values(ascending=False)
    num_print = len(self.feature_imp)
    if printTopN != 'all':
        num_print = min(printTopN, len(self.feature_imp))
    # plot the top-N most important features
    self.feature_imp.iloc[:num_print].plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show(block=False)
def modelfit(alg, dtrain_x, dtrain_y, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """
    :param alg: the initial model
    :param dtrain_x: training data X
    :param dtrain_y: training labels y
    :param useTrainCV: whether to use xgb.cv to determine the best n_estimators
    :param cv_folds: number of cross-validation folds
    :param early_stopping_rounds: stop if the eval_metric has not improved after this many rounds
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain_x, dtrain_y)
        cv_result = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                           nfold=cv_folds, metrics='auc', early_stopping_rounds=early_stopping_rounds)
        # print(cv_result)
        alg.set_params(n_estimators=cv_result.shape[0])

    # train on the full training data
    alg.fit(dtrain_x, dtrain_y, eval_metric='auc')

    # predict on the training data
    train_y_pre = alg.predict(dtrain_x)
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain_y, train_y_pre))

    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importance')
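A hedged usage sketch for the function above. XGBClassifier, make_classification and the synthetic data are illustrative assumptions, not part of the original snippet, and it assumes an xgboost release that still accepts eval_metric in fit():

import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
train_x = pd.DataFrame(X, columns=['f%d' % i for i in range(10)])
train_y = pd.Series(y)

clf = XGBClassifier(n_estimators=500, learning_rate=0.1, max_depth=3,
                    objective='binary:logistic')
modelfit(clf, train_x, train_y, useTrainCV=True, cv_folds=5, early_stopping_rounds=50)
plt.show()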
param = {
    'eta': self.eta,
    'gamma': self.gamma,
    'max_depth': self.max_depth,
    'min_child_weight': self.min_child_weight,
    'subsample': self.subsample,
    'colsample_bytree': self.colsample_bytree,
    'max_delta_step': self.max_delta_step,
    'l': self.l,
    'alpha': self.alpha,
    'lambda_bias': self.lambda_bias,
    'objective': self.objective,
    'eval_metric': self.eval_metric,
    'seed': self.seed,
    'num_class': self.num_class,
}
results = xgb.cv(param, X, self.num_round, cv)
return results
#!/usr/bin/python
import numpy as np
import xgboost as xgb
### load data in do training
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
num_round = 2
print ('running cross validation')
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'error'}, seed=0)
print ('running cross validation, disable standard deviation display')
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'error'}, seed=0, show_stdv=False)
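The calls above only print the per-round results; xgb.cv also returns them, as a pandas DataFrame when as_pandas=True (the default when pandas is installed), which is usually more convenient to work with. A small sketch reusing param, dtrain and num_round from above:

res = xgb.cv(param, dtrain, num_round, nfold=5, metrics={'error'}, seed=0)
print(res)                           # per-round mean/std for train and test error
print(res['test-error-mean'].min())  # best mean test error across rounds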
print ('running cross validation, with preprocessing function')
# define the preprocessing function
# used to return the preprocessed training, test data, and parameter
# we can use this to do weight rescale, etc.
# as an example, we try to set scale_pos_weight
def fpreproc(dtrain, dtest, param):
    label = dtrain.get_label()
    ratio = float(np.sum(label == 0)) / np.sum(label == 1)
    param['scale_pos_weight'] = ratio
    wtrain = dtrain.get_weight()
    wtest = dtest.get_weight()
    sum_weight = sum(wtrain) + sum(wtest)
    wtrain *= sum_weight / sum(wtrain)
    wtest *= sum_weight / sum(wtest)
    dtrain.set_weight(wtrain)
    dtest.set_weight(wtest)
    return (dtrain, dtest, param)

# do cross validation, for each fold
# the dtrain, dtest, param will be passed into fpreproc
# then the return value of fpreproc will be used to generate
# results of that fold
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'ams@0.15', 'auc'}, seed=0, fpreproc=fpreproc)

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.8,
    'silent': 1,
    'subsample': 0.6,
    'learning_rate': 0.01,
    # 'booster': 'gblinear',  # default is gbtree
    'objective': 'reg:linear',
    'max_depth': 1,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'rmse',
}
res = xgb.cv(xgb_params, dtrain, num_boost_round=10000, nfold=5, seed=seed, stratified=False,
             early_stopping_rounds=100, verbose_eval=10, show_stdv=True)
best_nrounds = res.shape[0] - 1
cv_mean = res.iloc[-1, 0]
cv_std = res.iloc[-1, 1]
print('Ensemble-CV: {0}+{1}'.format(cv_mean, cv_std))
title_name = 'xgb.cv'
two_sigma_fin_mod_tools.predicted_vs_actual_y_xgb(xgb, best_nrounds, xgb_params, x_train, test_data,
                                                  y_train, y_test_data, title_name)
# plt.show()
gbdt = xgb.train(xgb_params, dtrain, best_nrounds)
output_xgb_cv = gbdt.predict(dtest)
# Averaging the output using four different machine learning estimators
output = (output_feature_selection_lasso + output_xgb_cv) / 2.0
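The column order of the DataFrame returned by xgb.cv has differed between xgboost releases, so reading the final scores by column name (the `test-<metric>-mean` / `test-<metric>-std` pattern, here with rmse) is more robust than positional indexing. A hedged alternative to the `res.iloc[-1, 0]` / `res.iloc[-1, 1]` lines above:

cv_mean = res['test-rmse-mean'].iloc[-1]
cv_std = res['test-rmse-std'].iloc[-1]
print('Ensemble-CV: {0}+{1}'.format(cv_mean, cv_std))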
def xgb_evaluate(min_child_weight,
                 colsample_bytree,
                 max_depth,
                 subsample,
                 gamma,
                 alpha):
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    params['alpha'] = max(alpha, 0)

    cv_result = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=5,
                       seed=random_state,
                       feval=evalerror,
                       callbacks=[xgb.callback.early_stop(50)])

    # negate so that maximizing this score minimizes the CV MAE
    return -cv_result['test-mae-mean'].values[-1]
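A hedged sketch of how a function like xgb_evaluate is typically wired into an optimizer such as bayes_opt's BayesianOptimization. The package choice, the bounds, and the `params`, `xgtrain`, `num_rounds`, `random_state` and `evalerror` globals are assumptions carried over from the snippet, not part of the original code:

from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(
    f=xgb_evaluate,
    pbounds={'min_child_weight': (1, 20),
             'colsample_bytree': (0.1, 1),
             'max_depth': (3, 10),
             'subsample': (0.5, 1),
             'gamma': (0, 10),
             'alpha': (0, 10)},
    random_state=random_state)
optimizer.maximize(init_points=5, n_iter=25)
print(optimizer.max)  # best (negated) CV MAE and the parameters that produced it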
def cv(self, grid_params):
    """
    Calculate the cross-validation (CV) score for the XGBoost model using the
    ``grid_params`` parameters, and store the resulting dataframe in ``self.cv_results``.
    """
    xgdmat = xgb.DMatrix(Base.train_X, Base.train_y)
    self.cv_results = xgb.cv(
        params=grid_params, dtrain=xgdmat,
        num_boost_round=1000, nfold=5,
        metrics=['error'], early_stopping_rounds=20)
    self.error = self.cv_results['test-error-mean'].iloc[-1]
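A hedged usage sketch for the method above (the enclosing class, a `model` instance of it, and `Base.train_X`/`Base.train_y` are assumed to exist; the parameter values below are illustrative only):

grid_params = {'max_depth': 3, 'eta': 0.1, 'objective': 'binary:logistic'}
model.cv(grid_params)
print(model.cv_results.tail())  # per-round CV means and standard deviations
print(model.error)              # mean test error at the last retained boosting round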