import numpy as np
import xgboost as xgb

# step 2: Select Feature (the helper functions below come from the surrounding project)
data = extract_feature_and_label(data, feature_name_list=conf['feature_name'], label_name_list=conf['label_name'])
# step 3: Preprocess
train, test = divide_train_and_test(data, conf['training_set_proportion'])
train_x, train_y = data_transform_for_xgboost(train)
test_x, test_y = data_transform_for_xgboost(test)
# binarize the regression targets into class labels in {-1, 0, 1}
train_y = sign(train_y)
test_y = sign(test_y)
# keep only the samples whose label is +1 or -1
indices = find_all_indices(train_y, 1)
indices.extend(find_all_indices(train_y, -1))
train_x = np.array(train_x)[indices]
train_y = np.array(train_y)[indices]
# DMatrix for the native API; the sklearn wrapper below consumes the arrays directly
dtrain = xgb.DMatrix(train_x, train_y)
param = {
    'booster': 'gbtree',
    'silent': True,
    'eta': 0.01,
    'max_depth': 5,
    'gamma': 0.1,
    'objective': 'multi:softmax',
    'num_class': 3,
    'seed': 1000,
    'scale_pos_weight': 1
}
clf = xgb.XGBClassifier(**param)
if conf['use_previous_model'] is False:
    clf.fit(train_x, train_y)
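# when use_previous_model is True, the snippet presumably reloads a saved
# model instead of retraining; a minimal sketch, assuming a hypothetical
# 'model_path' entry in conf and a model saved earlier with save_model:
else:
    clf.load_model(conf['model_path'])  # restore the previously trained booster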
def is_correctly_constrained(learner):
    # probe the learner on a grid: predictions should be monotonically
    # increasing in feature 0 and monotonically decreasing in feature 1
    n = 100
    variable_x = np.linspace(0, 1, n).reshape((n, 1))
    fixed_xs_values = np.linspace(0, 1, n)
    for i in range(n):
        fixed_x = fixed_xs_values[i] * np.ones((n, 1))
        monotonically_increasing_x = np.column_stack((variable_x, fixed_x))
        monotonically_increasing_dset = xgb.DMatrix(monotonically_increasing_x)
        monotonically_increasing_y = learner.predict(
            monotonically_increasing_dset
        )
        monotonically_decreasing_x = np.column_stack((fixed_x, variable_x))
        monotonically_decreasing_dset = xgb.DMatrix(monotonically_decreasing_x)
        monotonically_decreasing_y = learner.predict(
            monotonically_decreasing_dset
        )
        if not (
            is_increasing(monotonically_increasing_y) and
            is_decreasing(monotonically_decreasing_y)
        ):
            return False
    return True
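# is_increasing / is_decreasing are not defined in this excerpt; minimal
# sketches, plus a usage example built on xgboost's real monotone_constraints
# parameter (the synthetic data below is hypothetical):
def is_increasing(y):
    # non-strictly increasing: no consecutive pair ever decreases
    return np.count_nonzero(np.diff(y) < 0.0) == 0

def is_decreasing(y):
    # non-strictly decreasing: no consecutive pair ever increases
    return np.count_nonzero(np.diff(y) > 0.0) == 0

rng = np.random.RandomState(0)
x_demo = rng.rand(1000, 2)
# target increases with feature 0 and decreases with feature 1
y_demo = x_demo[:, 0] - x_demo[:, 1] + 0.1 * rng.randn(1000)
# '(1, -1)': constrain feature 0 to be increasing and feature 1 decreasing
constrained = xgb.train(
    {'tree_method': 'hist', 'monotone_constraints': '(1, -1)'},
    xgb.DMatrix(x_demo, label=y_demo),
    num_boost_round=100,
)
assert is_correctly_constrained(constrained)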
# overwrite 10 random entries per feature with a new value so each feature
# gains one more distinct level, then retrain and re-check the training AUC
for j in range(X2.shape[1]):
    for i in np.random.choice(X2.shape[0], size=10, replace=False):
        X2[i, j] = 2
dtrain3 = xgb.DMatrix(X2, label=y2)
res = {}
xgb.train(param, dtrain3, 10, [(dtrain3, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
assert res['train']['auc'][0] >= 0.85

# repeat with a fourth distinct value per feature
for j in range(X2.shape[1]):
    for i in np.random.choice(X2.shape[0], size=10, replace=False):
        X2[i, j] = 3
dtrain4 = xgb.DMatrix(X2, label=y2)
res = {}
xgb.train(param, dtrain4, 10, [(dtrain4, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
assert res['train']['auc'][0] >= 0.85
# fail-safe test for max_bin=2
param = {
    'objective': 'binary:logistic',
    'tree_method': 'hist',
    'grow_policy': 'depthwise',
    'max_depth': 2,
    'eval_metric': 'auc',
    'max_bin': 2
}
res = {}
xgb.train(param, dtrain2, 10, [(dtrain2, 'train')], evals_result=res)
assert self.non_decreasing(res['train']['auc'])
assert res['train']['auc'][0] >= 0.85
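# self.non_decreasing (used throughout these tests) is assumed to be a tiny
# helper on the test class; a minimal sketch:
def non_decreasing(self, values):
    # True when each AUC reading is at least as high as its predecessor
    return all(a <= b for a, b in zip(values, values[1:]))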
def test_xgboost(clf, X, y):
    ddata = xgb.DMatrix(data=X, label=y, nthread=-1)
    with Timer() as t:
        y_pred = clf.predict(ddata)
    return y_pred, t.interval
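# Timer is not defined in this snippet; a minimal sketch of a timing context
# manager exposing the .interval attribute the code above expects:
import time

class Timer:
    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, *exc):
        # elapsed wall-clock seconds for the with-block
        self.interval = time.perf_counter() - self.start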
def test_predict_nopickle(self):
    X, y = makeXy()
    dm = xgb.DMatrix(X, label=y)
    watchlist = [(dm, 'train')]
    res = {}
    param = {
        "objective": "binary:logistic",
        "predictor": "gpu_predictor",
        'eval_metric': 'auc',
    }
    # n_estimators is a module-level constant in the surrounding test suite
    bst = xgb.train(param, dm, n_estimators,
                    evals=watchlist, evals_result=res)
    assert self.non_decreasing(res["train"]["auc"])

    print("Before model.predict on GPU")
    sys.stdout.flush()
    tmp = time.time()
    gpu_pred = bst.predict(dm, output_margin=True)
    print(gpu_pred)
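    # a hedged sketch of the usual follow-up in such tests: switch the same
    # booster to the CPU predictor and check both margin outputs agree
    # (predictor names as in older XGBoost releases)
    bst.set_param({"predictor": "cpu_predictor"})
    cpu_pred = bst.predict(dm, output_margin=True)
    np.testing.assert_allclose(cpu_pred, gpu_pred, rtol=1e-3)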
def test(self, data):
    if isinstance(data.X_test, np.ndarray):
        data.X_test = pd.DataFrame(data=data.X_test,
                                   columns=np.arange(0, data.X_test.shape[1]),
                                   index=np.arange(0, data.X_test.shape[0]))
    data.X_test.columns = [str(i) for i in range(0, data.X_test.shape[1])]
    dtest = xgb.DMatrix(data.X_test, data.y_test)
    return self.model.predict(dtest)
param['eval_metric'] = 'auc'
param['max_depth'] = 5
param['eta'] = 0.3
param['silent'] = 0
# 'gpu_exact' was removed in newer XGBoost releases; 'gpu_hist' (or
# tree_method='hist' with device='cuda') is the modern equivalent
param['tree_method'] = 'gpu_exact'
num_round = 20
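# X, y, and the param dict are assumed to come from earlier in the script;
# a hypothetical stand-in so the cross-validation loop below runs end-to-end:
import time
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
X, y = make_classification(n_samples=10000, n_features=20, random_state=7)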
skf = StratifiedKFold(n_splits=5)
for i, (train, test) in enumerate(skf.split(X, y)):
    dtrain = xgb.DMatrix(X[train], label=y[train])
    tmp = time.time()
    bst = xgb.train(param, dtrain, num_round)
    boost_time = time.time() - tmp
    res = bst.eval(xgb.DMatrix(X[test], label=y[test]))
    print("Fold {}: {}, Boost Time {}".format(i, res, str(boost_time)))
    del bst
#!/usr/bin/python
import xgboost as xgb
##
# this script demonstrates how to fit a generalized linear model in xgboost
# basically, we use a linear model instead of trees as the booster
##
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
# change booster to gblinear, so that we are fitting a linear model
# alpha is the L1 regularizer
# lambda is the L2 regularizer
# you can also set lambda_bias which is L2 regularizer on the bias term
param = {'silent': 1, 'objective': 'binary:logistic', 'booster': 'gblinear',
         'alpha': 0.0001, 'lambda': 1}
# normally, you do not need to set eta (step_size)
# XGBoost uses a parallel coordinate descent algorithm (shotgun);
# parallelization can affect convergence in certain cases, so setting eta
# to a smaller value, e.g. 0.5, can make the optimization more stable
# param['eta'] = 1
##
# the rest of settings are the same
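##
# a minimal sketch of that remainder, following the usual pattern of the
# xgboost demos (the watchlist and round count here are assumptions):
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 4
bst = xgb.train(param, dtrain, num_round, evals=watchlist)
# report the binary classification error on the test set
preds = bst.predict(dtest)
labels = dtest.get_label()
err = sum(1 for p, l in zip(preds, labels) if int(p > 0.5) != l) / float(len(preds))
print('error=%f' % err)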
def convert(self, X, y=None):
    # wrap arrays / DataFrames in a DMatrix, attaching labels when given;
    # anything with a .values attribute (e.g. pandas) is unwrapped first
    if y is None:
        if isinstance(X, xgb.DMatrix):
            return X
        if hasattr(X, 'values'):
            return xgb.DMatrix(X.values)
        return xgb.DMatrix(X)
    else:
        if hasattr(X, 'values'):
            return xgb.DMatrix(X.values, y.values, missing=np.nan)
        return xgb.DMatrix(X, y, missing=np.nan)
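# usage sketch for convert (hypothetical pandas inputs):
# dtrain = self.convert(X_train_df, y_train_series)  # labeled DMatrix for training
# dpred = self.convert(X_test_df)                    # unlabeled DMatrix for prediction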
train = pd.read_csv('../../data/train.csv').drop(["EventId", "Weight"], axis=1)
val = pd.read_csv('../../data/test.csv').drop(["EventId", "Weight"], axis=1)
# -999. marks missing values in the Higgs dataset; map them to NaN and
# encode the 's' (signal) / 'b' (background) labels as 1 / 0
train.replace(to_replace=-999., value=np.nan, inplace=True)
train.replace(to_replace='s', value=1, inplace=True)
train.replace(to_replace='b', value=0, inplace=True)
val.replace(to_replace=-999., value=np.nan, inplace=True)
val.replace(to_replace='s', value=1, inplace=True)
val.replace(to_replace='b', value=0, inplace=True)
train_y = train.Label
train_X = train.drop('Label', axis=1)
val_y = val.Label
val_X = val.drop('Label', axis=1)
dtrain = xgb.DMatrix(train_X, label=train_y)
dval = xgb.DMatrix(val_X, label=val_y)
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eta': 0.3,
    'max_depth': 6,
    # num_boost_round inside the param dict is ignored by xgb.train;
    # pass it as the num_boost_round argument instead
    'num_boost_round': 200,
    'scale_pos_weight': 1.0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'colsample_bylevel': 1.0,
    # min_sample_split is a scikit-learn name, not an XGBoost parameter;
    # XGBoost ignores it (min_child_weight plays a similar role)
    'min_sample_split': 50,
    'min_child_weight': 1,
    'lambda': 10,
    'gamma': 1,
    'eval_metric': "auc",