"x2": [0, 1, 1, 0, 1, 0],
"w": [2, 1, 2, 0.5, 2, 0.5],
'y': [0, 1, 2, 1, 2, 0]
})
df_test_multinomial = pd.DataFrame({
    'id': ["id4", "id4", "id5", "id6", "id5", "id6"],
    'x1': [12.0, 1000.0, -4.0, 0.0, -4.0, 0.0],
    "x2": [1, 1, 0, 1, 0, 1],
    "w": [1, 2, 0, 0.5, 0, 0.5],
    'y': [1, 2, 0, 1, 2, 0]
})
features = ["x1", "x2"]
learner_binary = xgb_classification_learner(features=features,
                                            target="y",
                                            learning_rate=0.1,
                                            num_estimators=20,
                                            extra_params={"max_depth": 4, "seed": 42},
                                            prediction_column="prediction",
                                            weight_column="w")
predict_fn_binary, pred_train_binary, log = learner_binary(df_train_binary)
pred_test_binary = predict_fn_binary(df_test_binary)
expected_col_train = df_train_binary.columns.tolist() + ["prediction"]
expected_col_test = df_test_binary.columns.tolist() + ["prediction"]
assert Counter(expected_col_train) == Counter(pred_train_binary.columns.tolist())
assert Counter(expected_col_test) == Counter(pred_test_binary.columns.tolist())
# Wraps the learner so a hyperparameter-search sampler can drive it through a
# `space` dict of candidate parameter values.
def param_train_fn(space, train_set):
    return xgb_classification_learner(features=["x"],
                                      target="target",
                                      learning_rate=space["learning_rate"],
                                      num_estimators=space["num_estimators"])(train_set)
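# A minimal usage sketch (the toy frame and values below are assumed for
# illustration, not part of the original tests): param_train_fn takes one
# sampled point from the search space, trains, and returns the usual
# (predict function, scored train set, train log) triple.
toy_df = pd.DataFrame({"x": [0.0, 1.0, 2.0, 3.0], "target": [0, 1, 0, 1]})
space_sample = {"learning_rate": 0.1, "num_estimators": 5}
predict_fn_toy, toy_scored, toy_log = param_train_fn(space_sample, toy_df)
assert "prediction" in toy_scored.columns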
assert pred_test_binary.prediction.max() < 1
assert pred_test_binary.prediction.min() > 0
assert (pred_test_binary.columns == pred_train_binary.columns).all()
# SHAP test
pred_shap = predict_fn_binary(df_test_binary, apply_shap=True)
assert "shap_values" in pred_shap.columns
assert "shap_expected_value" in pred_shap.columns
assert np.vstack(pred_shap["shap_values"]).shape == (4, 2)
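# Sketch (assumed, not part of the original tests): each entry of the
# "shap_values" column holds one SHAP value per feature, so the column can be
# expanded into a per-feature frame for inspection.
shap_matrix = np.vstack(pred_shap["shap_values"])
shap_by_feature = pd.DataFrame(shap_matrix,
                               columns=["shap_" + f for f in features],
                               index=pred_shap.index)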
# test multinomial case
learner_multinomial = xgb_classification_learner(features=features,
                                                 target="y",
                                                 learning_rate=0.1,
                                                 num_estimators=20,
                                                 extra_params={"max_depth": 2,
                                                               "seed": 42,
                                                               "objective": 'multi:softprob',
                                                               "num_class": 3},
                                                 prediction_column="prediction")
predict_fn_multinomial, pred_train_multinomial, log = learner_multinomial(df_train_multinomial)
pred_test_multinomial = predict_fn_multinomial(df_test_multinomial)
expected_col_train = df_train_multinomial.columns.tolist() + ["prediction_0", "prediction_1", "prediction_2",
                                                              "prediction"]
expected_col_test = df_test_multinomial.columns.tolist() + ["prediction_0", "prediction_1", "prediction_2",
                                                            "prediction"]
assert Counter(expected_col_train) == Counter(pred_train_multinomial.columns.tolist())
assert Counter(expected_col_test) == Counter(pred_test_multinomial.columns.tolist())
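# Sketch (assumption: with objective 'multi:softprob' the learner emits one
# probability column per class, as the expected columns above show). A hard
# class label can be recovered from the per-class columns directly:
class_cols = ["prediction_0", "prediction_1", "prediction_2"]
hard_labels = pred_test_multinomial[class_cols].values.argmax(axis=1)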
# --- Separate snippet: tail of the docstring and the per-bin training loop of
# an "octopus"-style XGBoost classification learner, which trains one model
# per value of `train_split_col`. The head of the docstring was truncated.

train_split_bins : list
    A list with the actual values of the categories from `train_split_col`. For example, if you split your
    training by tenure and the tenure column holds the integer values [1, 2, 3, ..., 12], passing that list
    trains 12 different models.

nthread : int
    Number of threads for the XGBoost learners.

target_column : str
    The name of the target column.

prediction_column : str
    The name of the column with the predictions from the model.
"""
train_fns = {b: xgb_classification_learner(features=features_by_bin[b],
                                           learning_rate=learning_rate_by_bin[b],
                                           num_estimators=num_estimators_by_bin[b],
                                           target=target_column,
                                           extra_params=assoc(extra_params_by_bin[b], 'nthread', nthread),
                                           prediction_column=prediction_column + "_bin_" + str(b))
             for b in train_split_bins}

train_sets = {b: train_set[train_set[train_split_col] == b]
              for b in train_split_bins}

train_results = {b: train_fns[b](train_sets[b])
                 for b in train_split_bins}
# each value of train_results is a 3-tuple: (predict function, scored train set, train log)
pred_fns = {b: train_results[b][0] for b in train_split_bins}
train_logs = {b: train_results[b][2] for b in train_split_bins}
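# A minimal sketch (assumed, not necessarily the learner's real predict
# function) of how the per-bin predict functions could be folded into a single
# one: route each row to its bin's model, then stitch the scored pieces back
# into the original row order (assumes a unique index).
def predict_per_bin(df):
    scored = [pred_fns[b](df[df[train_split_col] == b]) for b in train_split_bins]
    return pd.concat(scored).loc[df.index]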