# Secure your code as it's written. Use Snyk Code to scan source code in minutes — no build needed — and fix issues immediately.
def catboost_regressor(pandas_data, catboost_params):
    """Build a CatBoostRegressor from the given params and fit it.

    The model is trained on ``pandas_data`` against the fixed two-element
    target ``[1, 0]`` (so the data is expected to hold exactly two rows).
    Returns the fitted regressor.
    """
    model = CatBoostRegressor(**catboost_params)
    return model.fit(pandas_data, [1, 0])
def test_catboost():
    """Smoke-test SHAP's TreeExplainer on a CatBoost regressor.

    Trains a small model on the Boston housing dataset with one categorical
    feature ("RAD"), then asserts the local-accuracy property: SHAP values
    plus the expected value must reproduce the model's raw predictions.
    Skips (returns None) when catboost is not installed.
    """
    try:
        import catboost
    except ImportError:  # only skip when catboost itself is missing
        print("Skipping test_catboost!")
        return
    import shap

    # Train a catboost model.
    X, y = shap.datasets.boston()
    # np.int was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # int is the supported spelling and is what np.int aliased.
    X["RAD"] = X["RAD"].astype(int)
    model = catboost.CatBoostRegressor(iterations=300, learning_rate=0.1, random_seed=123)
    p = catboost.Pool(X, y, cat_features=["RAD"])
    model.fit(p, verbose=False, plot=False)

    # Explain the model's predictions using SHAP values.
    ex = shap.TreeExplainer(model)
    shap_values = ex.shap_values(p)
    predicted = model.predict(X)

    # Local accuracy: per-row SHAP contributions + base value == prediction.
    assert np.abs(shap_values.sum(1) + ex.expected_value - predicted).max() < 1e-6, \
        "SHAP values don't sum to model output!"
def test_catboost():
    """Smoke-test SHAP's TreeExplainer on a CatBoost regressor.

    Trains a small model on the Boston housing dataset with one categorical
    feature ("RAD"), then asserts the local-accuracy property: SHAP values
    plus the expected value must reproduce the model's raw predictions.
    Skips (returns None) when catboost is not installed.
    """
    try:
        import catboost
    except ImportError:  # only skip when catboost itself is missing
        print("Skipping test_catboost!")
        return
    import shap

    # Train a catboost model.
    X, y = shap.datasets.boston()
    # np.int was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # int is the supported spelling and is what np.int aliased.
    X["RAD"] = X["RAD"].astype(int)
    model = catboost.CatBoostRegressor(iterations=300, learning_rate=0.1, random_seed=123)
    p = catboost.Pool(X, y, cat_features=["RAD"])
    model.fit(p, verbose=False, plot=False)

    # Explain the model's predictions using SHAP values.
    ex = shap.TreeExplainer(model)
    shap_values = ex.shap_values(p)
    predicted = model.predict(X)

    # Local accuracy: per-row SHAP contributions + base value == prediction.
    assert np.abs(shap_values.sum(1) + ex.expected_value - predicted).max() < 1e-6, \
        "SHAP values don't sum to model output!"
def test_catboost():
    """Smoke-test SHAP's TreeExplainer on a CatBoost regressor.

    Trains a small model on the Boston housing dataset with one categorical
    feature ("RAD"), then asserts the local-accuracy property: SHAP values
    plus the expected value must reproduce the model's raw predictions.
    Skips (returns None) when catboost is not installed.
    """
    try:
        import catboost
    except ImportError:  # only skip when catboost itself is missing
        print("Skipping test_catboost!")
        return
    import shap

    # Train a catboost model.
    X, y = shap.datasets.boston()
    # np.int was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # int is the supported spelling and is what np.int aliased.
    X["RAD"] = X["RAD"].astype(int)
    model = catboost.CatBoostRegressor(iterations=300, learning_rate=0.1, random_seed=123)
    p = catboost.Pool(X, y, cat_features=["RAD"])
    model.fit(p, verbose=False, plot=False)

    # Explain the model's predictions using SHAP values.
    ex = shap.TreeExplainer(model)
    shap_values = ex.shap_values(p)
    predicted = model.predict(X)

    # Local accuracy: per-row SHAP contributions + base value == prediction.
    assert np.abs(shap_values.sum(1) + ex.expected_value - predicted).max() < 1e-6, \
        "SHAP values don't sum to model output!"
# NOTE(review): fragment — the opening of this fit_params dict and the
# enclosing test function's `def` are not visible in this chunk.
'-f': train_path,                          # training data file
'-t': test_path,                           # test data file
'--column-description': cd_path,           # column-description (cd) file
'-i': '10',                                # number of iterations
'-T': '4',                                 # number of threads
'-m': model_path,                          # output path for the trained model
'--use-best-model': 'false',
'--test-err-log': test_error_path          # per-iteration test metric log
}
fit_catboost_gpu(fit_params)
# Re-evaluate the metric from the saved model and check it matches the
# training-time test-error log within tolerance.
eval_metric(model_path, METRIC_CHECKING_MULTICLASS_NO_WEIGHTS, test_path, cd_path, eval_error_path)
compare_metrics_with_diff(METRIC_CHECKING_MULTICLASS_NO_WEIGHTS, test_error_path, eval_error_path)
# Reload the binary model through the Python API to inspect its metadata.
py_catboost = catboost.CatBoost()
py_catboost.load_model(model_path)
# Multiclass metadata: string class names 'a'..'d' were mapped to labels
# 0..3, and classes_count is 0 (meaning it was not set explicitly).
assert json.loads(py_catboost.get_metadata()['multiclass_params'])['class_to_label'] == [0, 1, 2, 3]
assert json.loads(py_catboost.get_metadata()['multiclass_params'])['class_names'] == ['a', 'b', 'c', 'd']
assert json.loads(py_catboost.get_metadata()['multiclass_params'])['classes_count'] == 0
assert json.loads(py_catboost.get_metadata()['params'])['data_processing_options']['class_names'] == ['a', 'b', 'c', 'd']
# Canonize the test-error log for regression comparison.
return [local_canonical_file(test_error_path)]
# NOTE(review): fragment — the enclosing test function's `def` (and the
# definitions of loss_function, train_path, etc.) are outside this chunk.
fit_params = {
'--loss-function': loss_function,
'--boosting-type': 'Plain',
'--classes-count': '4',                    # declare 4 classes explicitly
'-f': train_path,
'--column-description': cd_path,
'-i': '10',                                # number of iterations
'-T': '4',                                 # number of threads
'-m': model_path,
'--use-best-model': 'false'
}
fit_catboost_gpu(fit_params)
# Reload the binary model through the Python API to inspect its metadata.
py_catboost = catboost.CatBoost()
py_catboost.load_model(model_path)
# With an explicit classes-count of 4 but only labels 1 and 2 present in the
# data, the mapping contains just those labels and class_names stays empty.
assert json.loads(py_catboost.get_metadata()['multiclass_params'])['class_to_label'] == [1, 2]
assert json.loads(py_catboost.get_metadata()['multiclass_params'])['classes_count'] == 4
assert json.loads(py_catboost.get_metadata()['multiclass_params'])['class_names'] == []
# Command line for applying the saved model to the test set.
calc_cmd = (
CATBOOST_PATH,
'calc',
'--input-path', test_path,
'--column-description', cd_path,
'-m', model_path,
'--output-path', eval_path,
'--prediction-type', prediction_type
)
# NOTE(review): fragment — this chunk starts mid-call; the first
# read_train_documents_and_one_hot_targets(...) invocation opens above.
base_dir + 'train_data_catboost_format.tsv'
)
# Collapse one-hot targets to class indices.
train_targets = np.argmax(train_targets, axis=1)
# NOTE(review): the "test" documents are read from the *train* file here —
# looks like a copy-paste slip (compare the sibling block that reads
# 'test_data_catboost_format.tsv'); confirm upstream before relying on it.
test_documents, test_targets = read_train_documents_and_one_hot_targets(
base_dir + 'train_data_catboost_format.tsv'
)
train_dir = base_dir + 'ut_tmp/'
if not isdir(train_dir):
mkdir(train_dir)
# Train a CatBoost classifier with the requested leaf-estimation method.
cbc_params = read_json_params(base_dir + 'catboost_params.json')
cbc_params['leaf_estimation_method'] = method
cbc_params['random_seed'] = 10
cbc_params['train_dir'] = train_dir
cbc = CatBoostClassifier(**cbc_params)
cbc.fit(train_documents, train_targets)
# Export the trained model to JSON so the reference ensemble can parse it.
cbc.save_model(train_dir + 'model.bin', format='cbm')
export_catboost_to_json(train_dir + 'model.bin', train_dir + 'model.json')
full_model = CBOneStepLeafRefitEnsemble(train_dir + 'model.json', train_documents, train_targets,
                                        learning_rate=0.2, loss_function=BinaryCrossEntropyLoss(),
                                        leaf_method=method,
                                        update_set='AllPoints')
# The re-implemented ensemble must reproduce CatBoost's raw scores on both
# document sets to within 1e-5.
assert np.allclose(full_model(train_documents), cbc.predict(train_documents, prediction_type='RawFormulaVal'),
                   atol=1e-5),\
    (full_model(train_documents), cbc.predict(train_documents, prediction_type='RawFormulaVal'))
assert np.allclose(full_model(test_documents), cbc.predict(test_documents, prediction_type='RawFormulaVal'),
                   atol=1e-5)
# NOTE(review): fragment — the enclosing function's `def`, the loading of
# train_documents/train_targets, and the final comparison of pred_ours /
# pred_theirs / pred_cbc all lie outside this chunk.
# Collapse one-hot targets to class indices.
train_targets = np.argmax(train_targets, axis=1)
test_documents, test_targets = read_train_documents_and_one_hot_targets(
base_dir + 'test_data_catboost_format.tsv'
)
test_targets = np.argmax(test_targets, axis=1)
train_dir = base_dir + 'ut_tmp/'
if not isdir(train_dir):
mkdir(train_dir)
# Train a tiny (2-iteration) Plain-boosting classifier to keep the
# leaf-influence check cheap.
cbc_params = read_json_params(base_dir + 'catboost_params.json')
cbc_params['iterations'] = 2
cbc_params['leaf_estimation_method'] = leaf_method
cbc_params['random_seed'] = 10
cbc_params['train_dir'] = train_dir
cbc = CatBoostClassifier(**cbc_params)
cbc.set_params(boosting_type='Plain')
cbc.fit(train_documents, train_targets)
# Export to JSON so the reference ensemble implementation can parse it.
cbc.save_model(train_dir + 'model.bin', format='cbm')
export_catboost_to_json(train_dir + 'model.bin', train_dir + 'model.json')
full_model = CBLeafInfluenceEnsemble(train_dir + 'model.json', train_documents, train_targets,
                                     leaf_method=leaf_method,
                                     learning_rate=cbc_params['learning_rate'],
                                     loss_function=BinaryCrossEntropyLoss(),
                                     update_set='AllPoints')
retrained_model_our = deepcopy(full_model)
# TensorFlow-based reference applier used as an independent check.
tf_checker = TFGBApplier(full_model, train_documents, train_targets, leaf_method)
# Refit with 30 random points removed; predictions from our ensemble, the
# TF checker, and CatBoost itself are gathered for comparison below.
for remove_idx in np.random.randint(len(train_targets), size=30):
full_model.fit(remove_idx, retrained_model_our)
pred_ours = full_model(train_documents)
pred_theirs = tf_checker.get_predicts()
pred_cbc = cbc.predict(train_documents, prediction_type='RawFormulaVal')
def CatBoost_First(self, data, catsign, depth=8, iterations=80000):
    """Train a CatBoostRegressor on one fold and record its predictions.

    Parameters:
        data: dict with 'train', 'test' and 'predict' 2-D arrays; the last
              column of each is the target, the rest are features.
        catsign: indices of the categorical feature columns.
        depth: tree depth for CatBoost.
        iterations: boosting iteration count.
    """
model = cb.CatBoostRegressor(iterations=iterations, depth=depth, learning_rate=0.8, loss_function='RMSE')
model.fit(data['train'][:, :-1], data['train'][:, -1], cat_features=catsign)
# Note the distinction between storing validation-set results and
# prediction-set results.
# Predictions on the training set.
xul = model.predict(data['train'][:, :-1])
# Predictions on the validation set.
yanre = model.predict(data['test'][:, :-1])
# Predictions on the prediction set.
prer = model.predict(data['predict'][:, :-1])
# Store the validation and prediction results on the instance.
self.yanzhneg_pr.append(yanre)
self.predi.append(prer)
# Compute train / validation / prediction errors separately.
# After each fold, the errors on all three data splits are computed.
xx = self.RMSE(xul, data['train'][:, -1])
yy = self.RMSE(yanre, data['test'][:, -1])
pp = self.RMSE(prer, data['predict'][:, -1])
# NOTE(review): xx, yy, pp are computed but not used or returned within
# this visible span — the method may be truncated in this chunk; confirm
# against the original source.
def fit(self, X_train, y_train):
    """Pick an iteration count via CatBoost cross-validation, then refit.

    Runs cv() on the training data, scales the best round (by the mean test
    metric's argmax) up by 1.5x for the full-data refit, and stores the
    fitted CatBoostClassifier on self.model.
    """
    cv_results = cv(Pool(X_train, y_train), self.params)
    mean_metric_col = 'test-{}-mean'.format(self.metric)
    # 1.5x scale-up compensates for refitting on the full training set.
    best_rounds = int(cv_results[mean_metric_col].idxmax() * 1.5) + 1
    print('Best Iteration: {}'.format(best_rounds))
    self.params['iterations'] = best_rounds
    self.model = CatBoostClassifier(**self.params)
    self.model.fit(X_train, y_train)