Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
contamination,
verbose=True)
for i in range(n_jobs))
print('Orig Predict time:', time.time() - start)
print()
# unfold and generate the label matrix
predicted_labels_orig = np.zeros([X_test.shape[0], n_estimators])
for i in range(n_jobs):
predicted_labels_orig[:, starts[i]:starts[i + 1]] = np.asarray(
all_results_pred[i]).T
start = time.time()
n_estimators = len(base_estimators)
n_estimators_list, starts, n_jobs = _partition_estimators(n_estimators,
n_jobs)
# model prediction
all_results_scores = Parallel(n_jobs=n_jobs, max_nbytes=None,
verbose=True)(
delayed(_parallel_decision_function)(
n_estimators_list[i],
trained_estimators[starts[i]:starts[i + 1]],
None,
X_test,
n_estimators,
# rp_flags[starts[i]:starts[i + 1]],
jl_transformers,
approx_flags[starts[i]:starts[i + 1]],
verbose=True)
for i in range(n_jobs))
start = time.time()
predicted_labels = model.predict(X_test) # predict labels
print('Predict time:', time.time() - start)
print()
start = time.time()
predicted_scores = model.decision_function(X_test) # predict scores
print('Decision Function time:', time.time() - start)
print()
##########################################################################
# compare with no projection, no bps, and no approximation
print("******************************************************************")
start = time.time()
n_estimators = len(base_estimators)
n_estimators_list, starts, n_jobs = _partition_estimators(n_estimators,
n_jobs)
rp_flags = np.zeros([n_estimators, 1])
approx_flags = np.zeros([n_estimators, 1])
objective_dim = None
rp_method = None
all_results = Parallel(n_jobs=n_jobs, max_nbytes=None, verbose=True)(
delayed(_parallel_fit)(
n_estimators_list[i],
base_estimators[starts[i]:starts[i + 1]],
X_train,
n_estimators,
rp_flags[starts[i]:starts[i + 1]],
objective_dim,
rp_method=rp_method,
n_samples, n_features = X.shape[0], X.shape[1]
# decide whether bps is needed
# it is turned off
if self.bps_flag:
# load the pre-trained cost predictor to forecast the train cost
cost_predictor = joblib.load(self.cost_forecast_loc_pred_)
time_cost_pred = cost_forecast_meta(cost_predictor, X,
self.base_estimator_names)
n_estimators_list, starts, n_jobs = balanced_scheduling(
time_cost_pred, self.n_estimators, self.n_jobs)
else:
# use simple equal split by sklearn
n_estimators_list, starts, n_jobs = _partition_estimators(
self.n_estimators, self.n_jobs)
# predict labels with the base models in parallel
if self.verbose:
print('Parallel label prediction...')
start = time.time()
# TODO: code cleanup. There is an existing bug for joblib on Windows:
# https://github.com/joblib/joblib/issues/806
# max_nbytes can be dropped on other OS
all_results_pred = Parallel(n_jobs=n_jobs, max_nbytes=None,
verbose=True)(
delayed(_parallel_predict)(
n_estimators_list[i],
self.base_estimators[starts[i]:starts[i + 1]],
self.approximators[starts[i]:starts[i + 1]],
start = time.time()
predicted_labels = model.predict(X) # predict labels
print('Predict time:', time.time() - start)
print()
start = time.time()
predicted_scores = model.decision_function(X) # predict scores
print('Decision Function time:', time.time() - start)
print()
##########################################################################
# compare with no projection, no bps, and no approximation
print("******************************************************************")
n_estimators = len(base_estimators)
n_jobs = 6
n_estimators_list, starts, n_jobs = _partition_estimators(n_estimators,
n_jobs)
rp_flags = np.zeros([n_estimators, 1])
approx_flags = np.zeros([n_estimators, 1])
objective_dim = None
rp_method = None
start = time.time()
all_results = Parallel(n_jobs=n_jobs, max_nbytes=None, verbose=True)(
delayed(_parallel_fit)(
n_estimators_list[i],
base_estimators[starts[i]:starts[i + 1]],
X,
n_estimators,
rp_flags[starts[i]:starts[i + 1]],
objective_dim,
approx_clf_list = ['LOF', 'KNN']
approx_ng_clf_list = ['IForest', 'PCA', 'HBOS', 'ABOD']
approx_flag_global = True
# build approx code
# this can be a pre-defined list and directly supply to the system
approx_clf = RandomForestRegressor(n_estimators=100)
approx_flags, base_estimator_names = build_codes(base_estimators,
approx_clf_list,
approx_ng_clf_list,
approx_flag_global)
n_estimators_list, starts, n_jobs = _partition_estimators(n_estimators,
n_jobs=n_jobs)
print(starts) # this is the list of being split
start = time.time()
# TODO: there is a bug here. For some reason, the approximators do not match approx_flags
all_approx_results = Parallel(n_jobs=n_jobs, max_nbytes=None, verbose=True)(
delayed(_parallel_approx_estimators)(
n_estimators_list[i],
trained_estimators[starts[i]:starts[i + 1]],
X, # if it is a PyOD model, we do not need this
n_estimators,
approx_flags[starts[i]:starts[i + 1]],
approx_clf,
verbose=True)
for i in range(n_jobs))
n_samples, n_features = X.shape[0], X.shape[1]
# decide whether bps is needed
# it is turned off
if self.bps_flag:
# load the pre-trained cost predictor to forecast the train cost
cost_predictor = joblib.load(self.cost_forecast_loc_pred_)
time_cost_pred = cost_forecast_meta(cost_predictor, X,
self.base_estimator_names)
n_estimators_list, starts, n_jobs = balanced_scheduling(
time_cost_pred, self.n_estimators, self.n_jobs)
else:
# use simple equal split by sklearn
n_estimators_list, starts, n_jobs = _partition_estimators(
self.n_estimators, self.n_jobs)
# compute decision scores with the base models in parallel
if self.verbose:
print('Parallel score prediction...')
start = time.time()
# TODO: code cleanup. There is an existing bug for joblib on Windows:
# https://github.com/joblib/joblib/issues/806
# max_nbytes can be dropped on other OS
all_results_scores = Parallel(n_jobs=n_jobs, max_nbytes=None,
verbose=True)(
delayed(_parallel_decision_function)(
n_estimators_list[i],
self.base_estimators[starts[i]:starts[i + 1]],
self.approximators[starts[i]:starts[i + 1]],
contamination,
verbose=True)
for i in range(n_jobs))
print('Orig Predict time:', time.time() - start)
print()
# unfold and generate the label matrix
predicted_labels_orig = np.zeros([X_test.shape[0], n_estimators])
for i in range(n_jobs):
predicted_labels_orig[:, starts[i]:starts[i + 1]] = np.asarray(
all_results_pred[i]).T
start = time.time()
n_estimators = len(base_estimators)
n_estimators_list, starts, n_jobs = _partition_estimators(n_estimators,
n_jobs)
# model prediction
all_results_scores = Parallel(n_jobs=n_jobs, max_nbytes=None,
verbose=True)(
delayed(_parallel_decision_function)(
n_estimators_list[i],
trained_estimators[starts[i]:starts[i + 1]],
None,
X_test,
n_estimators,
jl_transformers,
approx_flags[starts[i]:starts[i + 1]],
verbose=True)
for i in range(n_jobs))
print('Orig decision_function time:', time.time() - start)
start = time.time()
predicted_labels = model.predict(X_test) # predict labels
print('Predict time:', time.time() - start)
print()
start = time.time()
predicted_scores = model.decision_function(X_test) # predict scores
print('Decision Function time:', time.time() - start)
print()
##########################################################################
# compare with no projection, no bps, and no approximation
print("******************************************************************")
start = time.time()
n_estimators = len(base_estimators)
n_estimators_list, starts, n_jobs = _partition_estimators(n_estimators,
n_jobs)
rp_flags = np.zeros([n_estimators, 1])
approx_flags = np.zeros([n_estimators, 1])
objective_dim = None
rp_method = None
all_results = Parallel(n_jobs=n_jobs, max_nbytes=None, verbose=True)(
delayed(_parallel_fit)(
n_estimators_list[i],
base_estimators[starts[i]:starts[i + 1]],
X_train,
n_estimators,
rp_flags[starts[i]:starts[i + 1]],
objective_dim,
rp_method=rp_method,
n_samples, n_features = X.shape[0], X.shape[1]
# decide whether bps is needed
# it is turned off
if self.bps_flag:
# load the pre-trained cost predictor to forecast the train cost
cost_predictor = joblib.load(self.cost_forecast_loc_pred_)
time_cost_pred = cost_forecast_meta(cost_predictor, X,
self.base_estimator_names)
n_estimators_list, starts, n_jobs = balanced_scheduling(
time_cost_pred, self.n_estimators, self.n_jobs)
else:
# use simple equal split by sklearn
n_estimators_list, starts, n_jobs = _partition_estimators(
self.n_estimators, self.n_jobs)
# run _parallel_predict_proba on the base models in parallel
if self.verbose:
print('Parallel score prediction...')
start = time.time()
# TODO: code cleanup. There is an existing bug for joblib on Windows:
# https://github.com/joblib/joblib/issues/806
# max_nbytes can be dropped on other OS
all_results_scores = Parallel(n_jobs=n_jobs, max_nbytes=None,
verbose=True)(
delayed(_parallel_predict_proba)(
n_estimators_list[i],
self.base_estimators[starts[i]:starts[i + 1]],
self.approximators[starts[i]:starts[i + 1]],
Returns
-------
self : object
The estimator with approximation applied.
"""
# todo: X may be optional
# todo: allow using a list of scores for approximation, instead of
#       decision_scores
self.approx_flags, _ = build_codes(self.base_estimators,
self.approx_clf_list,
self.approx_ng_clf_list,
self.approx_flag_global)
n_estimators_list, starts, n_jobs = _partition_estimators(
self.n_estimators, n_jobs=self.n_jobs)
all_approx_results = Parallel(n_jobs=n_jobs, verbose=True)(
delayed(_parallel_approx_estimators)(
n_estimators_list[i],
self.base_estimators[starts[i]:starts[i + 1]],
X, # if it is a PyOD model, we do not need this
self.n_estimators,
self.approx_flags[starts[i]:starts[i + 1]],
self.approx_clf,
self.jl_transformers_[starts[i]:starts[i + 1]],
verbose=True)
for i in range(n_jobs))
# print('Balanced Scheduling Total Test Time:', time.time() - start)