Returns
-------
self : object
Fitted estimator.
"""
# --- fragment of a `fit`-style method (the enclosing `def` is outside this
# --- view): validate input, resolve the projection dimension, build
# --- random-projection flags, and schedule the parallel training ---

# Validate/convert the input and record its dimensions.
X = check_array(X)
n_samples, n_features = X.shape[0], X.shape[1]

# Resolve target_dim_frac: an integral value is used as an absolute
# dimension count; a float is treated as a fraction of n_features.
if isinstance(self.target_dim_frac, (numbers.Integral, np.integer)):
    self.target_dim_frac_ = self.target_dim_frac
else:  # float: scale by the number of features
    self.target_dim_frac_ = int(self.target_dim_frac * n_features)

# Build per-estimator flags marking which base estimators should use
# random projection (the second return value, estimator names, is unused
# here).
self.rp_flags_, _ = build_codes(self.base_estimators, self.rp_clf_list,
                                self.rp_ng_clf_list,
                                self.rp_flag_global)

# Decide whether balanced parallel scheduling (BPS) is needed;
# when bps_flag is False it is turned off.
if self.bps_flag:
    # Load the pre-trained cost predictor to forecast the train cost
    # of each base estimator on X.
    cost_predictor = joblib.load(self.cost_forecast_loc_fit_)

    time_cost_pred = cost_forecast_meta(cost_predictor, X,
                                        self.base_estimator_names)

    # Use BPS: partition the estimators so each worker receives a
    # roughly equal share of the predicted training cost.
    n_estimators_list, starts, n_jobs = balanced_scheduling(
        time_cost_pred, self.n_estimators, self.n_jobs)
else:  # NOTE(review): the non-BPS branch is truncated in this fragment
----------
X : numpy array of shape (n_samples, n_features)
The input samples. The same feature space of the unsupervised
outlier detector will be used.
Returns
-------
self : object
The estimator with approximation applied.
"""
# --- fragment of an `approximate`-style method (the enclosing `def` is
# --- outside this view): build approximation flags, partition the work,
# --- and fit the approximators in parallel ---

# todo: X may be optional
# todo: allow to use a list of scores for approximation, instead of
# todo: decision_scores

# Per-estimator flags marking which base estimators should be replaced by
# a supervised approximator (second return value, names, is unused here).
self.approx_flags, _ = build_codes(self.base_estimators,
                                   self.approx_clf_list,
                                   self.approx_ng_clf_list,
                                   self.approx_flag_global)

# Evenly partition the estimators over the available jobs.
n_estimators_list, starts, n_jobs = _partition_estimators(
    self.n_estimators, n_jobs=self.n_jobs)

# Fit one approximator per flagged estimator, one chunk per worker.
all_approx_results = Parallel(n_jobs=n_jobs, verbose=True)(
    delayed(_parallel_approx_estimators)(
        n_estimators_list[i],
        self.base_estimators[starts[i]:starts[i + 1]],
        X,  # if it is a PyOD model, we do not need this
        self.n_estimators,
        self.approx_flags[starts[i]:starts[i + 1]],
        self.approx_clf,
        self.jl_transformers_[starts[i]:starts[i + 1]],
        # NOTE(review): this call is truncated here — the remaining
        # arguments and closing parentheses lie outside this fragment.
# --- fragment of a demo script: configure random projection, forecast
# --- per-estimator training cost, and schedule the parallel training ---
# NOTE(review): `base_estimators` and `X` are defined outside this fragment.

# number of the parallel jobs
n_jobs = 6
n_estimators = len(base_estimators)

# the algorithms that should be using random projection
rp_clf_list = ['LOF', 'KNN', 'ABOD']
# the algorithms that should NOT use random projection
rp_ng_clf_list = ['IForest', 'PCA', 'HBOS']
# global flag for random projection
rp_flag_global = True
objective_dim = 6        # presumably the target projected dimension — TODO confirm
rp_method = 'discrete'   # projection variant; semantics defined by the library

# build flags for random projection (one code per base estimator)
rp_flags, base_estimator_names = build_codes(base_estimators, rp_clf_list,
                                             rp_ng_clf_list, rp_flag_global)

# load the pre-trained cost predictor to forecast the train cost
clf_train = joblib.load(
    os.path.join('../suod', 'models', 'saved_models', 'bps_train.joblib'))

time_cost_pred = cost_forecast_meta(clf_train, X, base_estimator_names)

# schedule the tasks: balance the predicted cost across n_jobs workers
n_estimators_list, starts, n_jobs = balanced_scheduling(time_cost_pred,
                                                        n_estimators, n_jobs)

print(starts)  # start indices of each worker's slice of estimators
start = time.time()
print('Parallel Training...')
# --- fragment of an `__init__` (the `def` header and earlier assignments
# --- are outside this view): store configuration, validate it, and
# --- precompute the random-projection flags ---

self.rp_flag_global = rp_flag_global          # master switch for random projection
self.target_dim_frac = target_dim_frac        # projected dim (int) or fraction of features (float)
self.jl_method = jl_method                    # projection method name — semantics defined by the library
self.bps_flag = bps_flag                      # enable balanced parallel scheduling
self.verbose = verbose
self.approx_flag_global = approx_flag_global  # master switch for model approximation
self.contamination = contamination            # presumably the expected outlier fraction — TODO confirm

# Fail fast on inconsistent configuration before any expensive work.
self._parameter_validation(contamination, n_jobs, rp_clf_list,
                           rp_ng_clf_list, approx_clf_list,
                           approx_ng_clf_list, approx_clf,
                           cost_forecast_loc_fit,
                           cost_forecast_loc_pred)

# build flags for random projection: one code per base estimator marking
# whether it should run on randomly projected data
self.rp_flags, self.base_estimator_names = build_codes(
    self.base_estimators, self.rp_clf_list, self.rp_ng_clf_list,
    self.rp_flag_global)
# --- fragment of a demo script: collect the parallel-training results and
# --- approximate the trained models with supervised regressors ---
# NOTE(review): `all_results`, `base_estimators`, and `X` come from code
# outside this fragment.

# Flatten the per-worker result lists back into flat lists.
trained_estimators = _unfold_parallel(all_results[0], n_jobs)
jl_transformers = _unfold_parallel(all_results[1], n_jobs)

###############################################################################
# %% Model Approximation

# the algorithms whose decision scores should be approximated
approx_clf_list = ['LOF', 'KNN']
# the algorithms that should NOT be approximated
approx_ng_clf_list = ['IForest', 'PCA', 'HBOS', 'ABOD']
# global flag for model approximation
approx_flag_global = True

# build approx code
# this can be a pre-defined list and directly supply to the system
approx_clf = RandomForestRegressor(n_estimators=100)

# one flag per base estimator (names returned alongside)
approx_flags, base_estimator_names = build_codes(base_estimators,
                                                 approx_clf_list,
                                                 approx_ng_clf_list,
                                                 approx_flag_global)

# Evenly partition the estimators over the available jobs.
n_estimators_list, starts, n_jobs = _partition_estimators(n_estimators,
                                                          n_jobs=n_jobs)

print(starts)  # start indices of each worker's slice of estimators
start = time.time()

# TODO: here has a bug. For some reason, approximators do not match approx_flags
all_approx_results = Parallel(n_jobs=n_jobs, max_nbytes=None, verbose=True)(
    delayed(_parallel_approx_estimators)(
        n_estimators_list[i],
        trained_estimators[starts[i]:starts[i + 1]],
        X,  # if it is a PyOD model, we do not need this
        n_estimators,
        # NOTE(review): this call (and its enclosing generator over `i`) is
        # truncated here — the rest lies outside this fragment.