def test_get_avg_metric_from_extractor(logs, base_extractor, metric_name):
    result = get_avg_metric_from_extractor(logs[0], base_extractor, metric_name)
    assert result == 0.8
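For context, get_avg_metric_from_extractor reduces a training log to a single number: as its name and call sites suggest, it pulls the per-fold values of one metric out of the log with the given extractor and averages them. The sketch below is a hypothetical reduction of that idea, not fklearn's implementation; the "validator_log" key, the toy log and the toy extractor are assumptions made only for illustration.

import pandas as pd

def avg_metric_from_log(log, extractor, metric_name):
    # Hypothetical sketch: extract per-fold metrics from the log, then average them.
    metric_folds = extractor(log["validator_log"])  # assumed log layout
    return metric_folds[metric_name].mean()

# Toy log with two validation folds and an extractor that just flattens them.
toy_log = {"validator_log": [{"auc": 0.78}, {"auc": 0.82}]}
toy_extractor = lambda folds: pd.DataFrame(folds)

print(avg_metric_from_log(toy_log, toy_extractor, "auc"))  # 0.8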
parallel: bool (default False)
Whether to evaluate the shuffled features in parallel (joblib threading backend)
nthread: int (default 1)
Number of threads to use when parallel is True
seed: int (default 7)
Random seed
Returns
----------
features: list of str
The remaining features after removing based on feature importance
"""
random.seed(seed)

# Baseline metric on the unshuffled evaluation data; deltas are measured against it.
curr_metric = get_avg_metric_from_extractor(log, extractor, metric_name)
eval_size = eval_data.shape[0]

# Optionally restrict the search to the least important features by average importance.
features_to_shuffle = order_feature_importance_avg_from_logs(log)[-max_removed_by_step:] \
    if speed_up_by_importance else get_used_features(log)

def shuffle(feature: str) -> pd.DataFrame:
    # Permute a single column while leaving every other feature untouched.
    return eval_data.assign(**{feature: eval_data[feature].sample(frac=1.0)})

# Right-to-left pipeline per feature: shuffle -> predict -> evaluate -> wrap as a
# validator log -> average the metric -> subtract from the baseline.
feature_to_delta_metric = compose(lambda m: curr_metric - m,
                                  get_avg_metric_from_extractor(extractor=extractor, metric_name=metric_name),
                                  gen_validator_log(fold_num=0, test_size=eval_size),
                                  eval_fn, predict_fn, shuffle)

if parallel:
    metrics = Parallel(n_jobs=nthread, backend="threading")(
        delayed(feature_to_delta_metric)(feature) for feature in features_to_shuffle)
    feature_to_delta_metric = dict(zip(features_to_shuffle, metrics))
    gc.collect()

else:
    feature_to_delta_metric = {feature: feature_to_delta_metric(feature) for feature in features_to_shuffle}

# Return the features whose delta metric falls below the threshold, ordered by how
# little shuffling them changed the metric, capped at max_removed_by_step.
return pipe(feature_to_delta_metric,
            valfilter(lambda delta_metric: delta_metric < threshold),
            sorted(key=lambda f: feature_to_delta_metric.get(f)),
            take(max_removed_by_step),
            list)
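The final toolz chain is easiest to follow on toy data. The sketch below uses made-up delta values (baseline metric minus metric after shuffling a feature) purely for illustration: features whose delta is below the threshold are kept, ordered by how little their shuffling moved the metric, and capped at max_removed_by_step.

from toolz.curried import pipe, sorted, take, valfilter

# Hypothetical deltas: small or negative values mean shuffling barely hurt the metric.
deltas = {"age": 0.050, "zip_code": 0.0002, "noise_col": -0.0010, "income": 0.020}
threshold, max_removed_by_step = 0.001, 5

selected = pipe(deltas,
                valfilter(lambda delta: delta < threshold),
                sorted(key=deltas.get),
                take(max_removed_by_step),
                list)
print(selected)  # ['noise_col', 'zip_code']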
threshold: float (default 0.001)
Threshold for model performance comparison
Returns
----------
stop: bool
A boolean indicating whether to stop the recursion or not
"""
if len(logs) < early_stop:
    return False

# Reduce each iteration's log list to its best-performing log before comparing.
log_list = [get_best_performing_log(log, extractor, metric_name) for log in logs]
limited_logs = list(take(early_stop, log_list))
curr_auc = get_avg_metric_from_extractor(limited_logs[-1], extractor, metric_name)

return all(
    [(curr_auc - get_avg_metric_from_extractor(log, extractor, metric_name)) <= threshold
     for log in limited_logs[:-1]])
Number of iterations without improvement before stopping
threshold: float (default 0.001)
Threshold for model performance comparison
Returns
----------
stop: bool
A boolean indicating whether to stop the recursion or not
"""
if len(logs) < early_stop:
    return False

limited_logs = list(take(early_stop, logs))
curr_auc = get_avg_metric_from_extractor(limited_logs[-1], extractor, metric_name)

return all(
    [(curr_auc - get_avg_metric_from_extractor(log, extractor, metric_name)) <= threshold
     for log in limited_logs[:-1]]
)
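To make the stopping rule concrete, here is a toy sketch of the same check, assuming each log has already been reduced to a single averaged metric value; the numbers are made up. The search stops only when the latest value improves on none of the earlier ones by more than threshold.

def stops(metrics, early_stop=3, threshold=0.001):
    # Toy stand-in for the check above: `metrics` plays the role of the per-log averages.
    if len(metrics) < early_stop:
        return False
    limited = metrics[:early_stop]
    curr = limited[-1]
    return all((curr - prev) <= threshold for prev in limited[:-1])

print(stops([0.800, 0.805, 0.806]))   # False: 0.806 beats 0.800 by more than 0.001
print(stops([0.805, 0.806, 0.8055]))  # True: no earlier value is beaten by more than 0.001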