def test_parallel_validator():
    result = parallel_validator(data, split_fn, train_fn, eval_fn, n_jobs=2)
    validator_log = result["validator_log"]

    assert len(validator_log) == 2
    assert validator_log[0]['fold_num'] == 0
    assert result['train_log'][0]['xgb_classification_learner']['features'] == ['f1']
    assert len(validator_log[0]['eval_results']) == 3
    assert validator_log[1]['fold_num'] == 1
    assert len(validator_log[1]['eval_results']) == 1
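The fixtures referenced above (data, split_fn, train_fn, eval_fn) are defined outside this snippet. A minimal sketch of what they could look like, assuming the usual fklearn contracts (split_fn returns fold indexes plus split logs, train_fn returns a predict function, the scored DataFrame and a train log, and eval_fn returns a log dict); all names and values below are hypothetical:

import pandas as pd

data = pd.DataFrame({"f1": [1.0, 2.0, 3.0, 4.0], "target": [0, 1, 0, 1]})

def split_fn(df):
    # Two folds; the first exposes three validation index sets, the second exposes one.
    folds = [(df.index[:2], [df.index[2:3], df.index[3:], df.index[2:]]),
             (df.index[:3], [df.index[3:]])]
    return folds, [{"fold_num": i} for i in range(len(folds))]

def train_fn(df):
    def predict_fn(new_df):
        return new_df.assign(prediction=0.5)  # dummy scorer
    log = {"xgb_classification_learner": {"features": ["f1"]}}
    return predict_fn, predict_fn(df), log

def eval_fn(df):
    return {"mean_target": float(df["target"].mean())}  # dummy metric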
random_seed : int
    Random seed

save_intermediary_fn : function(log) -> save to file
    Partially defined saver function that receives a log result from a
    tuning step and appends it into a file
    Example: save_intermediary_result(save_path='tuning.pkl')

n_jobs : int
    Number of parallel processes to spawn when evaluating a training function

Returns
----------
tuning_log : list of dict
    A list of tuning logs, each containing a training log and a validation log.
"""
validation_fn = partial(parallel_validator, n_jobs=n_jobs) if n_jobs > 1 else validator

def tune_iteration() -> ValidatorReturnType:
    # Draw one hyperparameter combination by calling each sampler in the space.
    iter_space = {k: space[k]() for k in space}
    train_fn = param_train_fn(iter_space)
    validator_log = validation_fn(train_data=train_set, split_fn=split_fn, train_fn=train_fn, eval_fn=eval_fn)
    if save_intermediary_fn is not None:
        save_intermediary_fn(validator_log)
    return validator_log

seed(random_seed)

return [tune_iteration() for _ in range(iterations)]
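To make the shape of space concrete, a small hedged example (the hyperparameter names are hypothetical): every value is a nullary sampler, so the dict comprehension above draws one fresh combination per iteration.

import numpy as np

space = {
    "learning_rate": lambda: float(np.random.choice([0.01, 0.1, 0.3])),
    "max_depth": lambda: int(np.random.randint(3, 10)),
}

iter_space = {k: space[k]() for k in space}
print(iter_space)  # e.g. {'learning_rate': 0.1, 'max_depth': 7}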
warm_start_file : str
    File containing intermediary results for grid search. If this file
    is present, we will perform grid search from the last combination of
    parameters.

n_jobs : int
    Number of parallel processes to spawn when evaluating a training function

Returns
----------
tuning_log : list of dict
    A list of tuning logs, each containing a training log and a validation log.
"""
validation_fn = partial(parallel_validator, n_jobs=n_jobs) if n_jobs > 1 else validator

def tune_iteration(iter_space: LogType) -> ValidatorReturnType:
    train_fn = param_train_fn(iter_space)
    validator_log = validation_fn(train_data=train_set, split_fn=split_fn, train_fn=train_fn, eval_fn=eval_fn)
    # Record which parameter combination produced this validation result.
    validator_log['iter_space'] = OrderedDict(sorted(iter_space.items()))
    if save_intermediary_fn is not None:
        save_intermediary_fn(validator_log)
    return validator_log

# Build the full Cartesian product of the parameter grid, keyed in sorted order.
sorted_space_keys = sorted(space.keys())
params = (space[k]() for k in sorted_space_keys)
combinations = set(product(*params))
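# For example (hypothetical grid): space = {"b": lambda: ["x"], "a": lambda: [1, 2]}
# gives sorted_space_keys == ["a", "b"] and combinations == {(1, "x"), (2, "x")},
# i.e. every parameter tuple is ordered by the sorted key names.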
if warm_start_file is not None and load_intermediary_fn is not None:
Logs : list of list of dict
    A list of log-like lists of evaluation dictionaries. Each element of the
    list is one validation step of the algorithm.
"""
selector_fn = remove_by_feature_importance(num_removed_by_step=num_removed_by_step)

# Stop when the metric no longer improves, the iteration limit is hit,
# or too few features remain.
stop_fn = aggregate_stop_funcs(
    stop_by_no_improvement(extractor=extractor, metric_name=metric_name, early_stop=early_stop,
                           threshold=threshold),
    stop_by_iter_num(iter_limit=iter_limit),
    stop_by_num_features(min_num_features=min_remaining_features))

train_fn = lambda df: param_train_fn(df, features)
first_logs = parallel_validator(train_data, split_fn, train_fn, eval_fn, n_jobs=n_jobs)

logs = [first_logs]
while not stop_fn(logs):
    curr_log = first(logs)
    new_features = selector_fn(curr_log)
    new_train_fn = lambda df: param_train_fn(df, new_features)
    next_log = parallel_validator(train_data, split_fn, new_train_fn, eval_fn, n_jobs=n_jobs)
    if save_intermediary_fn is not None:
        save_intermediary_fn(next_log)
    logs = [next_log] + logs  # newest validation result first

return logs
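The stop functions combined above all share the same contract: they receive the full history (newest log first, since each iteration prepends) and return True to halt. A minimal hedged sketch of a custom condition under that contract, with a hypothetical name:

def stop_after_n_rounds(n):
    # Returns a stop function compatible with aggregate_stop_funcs.
    def stop_fn(logs):
        return len(logs) >= n
    return stop_fn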
# Expand each subset of feature-group keys into concrete feature lists.
used_features = [list(mapcat(lambda key: features_sets[key], subset)) for subset in used_subsets]
# Bind feat as a default argument so each trainer keeps its own feature list.
trainers = [lambda df, feat=feat: param_train_fn(df, feat) for feat in used_features]

first_val_logs = [parallel_validator(train_data, split_fn, train_func, eval_fn, n_jobs) for train_func in trainers]
logs = [[dict(log, **{"used_subsets": list(subset)}) for log, subset in zip(first_val_logs, used_subsets)]]

while not stop_fn(logs):
    curr_log = first(logs)
    new_subsets = selector_fn(curr_log)
    new_features = [list(mapcat(lambda key: features_sets[key], subset)) for subset in new_subsets]
    trainers = [lambda df, feat=feat: param_train_fn(df, feat) for feat in new_features]
    val_logs = [parallel_validator(train_data, split_fn, train_func, eval_fn, n_jobs) for train_func in trainers]
    new_logs = [dict(log, **{"used_subsets": subset}) for log, subset in zip(val_logs, new_subsets)]
    if save_intermediary_fn is not None:
        save_intermediary_fn(new_logs)
    logs = [new_logs] + logs

return logs
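To show the data shapes involved, a small hedged example (group and feature names are hypothetical): subsets are chosen at the group level and mapcat flattens them into the concrete feature lists passed to each trainer.

from toolz import mapcat

features_sets = {"price": ["price_mean", "price_std"],
                 "user": ["user_age", "user_tenure"]}
used_subsets = [("price",), ("price", "user")]

used_features = [list(mapcat(lambda key: features_sets[key], subset)) for subset in used_subsets]
# -> [['price_mean', 'price_std'], ['price_mean', 'price_std', 'user_age', 'user_tenure']]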