How to use the fklearn.tuning.utils.get_used_features function in fklearn

To help you get started, we’ve selected a few fklearn examples, based on popular ways it is used in public projects.
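
Before the full examples, here is a minimal sketch of the typical call pattern. It mirrors the feature_importance_backward_selection test shown further down; train_df, train_fn, split_fn, eval_fn, base_extractor and metric_name are assumed to be set up the same way as in fklearn's own test fixtures, so the snippet is not runnable on its own without that setup.

from toolz import first

from fklearn.tuning.selectors import feature_importance_backward_selection
from fklearn.tuning.utils import get_used_features

features = ["x1", "x2", "x3", "x4", "x5", "x6"]

# Run a backward feature selection; `logs` holds one entry per selection iteration.
logs = feature_importance_backward_selection(train_df, train_fn, features, split_fn, eval_fn,
                                             base_extractor, metric_name,
                                             num_removed_by_step=1, threshold=0,
                                             early_stop=10, iter_limit=50,
                                             min_remaining_features=5)

# get_used_features extracts, from a single log entry, the list of features
# the trained model actually used.
print(get_used_features(first(logs)))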

github nubank / fklearn / tests / tuning / test_selectors.py
from toolz import first

from fklearn.tuning.selectors import (backward_subset_feature_selection,
                                      feature_importance_backward_selection,
                                      poor_man_boruta_selection)
from fklearn.tuning.utils import get_used_features


def test_poor_man_boruta_selection(train_df, holdout_df, train_fn, eval_fn, base_extractor, metric_name):
    features = ["x1", "x2", "x3", "x4", "x5", "x6"]
    logs = poor_man_boruta_selection(train_df, holdout_df, train_fn,
                                     features,
                                     eval_fn, base_extractor, metric_name,
                                     max_removed_by_step=1, threshold=0,
                                     early_stop=10, iter_limit=50,
                                     min_remaining_features=5)

    assert len(get_used_features(first(logs))) <= 6  # Assert stop by remaining features

    logs = poor_man_boruta_selection(train_df, holdout_df,
                                     train_fn, features,
                                     eval_fn, base_extractor, metric_name,
                                     max_removed_by_step=1, threshold=0,
                                     early_stop=10, iter_limit=1,
                                     min_remaining_features=3)
    assert len(logs) == 1  # Assert stop by iter limit

    logs = poor_man_boruta_selection(train_df, holdout_df,
                                     train_fn, features,
                                     eval_fn, base_extractor, metric_name,
                                     max_removed_by_step=1, threshold=1,
                                     early_stop=2, iter_limit=50,
                                     min_remaining_features=1)
    assert len(logs) == 2  # Assert stop by early_stop

github nubank / fklearn / tests / tuning / test_utils.py
from fklearn.tuning.utils import get_used_features


def test_get_used_features(logs):
    result = get_used_features(logs[0])
    assert result == ['x1', 'x2', 'x4', 'x5', 'x3', 'x6']

github nubank / fklearn / tests / tuning / test_selectors.py
def test_backward_subset_feature_selection(train_df, train_fn, eval_fn, split_fn, base_extractor, metric_name):
    features_sets = {"first": ["x1", "x2"], "second": ["x4", "x5"], "third": ["x3", "x6"]}

    logs = backward_subset_feature_selection(train_df, train_fn, features_sets, split_fn, eval_fn, base_extractor,
                                             metric_name,
                                             num_removed_by_step=1, threshold=-1, early_stop=10, iter_limit=50,
                                             min_remaining_features=5)
    assert len(get_used_features(first(logs)[0])) <= 5  # Assert stop by remaining features

    logs = backward_subset_feature_selection(train_df, train_fn, features_sets, split_fn, eval_fn, base_extractor,
                                             metric_name,
                                             num_removed_by_step=1, threshold=0, early_stop=10, iter_limit=1,
                                             min_remaining_features=3)

    assert len(logs) == 1  # Assert stop by iter limit

    logs = backward_subset_feature_selection(train_df, train_fn, features_sets, split_fn, eval_fn, base_extractor,
                                             metric_name,
                                             num_removed_by_step=1, threshold=1, early_stop=2, iter_limit=50,
                                             min_remaining_features=1)

    assert len(logs) == 2  # Assert stop by early_stop

github nubank / fklearn / tests / tuning / test_selectors.py
def test_feature_importance_backward_selection(train_df, train_fn, eval_fn, split_fn, base_extractor, metric_name):
    features = ["x1", "x2", "x3", "x4", "x5", "x6"]
    logs = feature_importance_backward_selection(train_df, train_fn, features, split_fn, eval_fn,
                                                 base_extractor, metric_name,
                                                 num_removed_by_step=1, threshold=0,
                                                 early_stop=10, iter_limit=50, min_remaining_features=5)
    assert len(get_used_features(first(logs))) <= 5  # Assert stop by remaining features

    logs = feature_importance_backward_selection(train_df, train_fn, features, split_fn, eval_fn,
                                                 base_extractor, metric_name,
                                                 num_removed_by_step=1, threshold=0,
                                                 early_stop=10, iter_limit=1, min_remaining_features=3)
    assert len(logs) == 1  # Assert stop by iter limit

    logs = feature_importance_backward_selection(train_df, train_fn, features, split_fn, eval_fn, base_extractor,
                                                 metric_name,
                                                 num_removed_by_step=1, threshold=1, early_stop=2, iter_limit=50,
                                                 min_remaining_features=1)
    assert len(logs) == 2  # Assert stop by early_stop

github nubank / fklearn / src / fklearn / tuning / samplers.py
seed: int (default 7)
            Random seed

        Returns
        ----------
        features: list of str
            The remaining features after removal based on feature importance

    """
    random.seed(seed)

    curr_metric = get_avg_metric_from_extractor(log, extractor, metric_name)
    eval_size = eval_data.shape[0]

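    # When speed_up_by_importance is set, only the tail of the importance ranking
    # (the removal candidates) is shuffled; otherwise every feature returned by
    # get_used_features(log) is shuffled.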
    features_to_shuffle = order_feature_importance_avg_from_logs(log)[-max_removed_by_step:] \
        if speed_up_by_importance else get_used_features(log)

    def shuffle(feature: str) -> pd.DataFrame:
        return eval_data.assign(**{feature: eval_data[feature].sample(frac=1.0)})

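    # For each shuffled feature: re-predict, re-evaluate and compute the drop in
    # the average metric relative to the unshuffled baseline (curr_metric).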
    feature_to_delta_metric = compose(lambda m: curr_metric - m,
                                      get_avg_metric_from_extractor(extractor=extractor, metric_name=metric_name),
                                      gen_validator_log(fold_num=0, test_size=eval_size), eval_fn, predict_fn, shuffle)

    if parallel:
        metrics = Parallel(n_jobs=nthread, backend="threading")(
            delayed(feature_to_delta_metric)(feature) for feature in features_to_shuffle)
        feature_to_delta_metric = dict(zip(features_to_shuffle, metrics))
        gc.collect()

    else:
        feature_to_delta_metric = {feature: feature_to_delta_metric(feature) for feature in features_to_shuffle}

github nubank / fklearn / src / fklearn / tuning / stoppers.py
Parameters
    ----------
    logs : list of list of dict
        A list of log-like lists of evaluation dictionaries.

    min_num_features: int (default 50)
        The minimum number of features the model can have before stopping

    Returns
    -------
    stop: bool
        A boolean indicating whether to stop the recursion
    """

    return len(get_used_features(first(logs))) <= min_num_features