How to use the fklearn.common_docstrings.learner_pred_fn_docstring function in fklearn

To help you get started, we've selected a few fklearn examples based on popular ways learner_pred_fn_docstring is used in public projects.
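
Every excerpt below follows the same pattern: a learner fits some state on a training DataFrame, closes over it in a prediction function p, attaches a generated docstring to p via learner_pred_fn_docstring, and returns (p, p(df), log). A minimal sketch of just the docstring part (the learner name "my_learner" is ours, purely for illustration):

from fklearn.common_docstrings import learner_pred_fn_docstring

def p(new_df):
    # a no-op prediction function, just to show the pattern
    return new_df

p.__doc__ = learner_pred_fn_docstring("my_learner")
print(p.__doc__)  # the standard docstring fklearn attaches to a learner's prediction function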

From nubank/fklearn, src/fklearn/training/transformation.py (excerpt):
        It must contain all columns listed in `columns_to_scale`.

    columns_to_scale : list of str
        A list of names of the columns for standard scaling.
    """

    scaler = StandardScaler()

    scaler.fit(df[columns_to_scale].values)

    def p(new_data_set: pd.DataFrame) -> pd.DataFrame:
        new_data = scaler.transform(new_data_set[columns_to_scale].values)
        new_cols = pd.DataFrame(data=new_data, columns=columns_to_scale).to_dict('list')
        return new_data_set.assign(**new_cols)

    p.__doc__ = learner_pred_fn_docstring("standard_scaler")

    log = {'standard_scaler': {
        'standard_scaler': scaler.get_params(),
        'transformed_column': columns_to_scale}}

    return p, p(df), log
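
To run this learner end to end, something like the sketch below should work. The call signature standard_scaler(df, columns_to_scale) follows fklearn's docs (the excerpt's docstring confirms columns_to_scale); the toy data is ours:

import pandas as pd
from fklearn.training.transformation import standard_scaler

training_df = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [10.0, 20.0, 30.0]})

p, scaled_df, log = standard_scaler(training_df, columns_to_scale=["x", "y"])

print(p.__doc__)   # the docstring generated by learner_pred_fn_docstring
print(scaled_df)   # columns scaled to zero mean and unit variance
print(p(pd.DataFrame({"x": [4.0], "y": [40.0]})))  # reuses the fitted scaler on new data
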
From nubank/fklearn, src/fklearn/training/unsupervised.py (excerpt):
    default_params = {"n_jobs": -1, "random_state": 1729}
    params = default_params if not params else merge(default_params, params)

    features = features if not encode_extra_cols else expand_features_encoded(df, features)

    model = IsolationForest()
    model.set_params(**params)
    model.fit(df[features].values)

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        output_col = {prediction_column: model.decision_function(
            new_df[features])}

        return new_df.assign(**output_col)

    p.__doc__ = learner_pred_fn_docstring("isolation_forest_learner")

    log = {'isolation_forest_learner': {
        'features': features,
        'parameters': params,
        'prediction_column': prediction_column,
        'package': "sklearn",
        'package_version': sklearn.__version__,
        'training_samples': len(df)}}

    return p, p(df), log
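
A hedged usage sketch, assuming fklearn's documented signature isolation_forest_learner(df, features, params=None, prediction_column="prediction") and scikit-learn installed; the toy data is ours:

import pandas as pd
from fklearn.training.unsupervised import isolation_forest_learner

training_df = pd.DataFrame({"a": [1.0, 1.1, 0.9, 8.0],
                            "b": [2.0, 2.1, 1.9, -9.0]})

p, scored_df, log = isolation_forest_learner(training_df, features=["a", "b"])

print(scored_df["prediction"])  # decision_function scores; lower means more anomalous
print(log["isolation_forest_learner"]["parameters"])
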
From nubank/fklearn, src/fklearn/training/transformation.py (excerpt):
    def categ_target_dict(column: str) -> Dict:
        column_agg = df.groupby(column)[target_column].agg(['count', 'mean'])
        column_target_mean = column_agg['mean']
        column_target_count = column_agg['count']

        # `target_mean` is the global mean of `target_column`, computed just above this excerpt
        smoothed_target_mean = (column_target_count * column_target_mean + smoothing * target_mean) / \
                               (column_target_count + smoothing)

        return smoothed_target_mean.to_dict()

    vec = {column: categ_target_dict(column) for column in columns_to_categorize}

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        return apply_replacements(new_df, columns_to_categorize, vec, replace_unseen)

    p.__doc__ = learner_pred_fn_docstring("target_categorizer")

    log = {'target_categorizer': {
        'transformed_columns': columns_to_categorize,
        'target_column': target_column,
        'smoothing': smoothing,
        'ignore_unseen': ignore_unseen}
    }

    if store_mapping:
        log['target_categorizer']['mapping'] = vec

    return p, p(df), log
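
A usage sketch, assuming the documented signature target_categorizer(df, columns_to_categorize, target_column, smoothing=1.0, ...); the toy frame is ours:

import pandas as pd
from fklearn.training.transformation import target_categorizer

training_df = pd.DataFrame({"city": ["a", "a", "b", "b", "c"],
                            "target": [1.0, 0.0, 1.0, 1.0, 0.0]})

p, encoded_df, log = target_categorizer(training_df,
                                        columns_to_categorize=["city"],
                                        target_column="target")

print(encoded_df["city"])  # categories replaced by smoothed target means
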
From nubank/fklearn, src/fklearn/training/regression.py (excerpt):
        # `dtest` is an xgb.DMatrix built from new_df[features] just above this excerpt
        col_dict = {prediction_column: bst.predict(dtest)}

        if apply_shap:
            import shap
            explainer = shap.TreeExplainer(bst)
            shap_values = list(explainer.shap_values(new_df[features]))
            shap_expected_value = explainer.expected_value

            shap_output = {"shap_values": shap_values,
                           "shap_expected_value": np.repeat(shap_expected_value, len(shap_values))}

            col_dict = merge(col_dict, shap_output)

        return new_df.assign(**col_dict)

    p.__doc__ = learner_pred_fn_docstring("xgb_regression_learner", shap=True)

    log = {'xgb_regression_learner': {
        'features': features,
        'target': target,
        'prediction_column': prediction_column,
        'package': "xgboost",
        'package_version': xgb.__version__,
        'parameters': assoc(params, "num_estimators", num_estimators),
        'feature_importance': bst.get_score(),
        'training_samples': len(df)},
        'object': bst}

    return p, p(df), log
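
A usage sketch; num_estimators matches the parameter logged above, the rest of the signature is assumed from fklearn's docs, and xgboost must be installed:

import pandas as pd
from fklearn.training.regression import xgb_regression_learner

training_df = pd.DataFrame({"x1": [1.0, 2.0, 3.0, 4.0],
                            "x2": [0.5, 1.5, 2.5, 3.5],
                            "y": [1.0, 2.0, 3.0, 4.0]})

p, scored_df, log = xgb_regression_learner(training_df,
                                           features=["x1", "x2"],
                                           target="y",
                                           num_estimators=10)

print(scored_df["prediction"])
print(log["xgb_regression_learner"]["feature_importance"])
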
From nubank/fklearn, src/fklearn/training/transformation.py (excerpt):
    if ascending:
        base = 0
        sign = 1
    else:
        base = max_range
        sign = -1

    values = df[prediction_column]

    # `ed` is statsmodels' empirical_distribution module, imported at the top of this file
    ecdf = ed.ECDF(values)

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        return new_df.assign(**{ecdf_column: (base + sign * max_range * ecdf(new_df[prediction_column]))})

    p.__doc__ = learner_pred_fn_docstring("ecdfer")

    log = {'ecdfer': {
        'nobs': len(values),
        'prediction_column': prediction_column,
        'ascending': ascending,
        'transformed_column': [ecdf_column]}}

    return p, p(df), log
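
A usage sketch, assuming ascending, prediction_column, ecdf_column, and max_range (all free variables in the excerpt) are parameters of ecdfer, per fklearn's docs; statsmodels is required:

import pandas as pd
from fklearn.training.transformation import ecdfer

training_df = pd.DataFrame({"prediction": [0.1, 0.5, 0.5, 0.9]})

p, transformed_df, log = ecdfer(training_df,
                                ascending=True,
                                prediction_column="prediction",
                                ecdf_column="prediction_ecdf",
                                max_range=1000)

print(transformed_df["prediction_ecdf"])  # empirical CDF values mapped onto [0, 1000]
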
From nubank/fklearn, src/fklearn/training/transformation.py (excerpt):
        The value to impute unseen categories.

    store_mapping : bool (default: False)
        Whether to store the feature value -> integer dictionary in the log
    """

    def categ_dict(series: pd.Series) -> Dict:
        categs = series.dropna().unique()
        # enumerate yields (index, category) pairs; reversed flips them to (category, index)
        return dict(map(reversed, enumerate(categs)))  # type: ignore

    vec = {column: categ_dict(df[column]) for column in columns_to_categorize}

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        return apply_replacements(new_df, columns_to_categorize, vec, replace_unseen)

    p.__doc__ = learner_pred_fn_docstring("label_categorizer")

    log: LearnerLogType = {'label_categorizer': {
        'transformed_column': columns_to_categorize,
        'replace_unseen': replace_unseen}
    }

    if store_mapping:
        log['label_categorizer']['mapping'] = vec

    return p, p(df), log
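
A usage sketch, assuming the documented signature label_categorizer(df, columns_to_categorize, replace_unseen=np.nan, store_mapping=False):

import pandas as pd
from fklearn.training.transformation import label_categorizer

training_df = pd.DataFrame({"color": ["red", "blue", "red", "green"]})

p, encoded_df, log = label_categorizer(training_df, columns_to_categorize=["color"])

print(encoded_df["color"])                     # integer codes in place of the labels
print(p(pd.DataFrame({"color": ["yellow"]})))  # unseen category -> replace_unseen value
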
From nubank/fklearn, src/fklearn/training/transformation.py (excerpt):
    n_rows = df.shape[0]

    groups = [[f] for f in columns_to_inject] if columns_to_inject is not None else groups

    null_cols = {}  # type: ignore
    for seed_i, group in enumerate(groups):  # type: ignore
        np.random.seed(seed + seed_i)
        # each value survives with probability 1 - proportion; where() nulls out the rest
        replace_mask = np.random.binomial(1, 1 - proportion, n_rows).astype(bool)
        null_cols = merge(null_cols, {feature: df[feature].where(replace_mask) for feature in group})

    null_data = df.assign(**null_cols)

    def p(new_data_set: pd.DataFrame) -> pd.DataFrame:
        return new_data_set

    p.__doc__ = learner_pred_fn_docstring("null_injector")

    log = {'null_injector': {
        "columns_to_inject": columns_to_inject,
        "proportion": proportion,
        "groups": groups
    }}

    return p, null_data, log
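
A usage sketch, assuming the documented signature null_injector(df, proportion, columns_to_inject=None, groups=None, seed=1). Note that the injection happens at training time only; the returned p is the identity:

import pandas as pd
from fklearn.training.transformation import null_injector

training_df = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0],
                            "b": [5.0, 6.0, 7.0, 8.0]})

p, injected_df, log = null_injector(training_df, proportion=0.5,
                                    columns_to_inject=["a", "b"])

print(injected_df.isna().mean())           # about `proportion` of each column is now NaN
print(p(training_df).equals(training_df))  # True: p changes nothing at prediction time
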
From nubank/fklearn, src/fklearn/training/classification.py (excerpt):
                shap_expected_value_multiclass = {f"shap_expected_value_{class_index}":
                                                  np.repeat(expected_value, len(class_shap_values))
                                                  for (class_index, (expected_value, class_shap_values))
                                                  in enumerate(zip(shap_expected_value, shap_values))}
                shap_output = merge(shap_values_multiclass, shap_expected_value_multiclass)

            else:
                shap_values = list(shap_values)
                shap_output = {"shap_values": shap_values,
                               "shap_expected_value": np.repeat(shap_expected_value, len(shap_values))}

            col_dict = merge(col_dict, shap_output)

        return new_df.assign(**col_dict)

    p.__doc__ = learner_pred_fn_docstring("xgb_classification_learner", shap=True)

    log = {'xgb_classification_learner': {
        'features': features,
        'target': target,
        'prediction_column': prediction_column,
        'package': "xgboost",
        'package_version': xgb.__version__,
        'parameters': assoc(params, "num_estimators", num_estimators),
        'feature_importance': bst.get_score(),
        'training_samples': len(df)},
        'object': bst}

    return p, p(df), log
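
A usage sketch; the call shape mirrors the regression learner above, and p(new_df, apply_shap=True) is assumed from the apply_shap flag visible in the excerpt. xgboost is required, and the SHAP path additionally needs the shap package:

import pandas as pd
from fklearn.training.classification import xgb_classification_learner

training_df = pd.DataFrame({"x1": [1.0, 2.0, 3.0, 4.0],
                            "x2": [1.0, 0.0, 1.0, 0.0],
                            "y": [0, 1, 0, 1]})

p, scored_df, log = xgb_classification_learner(training_df,
                                               features=["x1", "x2"],
                                               target="y",
                                               num_estimators=10)

print(scored_df["prediction"])                  # predicted probabilities
print(p(training_df, apply_shap=True).columns)  # adds shap_values / shap_expected_value columns
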
From nubank/fklearn, src/fklearn/training/transformation.py (excerpt):
        A list of categorical column names.

    replace_unseen : int
        The value to impute unseen categories.

    store_mapping : bool (default: False)
        Whether to store the feature value -> integer dictionary in the log
    """

    def categ_getter(col: str) -> Dict:
        return df[col].value_counts().to_dict()

    vec = {column: categ_getter(column) for column in columns_to_categorize}

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        return apply_replacements(new_df, columns_to_categorize, vec, replace_unseen)

    p.__doc__ = learner_pred_fn_docstring("count_categorizer")

    log: LearnerLogType = {'count_categorizer': {
        'transformed_column': columns_to_categorize,
        'replace_unseen': replace_unseen}
    }

    if store_mapping:
        log['count_categorizer']['mapping'] = vec

    return p, p(df), log
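
A usage sketch; columns_to_categorize, replace_unseen, and store_mapping are all confirmed by the excerpt's docstring, and the call shape follows fklearn's docs:

import pandas as pd
from fklearn.training.transformation import count_categorizer

training_df = pd.DataFrame({"store": ["a", "a", "a", "b", "c"]})

p, encoded_df, log = count_categorizer(training_df, columns_to_categorize=["store"])

print(encoded_df["store"])  # each category replaced by its training count: 3, 3, 3, 1, 1
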
From nubank/fklearn, src/fklearn/training/transformation.py (excerpt):
    prediction_min : float
        The floor for the prediction.

    prediction_max : float
        The cap for the prediction.

    prediction_column : str
        The name of the column in `df` to cap and floor
    """

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        return new_df.assign(
            **{prediction_column: new_df[prediction_column].clip(lower=prediction_min, upper=prediction_max)}
        )

    p.__doc__ = learner_pred_fn_docstring("prediction_ranger")

    log = {'prediction_ranger': {
        'prediction_min': prediction_min,
        'prediction_max': prediction_max,
        'transformed_column': [prediction_column]}}

    return p, p(df), log
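
A usage sketch; all three parameters are documented in the excerpt itself, so only the toy data is ours:

import pandas as pd
from fklearn.training.transformation import prediction_ranger

scored_df = pd.DataFrame({"prediction": [-5.0, 0.3, 0.7, 42.0]})

p, capped_df, log = prediction_ranger(scored_df, prediction_min=0.0, prediction_max=1.0)

print(capped_df["prediction"])  # [0.0, 0.3, 0.7, 1.0]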