How to use the imblearn.pipeline.make_pipeline function in imblearn

To help you get started, we’ve selected a few imblearn examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github scikit-learn-contrib / imbalanced-learn / examples / under-sampling / plot_comparison_under_sampling.py View on Github external
###############################################################################
# ``InstanceHardnessThreshold`` uses the predictions of a classifier to exclude
# samples. All samples which are classified with a low probability will be
# removed.

# One row of three panels; the titles set below label them as the baseline
# linear SVC, the decision function after resampling, and the resampling
# itself.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94),
                      class_sep=0.8)

# Baseline: a linear SVC fitted directly on (X, y). The title embeds
# Counter(y), i.e. the number of samples per class.
clf = LinearSVC().fit(X, y)
plot_decision_function(X, y, clf, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
# The sampler is given a logistic regression as its internal estimator.
sampler = InstanceHardnessThreshold(
    random_state=0, estimator=LogisticRegression(solver='lbfgs',
                                                 multi_class='auto'))
# Chain the sampler and a fresh linear SVC into one estimator and fit it on
# the same data as the baseline.
clf = make_pipeline(sampler, LinearSVC())
clf.fit(X, y)
plot_decision_function(X, y, clf, ax2)
ax2.set_title('Decision function for {}'.format(sampler.__class__.__name__))
# NOTE(review): plot_resampling is defined outside this excerpt; presumably it
# draws the data as resampled by `sampler` -- confirm.
plot_resampling(X, y, sampler, ax3)
ax3.set_title('Resampling using {}'.format(sampler.__class__.__name__))
fig.tight_layout()

plt.show()
github scikit-learn-contrib / imbalanced-learn / examples / over-sampling / plot_comparison_over_sampling.py View on Github external
# nearest-neighbors rule while regular SMOTE will not make any distinction.
# Therefore, the decision function differs depending on the algorithm.

# One row of three panels: baseline linear SVC, then one panel per sampler
# (SMOTE, ADASYN) showing the decision function of the resulting pipeline.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
X, y = create_dataset(n_samples=10000, weights=(0.01, 0.05, 0.94))

# Baseline: a linear SVC fitted directly on (X, y). The title embeds
# Counter(y), i.e. the number of samples per class.
clf = LinearSVC().fit(X, y)
plot_decision_function(X, y, clf, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
# Same classifier, but with SMOTE as the first step of the pipeline.
sampler = SMOTE()
clf = make_pipeline(sampler, LinearSVC())
clf.fit(X, y)
plot_decision_function(X, y, clf, ax2)
ax2.set_title('Decision function for {}'.format(sampler.__class__.__name__))
# Same again with ADASYN; `sampler` and `clf` are deliberately rebound so the
# title below picks up the new sampler's class name.
sampler = ADASYN()
clf = make_pipeline(sampler, LinearSVC())
clf.fit(X, y)
plot_decision_function(X, y, clf, ax3)
ax3.set_title('Decision function for {}'.format(sampler.__class__.__name__))
fig.tight_layout()

###############################################################################
# These sampling particularities can give rise to some specific issues, as
# illustrated below.

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 15))
X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94),
                      class_sep=0.8)

ax_arr = ((ax1, ax2), (ax3, ax4))
for ax, sampler in zip(ax_arr, (SMOTE(random_state=0),
                                ADASYN(random_state=0))):
github rikturr / aws-ml-experimenter / experiments / learning_curve.py View on Github external
def pseudo_label(pipeline, x_lab, y_lab, x_unlab, y_unlab, threshold=None):
    """Fit a pipeline on labeled data and pseudo-label the unlabeled data.

    The steps in ``pipeline`` are unpacked into ``make_pipeline``; the
    resulting model is fitted on ``(x_lab, y_lab)`` and column 1 of its
    ``predict_proba(x_unlab)`` output is used as the score of each unlabeled
    sample. ``threshold_metrics`` is then called on the actual labels and
    scores, and samples whose score is strictly greater than
    ``results['lab_threshold']`` are pseudo-labeled ``1`` (others ``0``).
    The counts of pseudo-labeled positives and negatives are stored in
    ``results`` under ``'lab_num_pos'`` and ``'lab_num_neg'``.

    Parameters
    ----------
    pipeline : iterable
        Steps passed positionally to ``make_pipeline``.
    x_lab, y_lab
        Labeled features and targets used to fit the model.
    x_unlab, y_unlab
        Unlabeled features and their actual targets (used for the metrics).
    threshold : optional
        Cut-off forwarded to ``threshold_metrics`` as ``threshold=``. When
        ``None`` (the default), ``threshold_metrics`` is instead called with
        ``rank_best='lab_gmean'``. Any other value, including ``0``, is
        forwarded as given.

    NOTE(review): the visible excerpt ends without a ``return`` statement;
    any remainder of the function lies outside this view.
    """
    model = make_pipeline(*pipeline)
    model.fit(x_lab, y_lab)

    pseudo_lab = pd.DataFrame({
        'actual': y_unlab,
        'predict_proba': model.predict_proba(x_unlab)[:, 1]
    })

    # Compare against None explicitly: a plain truthiness test would silently
    # discard an explicitly requested threshold of 0 / 0.0.
    if threshold is not None:
        selection = {'threshold': threshold}
    else:
        selection = {'rank_best': 'lab_gmean'}
    results = threshold_metrics(
        pseudo_lab['actual'], pseudo_lab['predict_proba'], **selection
    )
    pseudo_lab['predicted'] = (
        pseudo_lab['predict_proba'] > results['lab_threshold']
    ).astype(int)

    y_pseudo = pseudo_lab['predicted'].values
    results['lab_num_pos'] = np.sum(y_pseudo)
    results['lab_num_neg'] = y_pseudo.shape[0] - results['lab_num_pos']
github scikit-learn-contrib / imbalanced-learn / examples / combine / plot_comparison_combine.py View on Github external
# literature: (i) Tomek's link and (ii) edited nearest neighbours cleaning
# methods. Imbalanced-learn provides two ready-to-use samplers ``SMOTETomek``
# and ``SMOTEENN``. In general, ``SMOTEENN`` cleans more noisy data than
# ``SMOTETomek``.


# 3 x 2 grid: one row per sampler. Per the titles set in the loop, the left
# panel shows the decision function and the right panel the resampling.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2,
                                                         figsize=(15, 25))
X, y = create_dataset(n_samples=1000, weights=(0.1, 0.2, 0.7))

# Group the axes by row so that each sampler is paired with one (left, right)
# tuple in the zip below.
ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6))
for ax, sampler in zip(ax_arr, (
        SMOTE(random_state=0),
        SMOTEENN(random_state=0),
        SMOTETomek(random_state=0))):
    # Sampler followed by a linear SVC, fitted on the full dataset.
    clf = make_pipeline(sampler, LinearSVC())
    clf.fit(X, y)
    plot_decision_function(X, y, clf, ax[0])
    ax[0].set_title('Decision function for {}'.format(
        sampler.__class__.__name__))
    # NOTE(review): plot_resampling is defined outside this excerpt;
    # presumably it draws the data as resampled by `sampler` -- confirm.
    plot_resampling(X, y, sampler, ax[1])
    ax[1].set_title('Resampling using {}'.format(
        sampler.__class__.__name__))
fig.tight_layout()

plt.show()
github scikit-learn-contrib / imbalanced-learn / examples / under-sampling / plot_comparison_under_sampling.py View on Github external
# Prototype generation: under-sampling by generating new samples
###############################################################################

###############################################################################
# ``ClusterCentroids`` under-samples by replacing the original samples with the
# centroids of the clusters found.

# One row of three panels; the titles set below label them as the baseline
# linear SVC, the decision function after resampling, and the resampling
# itself.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94),
                      class_sep=0.8)

# Baseline: a linear SVC fitted directly on (X, y). The title embeds
# Counter(y), i.e. the number of samples per class.
clf = LinearSVC().fit(X, y)
plot_decision_function(X, y, clf, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
# Same classifier, but with ClusterCentroids as the first pipeline step.
sampler = ClusterCentroids(random_state=0)
clf = make_pipeline(sampler, LinearSVC())
clf.fit(X, y)
plot_decision_function(X, y, clf, ax2)
ax2.set_title('Decision function for {}'.format(sampler.__class__.__name__))
# NOTE(review): plot_resampling is defined outside this excerpt; presumably it
# draws the data as resampled by `sampler` -- confirm.
plot_resampling(X, y, sampler, ax3)
ax3.set_title('Resampling using {}'.format(sampler.__class__.__name__))
fig.tight_layout()

###############################################################################
# Prototype selection: under-sampling by selecting existing samples
###############################################################################

###############################################################################
# The algorithm performing prototype selection can be subdivided into two
# groups: (i) the controlled under-sampling methods and (ii) the cleaning
# under-sampling methods.
github edyoda / ai-project-fraud-detection / ml_model.py View on Github external
def create_pipelines(self):
    """Build one pipeline for every estimator/sampler/scaler combination.

    ``self.model_pipelines`` is reset to an empty list and then filled with
    ``make_pipeline(scaler, sampler, estimator)`` for each combination drawn
    from ``self.estimators``, ``self.samplers`` and ``self.scalers``.
    Estimators vary slowest and scalers fastest.
    """
    self.model_pipelines = []
    # extend() consumes the generator item by item, so the list is filled
    # incrementally in the same order as nested loops would fill it.
    self.model_pipelines.extend(
        make_pipeline(scaler, sampler, estimator)
        for estimator in self.estimators
        for sampler in self.samplers
        for scaler in self.scalers
    )
github AlgoWit / geometric-smote / examples / plot_validation_curves.py View on Github external
"geometricsmote__selection_strategy",
    SCORER,
)
plot_validation_curve(validation_curve_info, scoring_name, 'Selection Strategy')

###############################################################################
# High Imbalance Ratio or low Samples to Features Ratio
###############################################################################

###############################################################################
# When :math:`\text{IR}` is high or :math:`\text{SFR}` is low then the majority
# or combined selection strategies and lower absolute values of the truncation
# and deformation factors dominate as optimal hyperparameters.

X, y = generate_imbalanced_data([0.1, 0.9], 2000, 400, 200)
# GeometricSMOTE sampler followed by a linear SVC. ``max_iter`` is written as
# an integer literal: LinearSVC documents it as an int, and scikit-learn
# releases that validate parameter types reject the float ``1e5``.
gsmote_gbc = make_pipeline(
    GeometricSMOTE(random_state=RANDOM_STATE),
    LinearSVC(random_state=RANDOM_STATE, max_iter=100000),
)

# Validation curve over the sampler's ``k_neighbors`` parameter (addressed
# through the pipeline step name ``geometricsmote``) for the values 1..7.
scoring_name = 'Geometric Mean Score'
validation_curve_info = generate_validation_curve_info(
    gsmote_gbc, X, y, range(1, 8), "geometricsmote__k_neighbors", SCORER
)
plot_validation_curve(validation_curve_info, scoring_name, 'K Neighbors')

validation_curve_info = generate_validation_curve_info(
    gsmote_gbc,
    X,
    y,
    np.linspace(-1.0, 1.0, 9),
    "geometricsmote__truncation_factor",
github scikit-learn-contrib / imbalanced-learn / examples / applications / plot_impact_imbalanced_classes.py View on Github external
# Each evaluate_classifier call takes the current score table and returns the
# value that is rebound to df_scores; the string argument labels the entry.
# `lr_clf` and `rf_clf` are created before this excerpt.
df_scores = evaluate_classifier(
    lr_clf, df_scores, "LR with class weight"
)
# Switch the random-forest step of the existing pipeline to balanced class
# weights before scoring it.
rf_clf.set_params(randomforestclassifier__class_weight="balanced")
df_scores = evaluate_classifier(
    rf_clf, df_scores, "RF with class weight"
)
# Rebuild the logistic-regression pipeline with a random under-sampler placed
# between the preprocessor and the classifier.
lr_clf = make_pipeline_with_sampler(
    preprocessor_linear,
    RandomUnderSampler(random_state=42),
    LogisticRegression(max_iter=1000)
)
df_scores = evaluate_classifier(
    lr_clf, df_scores, "LR with under-sampling"
)
# Same construction for the random forest, using the tree preprocessor.
rf_clf = make_pipeline_with_sampler(
    preprocessor_tree,
    RandomUnderSampler(random_state=42),
    RandomForestClassifier(random_state=42, n_jobs=2)
)
df_scores = evaluate_classifier(
    rf_clf, df_scores, "RF with under-sampling"
)
# No separate sampler step here: BalancedRandomForestClassifier is the final
# estimator of the pipeline.
rf_clf = make_pipeline(
    preprocessor_tree,
    BalancedRandomForestClassifier(random_state=42, n_jobs=2)
)
# NOTE(review): unlike every other call in this excerpt, no label string is
# passed here -- confirm this is intended.
df_scores = evaluate_classifier(rf_clf, df_scores)
# NOTE(review): `bag_clf` is not defined anywhere in this excerpt; it must be
# created elsewhere for this call to run.
df_scores = evaluate_classifier(
    bag_clf, df_scores, "Balanced bagging"
)
# Bare expression; presumably there so the table is displayed when the example
# is run interactively -- confirm.
df_scores