How to use the imblearn.under_sampling.NearMiss class in imblearn

To help you get started, we've selected a few imblearn examples based on popular ways it is used in public projects.

github scikit-learn-contrib/imbalanced-learn/examples/applications/plot_multi_class_under_sampling.py (view on GitHub)
from collections import Counter

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

from imblearn.datasets import make_imbalance
from imblearn.metrics import classification_report_imbalanced
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import NearMiss

RANDOM_STATE = 42

# Load the iris dataset and create an imbalanced class distribution
iris = load_iris()
X, y = make_imbalance(iris.data, iris.target,
                      sampling_strategy={0: 25, 1: 50, 2: 50},
                      random_state=RANDOM_STATE)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE)

print('Training target statistics: {}'.format(Counter(y_train)))
print('Testing target statistics: {}'.format(Counter(y_test)))

# Create a pipeline
pipeline = make_pipeline(NearMiss(version=2),
                         LinearSVC(random_state=RANDOM_STATE))
pipeline.fit(X_train, y_train)

# Classify and report the results
print(classification_report_imbalanced(y_test, pipeline.predict(X_test)))
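
NearMiss also accepts a sampling_strategy argument that controls how many samples each targeted class keeps after under-sampling. A minimal sketch building on the variables above (the target counts are illustrative and assume each targeted class still holds at least that many training samples):

# Keep at most 25 samples of each majority class; class 0 is left untouched
nm = NearMiss(version=2, sampling_strategy={1: 25, 2: 25})
X_res, y_res = nm.fit_resample(X_train, y_train)
print('Resampled target statistics: {}'.format(Counter(y_res)))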
github scikit-learn-contrib/imbalanced-learn/examples/under-sampling/plot_comparison_under_sampling.py (view on GitHub)
# ``NearMiss`` algorithms implement some heuristic rules in order to select
# samples. NearMiss-1 selects samples from the majority class for which the
# average distance to the :math:`k` nearest samples of the minority class is
# the smallest. NearMiss-2 selects the samples from the majority class for
# which the average distance to the farthest samples of the minority class is
# the smallest. NearMiss-3 is a 2-step algorithm: first, for each minority
# sample, its :math:`m` nearest neighbours are kept; then, the majority
# samples selected are the ones for which the average distance to the
# :math:`k` nearest neighbours is the largest.

# ``create_dataset``, ``plot_decision_function`` and ``plot_resampling`` are
# helper functions defined earlier in this example script, alongside the
# matplotlib, scikit-learn and imblearn imports.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2,
                                                         figsize=(15, 25))
X, y = create_dataset(n_samples=5000, weights=(0.1, 0.2, 0.7), class_sep=0.8)

ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6))
for ax, sampler in zip(ax_arr, (NearMiss(version=1),
                                NearMiss(version=2),
                                NearMiss(version=3))):
    clf = make_pipeline(sampler, LinearSVC())
    clf.fit(X, y)
    plot_decision_function(X, y, clf, ax[0])
    ax[0].set_title('Decision function for {}-{}'.format(
        sampler.__class__.__name__, sampler.version))
    plot_resampling(X, y, sampler, ax[1])
    ax[1].set_title('Resampling using {}-{}'.format(
        sampler.__class__.__name__, sampler.version))
fig.tight_layout()

###############################################################################
# ``EditedNearestNeighbours`` removes samples of the majority class whose
# class differs from that of their nearest neighbours. This sieve can be
# repeated, which is the principle of the
# ``RepeatedEditedNearestNeighbours`` class.
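
The plots above compare the heuristics visually; to see what each one does to the class balance in numbers, here is a minimal, self-contained sketch (assuming imbalanced-learn >= 0.6, where fit_resample is the resampling entry point):

from collections import Counter

from sklearn.datasets import make_classification

from imblearn.under_sampling import NearMiss, EditedNearestNeighbours

X, y = make_classification(n_classes=2, weights=[0.1, 0.9],
                           n_samples=1000, random_state=0)
print('Original:', Counter(y))
for version in (1, 2, 3):
    X_res, y_res = NearMiss(version=version).fit_resample(X, y)
    print('NearMiss-{}:'.format(version), Counter(y_res))
X_res, y_res = EditedNearestNeighbours().fit_resample(X, y)
print('EditedNearestNeighbours:', Counter(y_res))

NearMiss equalises the classes by construction, while EditedNearestNeighbours only removes ambiguous majority samples, so its output generally stays imbalanced.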
github HunterMcGushion/hyperparameter_hunter/examples/feature_engineering_examples/imblearn_resampling_example.py (view on GitHub)
def under_sample_NearMiss(train_inputs, train_targets):
    # NOTE: ``random_state`` was deprecated for NearMiss in imbalanced-learn 0.4
    # and removed in 0.6, since the algorithm is deterministic.
    sampler = NearMiss(random_state=32)
    train_inputs, train_targets = _sampler_helper(sampler, train_inputs, train_targets)
    return train_inputs, train_targets
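
On releases where that argument has been removed, the same helper reduces to the sketch below (a hypothetical rewrite; fit_resample is the public API, and the example's private _sampler_helper is skipped):

from imblearn.under_sampling import NearMiss

def under_sample_near_miss(train_inputs, train_targets):
    # NearMiss is deterministic, so no random_state is needed (or accepted)
    sampler = NearMiss()
    return sampler.fit_resample(train_inputs, train_targets)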
github ScottfreeLLC/AlphaPy/alphapy/data.py (view on GitHub)
        target_index = np.where(uv == target_value)[0][0]
        nontarget_index = np.where(uv != target_value)[0][0]
        ratio = (uc[nontarget_index] / uc[target_index]) - 1.0
    logger.info("Sampling Ratio for target %s [%r]: %f",
                target, target_value, ratio)

    # Choose the sampling method.

    if sampling_method == SamplingMethod.under_random:
        sampler = RandomUnderSampler()
    elif sampling_method == SamplingMethod.under_tomek:
        sampler = TomekLinks()
    elif sampling_method == SamplingMethod.under_cluster:
        sampler = ClusterCentroids()
    elif sampling_method == SamplingMethod.under_nearmiss:
        sampler = NearMiss(version=1)
    elif sampling_method == SamplingMethod.under_ncr:
        sampler = NeighbourhoodCleaningRule()
    elif sampling_method == SamplingMethod.over_random:
        sampler = RandomOverSampler(ratio=ratio)
    elif sampling_method == SamplingMethod.over_smote:
        sampler = SMOTE(ratio=ratio, kind='regular')
    elif sampling_method == SamplingMethod.over_smoteb:
        sampler = SMOTE(ratio=ratio, kind='borderline1')
    elif sampling_method == SamplingMethod.over_smotesv:
        sampler = SMOTE(ratio=ratio, kind='svm')
    elif sampling_method == SamplingMethod.overunder_smote_tomek:
        sampler = SMOTETomek(ratio=ratio)
    elif sampling_method == SamplingMethod.overunder_smote_enn:
        sampler = SMOTEENN(ratio=ratio)
    elif sampling_method == SamplingMethod.ensemble_easy:
        sampler = EasyEnsemble()
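
Note that ratio and kind above are older imbalanced-learn spellings: ratio was renamed sampling_strategy in 0.4, and SMOTE's kind variants became dedicated classes (with kind removed in 0.6). A minimal sketch of the modern equivalents (assuming a binary problem, where a float sampling_strategy is accepted):

from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE

sampler = SMOTE(sampling_strategy=ratio)            # was SMOTE(ratio=ratio, kind='regular')
sampler = BorderlineSMOTE(sampling_strategy=ratio)  # was kind='borderline1'
sampler = SVMSMOTE(sampling_strategy=ratio)         # was kind='svm'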
github scikit-learn-contrib/imbalanced-learn/examples/under-sampling/plot_nearmiss_3.py (view on GitHub)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import make_classification
from sklearn.decomposition import PCA

from imblearn.under_sampling import NearMiss

almost_black = '#262626'
palette = sns.color_palette()


# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply NearMiss-3
nm3 = NearMiss(version=3)
X_resampled, y_resampled = nm3.fit_resample(X, y)  # ``fit_sample`` in older releases
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
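
A quick numeric check of the resampling (a small addition; Counter comes from the standard library):

from collections import Counter

print('Before:', Counter(y))
print('After NearMiss-3:', Counter(y_resampled))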
github scikit-learn-contrib/imbalanced-learn/examples/under-sampling/plot_nearmiss_1.py (view on GitHub)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import make_classification
from sklearn.decomposition import PCA

from imblearn.under_sampling import NearMiss

almost_black = '#262626'
palette = sns.color_palette()


# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply NearMiss-1
nm1 = NearMiss(version=1)
X_resampled, y_resampled = nm1.fit_resample(X, y)  # ``fit_sample`` in older releases
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
github scikit-learn-contrib/imbalanced-learn/imblearn/utils/estimator_checks.py (view on GitHub)
import pytest
from numpy.testing import assert_allclose
from sklearn.datasets import make_classification
from sklearn.utils._testing import set_random_state  # sklearn.utils.testing on sklearn < 0.24

from imblearn.under_sampling import NearMiss


def check_samplers_pandas(name, Sampler):
    pd = pytest.importorskip("pandas")
    # Check that the samplers handle pandas dataframe and pandas series
    X, y = make_classification(
        n_samples=1000,
        n_classes=3,
        n_informative=4,
        weights=[0.2, 0.3, 0.5],
        random_state=0,
    )
    X_pd = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
    y_pd = pd.Series(y, name="class")
    if isinstance(Sampler(), NearMiss):
        samplers = [Sampler(version=version) for version in (1, 2, 3)]
    else:
        samplers = [Sampler()]

    for sampler in samplers:
        set_random_state(sampler)
        X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y_pd)
        X_res, y_res = sampler.fit_resample(X, y)

        # check that we return a pandas dataframe if a dataframe was given in
        assert isinstance(X_res_pd, pd.DataFrame)
        assert isinstance(y_res_pd, pd.Series)
        assert X_pd.columns.to_list() == X_res_pd.columns.to_list()
        assert y_pd.name == y_res_pd.name
        assert_allclose(X_res_pd.to_numpy(), X_res)
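
In practice, this check means NearMiss can be fed pandas containers directly and will hand them back (assuming imbalanced-learn >= 0.6); a minimal sketch:

import pandas as pd
from sklearn.datasets import make_classification
from imblearn.under_sampling import NearMiss

X, y = make_classification(n_samples=500, weights=[0.2, 0.8], random_state=0)
X_df = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
y_s = pd.Series(y, name='class')
X_res, y_res = NearMiss().fit_resample(X_df, y_s)
print(type(X_res).__name__, type(y_res).__name__)  # DataFrame Series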
github MStarmans91/WORC/WORC/classification/ObjectSampler.py (view on GitHub)
def init_NearMiss(self, sampling_strategy, n_jobs):
        """Create a NearMiss sampler object."""
        # NOTE: recent imbalanced-learn releases drop ``random_state``,
        # since NearMiss is deterministic.
        self.object = under_sampling.NearMiss(sampling_strategy=sampling_strategy,
                                              random_state=self.random_state,
                                              n_jobs=n_jobs)

        self.sampling_strategy = sampling_strategy
        self.n_jobs = n_jobs
github scikit-learn-contrib/imbalanced-learn/examples/under-sampling/plot_nearmiss.py (view on GitHub)
import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import make_classification
from sklearn.decomposition import PCA

from imblearn.under_sampling import NearMiss

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=200, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply NearMiss; ``return_indices`` was deprecated in imbalanced-learn 0.4
# and removed in 0.6 in favour of the ``sample_indices_`` attribute
versions = [1, 2, 3]
nm = [NearMiss(version=v, return_indices=True) for v in versions]

X_resampled = []
y_resampled = []
X_res_vis = []
idx_samples_removed = []
for method in nm:
    X_res, y_res, idx_res = method.fit_resample(X, y)
    X_resampled.append(X_res)
    y_resampled.append(y_res)
    X_res_vis.append(pca.transform(X_res))
    idx_samples_removed.append(np.setdiff1d(np.arange(X_vis.shape[0]),
                                            idx_res))

# Two subplots, unpack the axes array immediately
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
ax_res = [ax2, ax3, ax4]
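
On imbalanced-learn >= 0.6 the removed return_indices flag is replaced by the sample_indices_ attribute, set during resampling; a minimal sketch of the equivalent bookkeeping:

nm = NearMiss(version=1)
X_res, y_res = nm.fit_resample(X, y)
idx_removed = np.setdiff1d(np.arange(X.shape[0]), nm.sample_indices_)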