How to use the imblearn.under_sampling.TomekLinks function in imblearn

To help you get started, we’ve selected a few imblearn examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github scikit-learn-contrib / imbalanced-learn / examples / under-sampling / plot_tomek_links.py View on Github external
from imblearn.under_sampling import TomekLinks

print(__doc__)

# Build a synthetic, imbalanced two-class data set: ``n_samples_1`` points of
# class 0 spread around the origin and ``n_samples_2`` points of class 1
# concentrated around (2, 2).
rng = np.random.RandomState(0)
n_samples_1 = 500
n_samples_2 = 50
X_syn = np.r_[1.5 * rng.randn(n_samples_1, 2),
              0.5 * rng.randn(n_samples_2, 2) + [2, 2]]
y_syn = np.array([0] * (n_samples_1) + [1] * (n_samples_2))
X_syn, y_syn = shuffle(X_syn, y_syn)
X_syn_train, X_syn_test, y_syn_train, y_syn_test = train_test_split(X_syn,
                                                                    y_syn)

# remove Tomek links
# ``return_indices=True`` was deprecated in imbalanced-learn 0.4 and removed
# in 0.6; the indices of the samples that were kept are exposed through the
# fitted ``sample_indices_`` attribute instead.
tl = TomekLinks()
X_resampled, y_resampled = tl.fit_resample(X_syn, y_syn)
idx_resampled = tl.sample_indices_

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Every original index that is not among the kept ones was removed.
idx_samples_removed = np.setdiff1d(np.arange(X_syn.shape[0]),
                                   idx_resampled)
idx_class_0 = y_resampled == 0
plt.scatter(X_resampled[idx_class_0, 0], X_resampled[idx_class_0, 1],
            alpha=.8, label='Class #0')
plt.scatter(X_resampled[~idx_class_0, 0], X_resampled[~idx_class_0, 1],
            alpha=.8, label='Class #1')
plt.scatter(X_syn[idx_samples_removed, 0], X_syn[idx_samples_removed, 1],
            alpha=.8, label='Removed samples')

# make nice plotting
github melqkiades / yelp / source / python / etl / sampler_factory.py View on Github external
def create_sampler(sampler_name, random_state=None):
    """Build the resampler identified by ``sampler_name``.

    :param sampler_name: case-insensitive name of the sampler to create
        ('RandomUnderSampler', 'TomekLinks', 'ENN', 'NCL',
        'RandomOverSampler', 'SMOTE', 'SMOTETomek' or 'SMOTEENN'), or
        ``None`` / the literal string ``'None'`` for no resampling.
    :param random_state: seed forwarded to the samplers that take one. The
        cleaning samplers (Tomek links, ENN, NCL) are deterministic and are
        created without it.
    :return: a new sampler instance, or ``None`` when no resampling is
        requested.
    :raises ValueError: if ``sampler_name`` is not one of the supported names.
    """
    if sampler_name is None or sampler_name == 'None':
        return None

    # Normalise once instead of lower-casing on every comparison.
    key = sampler_name.lower()

    if key == 'randomundersampler':
        return RandomUnderSampler(random_state=random_state)
    # TomekLinks, EditedNearestNeighbours and NeighbourhoodCleaningRule no
    # longer accept ``random_state`` (deprecated in imbalanced-learn 0.4,
    # removed in 0.6); passing it raises a TypeError on current releases.
    if key == 'tomeklinks':
        return TomekLinks()
    if key == 'enn':
        return EditedNearestNeighbours()
    if key == 'ncl':
        return NeighbourhoodCleaningRule()
    if key == 'randomoversampler':
        return RandomOverSampler(random_state=random_state)
    if key == 'smote':
        return SMOTE(random_state=random_state)
    if key == 'smotetomek':
        return SMOTETomek(random_state=random_state)
    if key == 'smoteenn':
        return SMOTEENN(random_state=random_state)

    raise ValueError('Unsupported value \'%s\' for sampler' % sampler_name)
github scikit-learn-contrib / imbalanced-learn / examples / plot_sampling_strategy_usage.py View on Github external
# Resample with ``ros`` (defined earlier in the script, not shown in this
# excerpt) and report the class counts of the resampled target.
X_res, y_res = ros.fit_resample(X, y)
print('Information of the iris data set after making it '
      'balanced by over-sampling: \n sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# ``sampling_strategy`` as a ``list``
# ...................................
#
# When ``sampling_strategy`` is a ``list``, the list contains the targeted
# classes. It is used only for **cleaning methods** and raises an error
# otherwise.

# Target classes 0, 1 and 2 with the Tomek-links cleaning sampler.
sampling_strategy = [0, 1, 2]
tl = TomekLinks(sampling_strategy=sampling_strategy)
X_res, y_res = tl.fit_resample(X, y)
print('Information of the iris data set after making it '
      'balanced by cleaning sampling: \n sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# ``sampling_strategy`` as a callable
# ...................................
#
# When ``sampling_strategy`` is a callable, it must be a function taking ``y``
# and returning a ``dict``. The keys correspond to the targeted classes. The
# values correspond to the desired number of samples for each class.


def ratio_multiplier(y):
github UKPLab / coling2018_fake-news-challenge / fnc / pipeline.py View on Github external
#kind = ['regular', 'borderline1', 'borderline2', 'svm']
        #sm = SMOTE(kind='regular',)
        #X_res, y_res = sm.fit_sample(X_all, y_all)

        #ros = RandomOverSampler()
        #X_res, y_res = ros.fit_sample(X_all, y_all)

        #ada = ADASYN()
        #X_res, y_res = ada.fit_sample(X_all, y_all)

        ######################################################
        # Undersampling
        from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours, CondensedNearestNeighbour, \
            NeighbourhoodCleaningRule, InstanceHardnessThreshold
        # remove Tomek links
        # ``return_indices=True`` was deprecated in imbalanced-learn 0.4 and
        # removed in 0.6, and ``fit_sample`` has been replaced by
        # ``fit_resample``. The indices of the kept samples are read from the
        # fitted ``sample_indices_`` attribute instead.
        tl = TomekLinks()
        X_res, y_res = tl.fit_resample(X_all, y_all)
        idx_resampled = tl.sample_indices_

        #enn = EditedNearestNeighbours(random_state=0)
        #X_res, y_res = enn.fit_sample(X_all, y_all)

        #cnn = CondensedNearestNeighbour(random_state=0)
        #X_res, y_res = cnn.fit_sample(X_all, y_all)

        #ncr = NeighbourhoodCleaningRule(random_state=0)
        #X_res, y_res = ncr.fit_sample(X_all, y_all)

        #iht = InstanceHardnessThreshold(random_state=0, estimator=clf)
        #X_res, y_res = iht.fit_sample(X_all, y_all)


        ##################
github scikit-learn-contrib / imbalanced-learn / examples / plot_sampling_strategy_usage.py View on Github external
# Over-sample every class except the majority one.
sampling_strategy = 'not majority'

ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(X, y)
print('Information of the iris data set after making it '
      'balanced by over-sampling: \n sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# With **cleaning method**, the number of samples in each class will not be
# equalized even if targeted.

sampling_strategy = 'not minority'
# Pass the strategy by keyword: sampler constructor parameters are
# keyword-only in recent imbalanced-learn releases, so the positional form
# ``TomekLinks(sampling_strategy)`` raises a TypeError there.
tl = TomekLinks(sampling_strategy=sampling_strategy)
X_res, y_res = tl.fit_resample(X, y)
print('Information of the iris data set after making it '
      'balanced by cleaning sampling: \n sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# ``sampling_strategy`` as a ``dict``
# ...................................
#
# When ``sampling_strategy`` is a ``dict``, the keys correspond to the targeted
# classes. The values correspond to the desired number of samples for each
# targeted class. This works for both **under- and over-sampling**
# algorithms but not for the **cleaning algorithms**; use a ``list`` for those.
github melqkiades / yelp / source / python / evaluation / classifier_evaluator.py View on Github external
from sklearn.tree import tree

from etl import ETLUtils
from etl import sampler_factory
from nlp import nlp_utils
from topicmodeling.context import review_metrics_extractor
from utils.constants import Constants


# Seed shared by every randomised component so that runs are repeatable.
RANDOM_STATE = 0
# Name of the scoring metric ('roc_auc' is kept below as an alternative).
SCORE_METRIC = 'accuracy'
# SCORE_METRIC = 'roc_auc'

# Candidate resampling steps; ``None`` stands for "no resampling".
# TomekLinks, EditedNearestNeighbours and NeighbourhoodCleaningRule are
# deterministic cleaning methods: their ``random_state`` parameter was
# deprecated in imbalanced-learn 0.4 and removed in 0.6, so they are created
# without it.
resamplers = [
    None,
    RandomUnderSampler(random_state=RANDOM_STATE),
    TomekLinks(),
    EditedNearestNeighbours(),
    NeighbourhoodCleaningRule(),
    RandomOverSampler(random_state=RANDOM_STATE),
    SMOTE(random_state=RANDOM_STATE),
    SMOTETomek(random_state=RANDOM_STATE),
    SMOTEENN(random_state=RANDOM_STATE),
]


PARAM_GRID_MAP = {
    'DummyClassifier': {
        'resampler': resamplers,
        'classifier': [DummyClassifier(random_state=RANDOM_STATE)],
        'classifier__strategy': ['most_frequent', 'stratified', 'uniform']
    },
    'LogisticRegression': {
github scikit-learn-contrib / imbalanced-learn / examples / under-sampling / plot_illustration_tomek_links.py View on Github external
# highlight the samples of interest
ax.scatter([X_minority[-1, 0], X_majority[1, 0]],
           [X_minority[-1, 1], X_majority[1, 1]],
           label='Tomek link', s=200, alpha=0.3)
ax.set_title('Illustration of a Tomek link')
make_plot_despine(ax)
fig.tight_layout()

###############################################################################
# We can run the ``TomekLinks`` sampling to remove the corresponding
# samples. If ``sampling_strategy='auto'`` only the sample from the majority
# class will be removed. If ``sampling_strategy='all'`` both samples will be
# removed.

# No stand-alone sampler is created here: the loop that follows builds one
# ``TomekLinks`` instance per strategy and binds it to ``sampler`` itself.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

ax_arr = (ax1, ax2)
title_arr = ('Removing only majority samples',
             'Removing all samples')
             'Removing all samples')
for ax, title, sampler in zip(ax_arr,
                              title_arr,
                              [TomekLinks(sampling_strategy='auto'),
                               TomekLinks(sampling_strategy='all')]):
    X_res, y_res = sampler.fit_resample(np.vstack((X_minority, X_majority)),
                                        np.array([0] * X_minority.shape[0] +
                                                 [1] * X_majority.shape[0]))
    ax.scatter(X_res[y_res == 0][:, 0], X_res[y_res == 0][:, 1],
               label='Minority class', s=200, marker='_')
    ax.scatter(X_res[y_res == 1][:, 0], X_res[y_res == 1][:, 1],
github scikit-learn-contrib / imbalanced-learn / examples / under-sampling / plot_illustration_tomek_links.py View on Github external
# We can run the ``TomekLinks`` sampling to remove the corresponding
# samples. If ``sampling_strategy='auto'`` only the sample from the majority
# class will be removed. If ``sampling_strategy='all'`` both samples will be
# removed.

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

# Both samplers are fitted on the same data, so build it once: minority
# samples are labelled 0 and majority samples 1.
X_stacked = np.vstack((X_minority, X_majority))
y_stacked = np.array([0] * X_minority.shape[0] +
                     [1] * X_majority.shape[0])

ax_arr = (ax1, ax2)
title_arr = ('Removing only majority samples',
             'Removing all samples')
# One panel per strategy; ``sampler`` is bound by the loop, so no sampler
# needs to be created beforehand.
for ax, title, sampler in zip(ax_arr,
                              title_arr,
                              [TomekLinks(sampling_strategy='auto'),
                               TomekLinks(sampling_strategy='all')]):
    X_res, y_res = sampler.fit_resample(X_stacked, y_stacked)
    ax.scatter(X_res[y_res == 0, 0], X_res[y_res == 0, 1],
               label='Minority class', s=200, marker='_')
    ax.scatter(X_res[y_res == 1, 0], X_res[y_res == 1, 1],
               label='Majority class', s=200, marker='+')

    # highlight the samples of interest
    ax.scatter([X_minority[-1, 0], X_majority[1, 0]],
               [X_minority[-1, 1], X_majority[1, 1]],
               label='Tomek link', s=200, alpha=0.3)

    ax.set_title(title)
    make_plot_despine(ax)
fig.tight_layout()