How to use the imblearn.combine.SMOTETomek class in imblearn

To help you get started, we’ve selected a few imblearn examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github ZhiningLiu1998 / self-paced-ensemble / canonical_resampling.py View on Github external
elif by == 'OSS':
            sampler = OneSidedSelection(random_state=random_state)
        elif by == 'NM':
            sampler = NearMiss(random_state=random_state)
        elif by == 'CC':
            sampler = ClusterCentroids(random_state=random_state)
        elif by == 'SMOTE':
            sampler = SMOTE(random_state=random_state)
        elif by == 'ADASYN':
            sampler = ADASYN(random_state=random_state)
        elif by == 'BorderSMOTE':
            sampler = BorderlineSMOTE(random_state=random_state)
        elif by == 'SMOTEENN':
            sampler = SMOTEENN(random_state=random_state)
        elif by == 'SMOTETomek':
            sampler = SMOTETomek(random_state=random_state)
        elif by == 'ORG':
            sampler = None
        else:
            raise Error('Unexpected \'by\' type {}'.format(by))
        
        if by != 'ORG':
            X_train, y_train = sampler.fit_resample(X, y)
        else:
            X_train, y_train = X, y
        if visualize:
            df = pd.DataFrame(X_train)
            df['label'] = y_train
            df.plot.scatter(x=0, y=1, c='label', s=3, colormap='coolwarm', title='{} training set'.format(by))
        self.base_estimator.fit(X_train, y_train)
github scikit-learn-contrib / imbalanced-learn / examples / combine / plot_comparison_combine.py View on Github external
# algorithm to clean the noisy samples. Two methods are usually used in the
# literature: (i) Tomek's link and (ii) edited nearest neighbours cleaning
# methods. Imbalanced-learn provides two ready-to-use samplers ``SMOTETomek``
# and ``SMOTEENN``. In general, ``SMOTEENN`` cleans more noisy data than
# ``SMOTETomek``.


# One row of axes per sampler: the left axis shows the decision function,
# the right axis shows the resampled data.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(
    3, 2, figsize=(15, 25))
X, y = create_dataset(n_samples=1000, weights=(0.1, 0.2, 0.7))

ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6))
samplers = (
    SMOTE(random_state=0),
    SMOTEENN(random_state=0),
    SMOTETomek(random_state=0),
)
for (decision_ax, resampling_ax), sampler in zip(ax_arr, samplers):
    sampler_name = sampler.__class__.__name__

    # Fit a linear SVM on the resampled data and draw its decision function.
    clf = make_pipeline(sampler, LinearSVC())
    clf.fit(X, y)
    plot_decision_function(X, y, clf, decision_ax)
    decision_ax.set_title('Decision function for {}'.format(sampler_name))

    # Draw the output of the sampler itself next to it.
    plot_resampling(X, y, sampler, resampling_ax)
    resampling_ax.set_title('Resampling using {}'.format(sampler_name))
fig.tight_layout()

plt.show()
github MStarmans91 / WORC / WORC / classification / ObjectSampler.py View on Github external
def init_SMOTETomek(self, sampling_strategy, ratio, n_jobs):
        """Create a SMOTE Tomek sampler object and record its settings.

        A ``combine.SMOTETomek`` instance is built with ``self.random_state``
        and the given arguments and stored on ``self.object``; the three
        arguments are then kept as attributes of the same name.
        """
        sampler = combine.SMOTETomek(
            random_state=self.random_state,
            sampling_strategy=sampling_strategy,
            ratio=ratio,
            n_jobs=n_jobs,
        )
        self.object = sampler

        # Remember the configuration the sampler was created with.
        self.ratio, self.sampling_strategy, self.n_jobs = (
            ratio, sampling_strategy, n_jobs)
github HunterMcGushion / hyperparameter_hunter / examples / feature_engineering_examples / imblearn_resampling_example.py View on Github external
def resample_smote_tomek(train_inputs, train_targets):
    """Resample the training data with ``SMOTETomek`` (seed 32).

    The sampler and the data are handed to ``_sampler_helper``; its two
    results are returned as ``(train_inputs, train_targets)``.
    """
    resampled_inputs, resampled_targets = _sampler_helper(
        SMOTETomek(random_state=32), train_inputs, train_targets
    )
    return resampled_inputs, resampled_targets
github ScottfreeLLC / AlphaPy / alphapy / data.py View on Github external
elif sampling_method == SamplingMethod.under_cluster:
        sampler = ClusterCentroids()
    elif sampling_method == SamplingMethod.under_nearmiss:
        sampler = NearMiss(version=1)
    elif sampling_method == SamplingMethod.under_ncr:
        sampler = NeighbourhoodCleaningRule()
    elif sampling_method == SamplingMethod.over_random:
        sampler = RandomOverSampler(ratio=ratio)
    elif sampling_method == SamplingMethod.over_smote:
        sampler = SMOTE(ratio=ratio, kind='regular')
    elif sampling_method == SamplingMethod.over_smoteb:
        sampler = SMOTE(ratio=ratio, kind='borderline1')
    elif sampling_method == SamplingMethod.over_smotesv:
        sampler = SMOTE(ratio=ratio, kind='svm')
    elif sampling_method == SamplingMethod.overunder_smote_tomek:
        sampler = SMOTETomek(ratio=ratio)
    elif sampling_method == SamplingMethod.overunder_smote_enn:
        sampler = SMOTEENN(ratio=ratio)
    elif sampling_method == SamplingMethod.ensemble_easy:
        sampler = EasyEnsemble()
    elif sampling_method == SamplingMethod.ensemble_bc:
        sampler = BalanceCascade()
    else:
        raise ValueError("Unknown Sampling Method %s" % sampling_method)

    # Get the newly sampled features.

    X, y = sampler.fit_sample(X_train, y_train)

    logger.info("Original Samples : %d", X_train.shape[0])
    logger.info("New Samples      : %d", X.shape[0])
github scikit-learn-contrib / imbalanced-learn / examples / combine / plot_smote_tomek.py View on Github external
print(__doc__)

# Build an imbalanced (10% / 90%) two-class dataset
X, y = make_classification(n_samples=100, n_features=20, n_informative=3,
                           n_redundant=1, n_classes=2,
                           n_clusters_per_class=1, weights=[0.1, 0.9],
                           class_sep=2, flip_y=0, random_state=10)

# Project onto two principal components so the data can be drawn in 2D
pca = PCA(n_components=2)
X_vis = pca.fit_transform(X)

# Apply SMOTE + Tomek links, then project the result with the same PCA
sm = SMOTETomek()
X_resampled, y_resampled = sm.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)

# Left axis: original data; right axis: resampled data
f, (ax1, ax2) = plt.subplots(1, 2)

orig_class0, orig_class1 = y == 0, y == 1
c0 = ax1.scatter(X_vis[orig_class0, 0], X_vis[orig_class0, 1],
                 label="Class #0", alpha=0.5)
c1 = ax1.scatter(X_vis[orig_class1, 0], X_vis[orig_class1, 1],
                 label="Class #1", alpha=0.5)
ax1.set_title('Original set')

res_class0, res_class1 = y_resampled == 0, y_resampled == 1
ax2.scatter(X_res_vis[res_class0, 0], X_res_vis[res_class0, 1],
            label="Class #0", alpha=0.5)
ax2.scatter(X_res_vis[res_class1, 0], X_res_vis[res_class1, 1],
            label="Class #1", alpha=0.5)
github melqkiades / yelp / source / python / evaluation / classifier_evaluator.py View on Github external
from topicmodeling.context import review_metrics_extractor
from utils.constants import Constants


# Shared seed, passed as ``random_state`` to every resampler listed below.
RANDOM_STATE = 0
# Name of the scoring metric.
# NOTE(review): presumably consumed by model-selection code outside this
# excerpt -- confirm against the caller.
SCORE_METRIC = 'accuracy'
# SCORE_METRIC = 'roc_auc'
# Candidate values for the 'resampler' entry of each PARAM_GRID_MAP grid.
# The leading None presumably means "no resampling step" -- TODO confirm.
resamplers = [
    None,
    RandomUnderSampler(random_state=RANDOM_STATE),
    TomekLinks(random_state=RANDOM_STATE),
    EditedNearestNeighbours(random_state=RANDOM_STATE),
    NeighbourhoodCleaningRule(random_state=RANDOM_STATE),
    RandomOverSampler(random_state=RANDOM_STATE),
    SMOTE(random_state=RANDOM_STATE),
    SMOTETomek(random_state=RANDOM_STATE),
    SMOTEENN(random_state=RANDOM_STATE)
]


PARAM_GRID_MAP = {
    'DummyClassifier': {
        'resampler': resamplers,
        'classifier': [DummyClassifier(random_state=RANDOM_STATE)],
        'classifier__strategy': ['most_frequent', 'stratified', 'uniform']
    },
    'LogisticRegression': {
        'resampler': resamplers,
        'classifier': [LogisticRegression(random_state=RANDOM_STATE)],
        'classifier__C': [0.1, 1.0, 10, 100, 1000]
        # 'classifier__C': [0.1, 1.0, 10]
    },
github salan668 / FAE / FAE / FeatureAnalysis / DataBalance.py View on Github external
def __init__(self, **kwargs):
        """Initialise the wrapper around a seeded ``SMOTETomek`` sampler.

        ``kwargs`` are forwarded to ``SMOTETomek``; ``random_state`` is always
        taken from ``RANDOM_SEED[BALANCE_SMOTE_TOMEK]``. The sampler and the
        ``BALANCE_SMOTE_TOMEK`` constant are passed to the parent initialiser.
        """
        smote_tomek = SMOTETomek(
            **kwargs, random_state=RANDOM_SEED[BALANCE_SMOTE_TOMEK])
        super(SmoteTomekSampling, self).__init__(smote_tomek,
                                                 BALANCE_SMOTE_TOMEK)