How to use the imblearn.over_sampling.SMOTE class in imblearn

To help you get started, we've selected a few imblearn examples based on popular ways the library is used in public projects.
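Before diving into the project snippets below, here is a minimal, self-contained sketch of the typical SMOTE workflow. The dataset shape and the 90/10 class split are illustrative assumptions, not taken from any of the projects:

from collections import Counter

from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

# Build an illustrative imbalanced two-class dataset.
X, y = make_classification(n_samples=1000, n_features=20,
                           weights=[0.9, 0.1], random_state=42)
print(Counter(y))  # roughly 900 majority vs. 100 minority samples

# Oversample the minority class so both classes end up balanced.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)
print(Counter(y_res))  # both classes now have the same count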


From scikit-learn-contrib/imbalanced-learn: examples/over-sampling/plot_comparison_over_sampling.py (view on GitHub)
from collections import Counter

import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
from imblearn.base import BaseSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.pipeline import make_pipeline

# `create_dataset` and `plot_resampling` are helper functions defined
# earlier in the example script.


class FakeSampler(BaseSampler):
    """A pass-through sampler used to plot the original, unresampled data."""

    _sampling_type = 'bypass'

    def _fit_resample(self, X, y):
        return X, y


fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 15))
X, y = create_dataset(n_samples=10000, weights=(0.01, 0.05, 0.94))
sampler = FakeSampler()
clf = make_pipeline(sampler, LinearSVC())
plot_resampling(X, y, sampler, ax1)
ax1.set_title('Original data - y={}'.format(Counter(y)))

ax_arr = (ax2, ax3, ax4)
for ax, sampler in zip(ax_arr, (RandomOverSampler(random_state=0),
                                SMOTE(random_state=0),
                                ADASYN(random_state=0))):
    clf = make_pipeline(sampler, LinearSVC())
    clf.fit(X, y)
    plot_resampling(X, y, sampler, ax)
    ax.set_title('Resampling using {}'.format(sampler.__class__.__name__))
fig.tight_layout()

###############################################################################
# The following plot illustrates the difference between ADASYN and SMOTE.
# ADASYN focuses on the samples which are difficult to classify with a
# nearest-neighbours rule, while regular SMOTE makes no such distinction.
# Therefore, the decision function will differ depending on the algorithm.

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
X, y = create_dataset(n_samples=10000, weights=(0.01, 0.05, 0.94))
From rikturr/aws-ml-experimenter: experiments/helpers.py (view on GitHub)
from imblearn.over_sampling import RandomOverSampler, SMOTE


class ModifiedRandomOverSampler(RandomOverSampler):
    def __init__(self, pos_samples, random_state=0):
        self.pos_samples = pos_samples
        self.ratio_sampler = None
        super(ModifiedRandomOverSampler, self).__init__(random_state=random_state)

    def fit(self, X, y):
        pos = self.pos_samples
        neg = len(y[y == 0])
        self.ratio_sampler = RandomOverSampler(random_state=self.random_state, ratio={0: neg, 1: pos})
        self.ratio_sampler.fit(X, y)
        return self

    def sample(self, X, y):
        return self.ratio_sampler.sample(X, y)


class ModifiedSMOTE(SMOTE):
    def __init__(self, pos_samples, random_state=0):
        self.pos_samples = pos_samples
        self.ratio_sampler = None
        super(ModifiedSMOTE, self).__init__(random_state=random_state)

    def fit(self, X, y):
        pos = self.pos_samples
        neg = len(y[y == 0])
        self.ratio_sampler = SMOTE(random_state=self.random_state, ratio={0: neg, 1: pos})
        self.ratio_sampler.fit(X, y)
        return self

    def sample(self, X, y):
        return self.ratio_sampler.sample(X, y)
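These helpers target an older imblearn API: the `ratio` argument was renamed `sampling_strategy` in imblearn 0.4 and removed in 0.6, and the separate `fit`/`sample` calls were merged into `fit_resample`. A minimal sketch of the same fixed-count oversampling on a current release, assuming `X` and `y` are the imbalanced dataset and the target count of 500 is illustrative:

import numpy as np
from imblearn.over_sampling import SMOTE

# Oversample class 1 up to a fixed number of samples while leaving
# class 0 untouched; `sampling_strategy` replaces the old `ratio`.
pos_samples = 500
neg = int(np.sum(y == 0))
sampler = SMOTE(sampling_strategy={0: neg, 1: pos_samples}, random_state=0)
X_res, y_res = sampler.fit_resample(X, y)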
From ZhiningLiu1998/self-paced-ensemble: canonical_ensemble.py (view on GitHub)
import pandas as pd
from sklearn.tree import DecisionTreeClassifier as DT
from imblearn.over_sampling import SMOTE as SMOTE_IMB


# `fit` method of a SMOTE-bagging style ensemble class: each round
# bootstraps a growing fraction of both classes, rebalances the draw with
# SMOTE, and trains a decision tree on the result. `SMOTE_IMB` and `DT`
# are assumed aliases for imblearn's SMOTE and scikit-learn's
# DecisionTreeClassifier.
def fit(self, X, y):
    self.model_list = []
    df = pd.DataFrame(X)
    df['label'] = y
    df_maj = df[df['label'] == 0]
    df_min = df[df['label'] == 1]
    cols = df.columns.tolist()
    cols.remove('label')
    for ibagging in range(self.n_estimators):
        # The sampling fraction cycles through 0.1, 0.2, ..., 1.0.
        b = min(0.1 * ((ibagging % 10) + 1), 1)
        train_maj = df_maj.sample(frac=b, replace=True)
        train_min = df_min.sample(frac=b, replace=True)
        # pd.concat replaces the deprecated DataFrame.append.
        df_k = pd.concat([train_maj, train_min])
        # Cap k_neighbors so SMOTE never asks for more neighbours than
        # the bootstrap's minority sample can provide.
        X_train, y_train = SMOTE_IMB(
            k_neighbors=min(5, len(train_min) - 1)
        ).fit_resample(df_k[cols], df_k['label'])
        model = DT().fit(X_train, y_train)
        self.model_list.append(model)
    return self
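One detail worth copying from this snippet is the `k_neighbors` guard: SMOTE needs at least `k_neighbors + 1` minority samples to interpolate between, and raises an error otherwise. A minimal standalone illustration, where the tiny 4-sample minority class is an assumed toy setup:

import numpy as np
from imblearn.over_sampling import SMOTE

# Toy data: 20 majority samples, 4 minority samples.
rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, size=(20, 2)),
               rng.normal(3, 1, size=(4, 2))])
y = np.array([0] * 20 + [1] * 4)

# SMOTE's default k_neighbors=5 would fail here (only 4 minority
# samples), so cap it at n_minority - 1.
smote = SMOTE(k_neighbors=min(5, (y == 1).sum() - 1), random_state=0)
X_res, y_res = smote.fit_resample(X, y)
print(len(y_res))  # 40: the minority class is oversampled to 20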
From scikit-learn-contrib/imbalanced-learn: examples/over-sampling/plot_smote_bordeline_1.py (view on GitHub)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE

# Colour settings from the top of the example script.
almost_black = '#262626'
palette = sns.color_palette()


# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Borderline SMOTE 1 (`kind` and `fit_sample` belong to the older
# imblearn API this example was written against)
sm = SMOTE(kind='borderline1')
X_resampled, y_resampled = sm.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
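On current imblearn releases the borderline variants moved to a dedicated `BorderlineSMOTE` class, and `fit_sample` became `fit_resample`. A minimal sketch of the equivalent call, reusing `X` and `y` from above:

from imblearn.over_sampling import BorderlineSMOTE

# Borderline-1 SMOTE on imblearn >= 0.4; pass kind='borderline-2'
# for the second variant.
sm = BorderlineSMOTE(kind='borderline-1', random_state=10)
X_resampled, y_resampled = sm.fit_resample(X, y)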
From salan668/FAE: FAE/FeatureAnalysis/DataBalance.py (view on GitHub)
# From the `SmoteSampling` class; `RANDOM_SEED` and `BALANCE_SMOTE` are
# constants defined elsewhere in the FAE project.
def __init__(self, **kwargs):
    super(SmoteSampling, self).__init__(
        SMOTE(**kwargs, random_state=RANDOM_SEED[BALANCE_SMOTE]),
        BALANCE_SMOTE)
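The FAE wrapper pins SMOTE's random seed so that balancing is reproducible across runs while still forwarding user options. A minimal standalone sketch of the same pattern; the `RANDOM_SEED` value and the `make_smote` helper are illustrative assumptions, not FAE's API:

from imblearn.over_sampling import SMOTE

RANDOM_SEED = 42  # assumed constant; FAE keeps per-balancer seeds in a dict

def make_smote(**kwargs):
    # Forward user kwargs (e.g. k_neighbors) while pinning the seed.
    return SMOTE(**kwargs, random_state=RANDOM_SEED)

sampler = make_smote(k_neighbors=5)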
From felix-last/evaluate-kmeans-smote: imbalanced_benchmark.py (view on GitHub)
# (tail of the preceding classifier configuration list)
            [{
                'n_neighbors': [3, 5, 8]
            }]
        )
    ]
    oversampling_methods = [
        ('None',None),
        ('RandomOverSampler', RandomOverSampler()),
        (
            'SMOTE', SMOTE(),
            [{
                'k_neighbors': [3,5,20]
            }]
        ),
        (
            'B1-SMOTE', SMOTE(kind='borderline1'),
            [{
                'k_neighbors': [3,5,20]
            }]
        ),
        (
            'B2-SMOTE', SMOTE(kind='borderline2'),
            [{
                'k_neighbors': [3,5,20]
            }]
        ),
        (
            'KMeansSMOTE', KMeansSMOTE(),
            [
                {
                    'imbalance_ratio_threshold': [1,float('Inf')],
                    'density_power': [0, 2, None], # None corresponds to n_features