How to use the imblearn.over_sampling.RandomOverSampler class in imblearn

To help you get started, we’ve selected a few RandomOverSampler examples, based on popular ways it is used in public projects.

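Before the project excerpts, here is a minimal self-contained sketch (assuming imbalanced-learn 0.4 or later, where fit_resample is the resampling entry point; several excerpts below were written against the older fit_sample name):

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

# Toy imbalanced binary dataset: roughly a 90/10 class split
X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)

ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)

print(Counter(y))            # minority class is under-represented
print(Counter(y_resampled))  # both classes now match the majority count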

github dermatologist / nlp-qrmine / src / nlp_qrmine / nnet.py View on Github external
dataset.dropna(inplace=True)

# summarize the number of rows and columns in the dataset after listwise drop
(sample, vnum) = dataset.shape
print(sample, vnum)

# Get the number of variables
vnum = vnum - 1

# split into IVs and DV
values = dataset.values
X = values[:, 0:vnum]
y = values[:, vnum]

# Oversampling (fit_sample was renamed fit_resample in imbalanced-learn 0.4)
ros = RandomOverSampler(random_state=0)
X_R, y_R = ros.fit_resample(X, y)

# create model
model = Sequential()
model.add(Dense(12, input_dim=vnum, kernel_initializer='uniform', activation='relu'))
model.add(Dense(8, kernel_initializer='uniform', activation='relu'))
model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the model
model.fit(X_R, y_R, epochs=150, batch_size=10, verbose=2)

# calculate predictions
predictions = model.predict(X)
# round predictions
rounded = [round(x[0]) for x in predictions]
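One caveat about the excerpt above: it oversamples before any train/test split and then predicts on the same X, so the reported accuracy will be optimistic. A common alternative (a sketch reusing the X and y above) resamples only the training portion:

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# Hold out untouched data for evaluation, then balance only the training split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.25, random_state=0)
ros = RandomOverSampler(random_state=0)
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)
# model.fit(X_train_res, y_train_res); evaluate on X_test / y_test
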
github dLutscher / MixMatch-TransferLearning / make_x_ray_dataset.py View on Github external
_ = save_dataset(args.data_dir, X = X_test, y = Y_test,
                     train=False, balanced=False)


    print('UPSAMPLING ORIGINAL DATASET TO REDUCE CLASS IMBALANCE\n')
    class_weighting = cw.compute_class_weight('balanced',
                                              np.unique(y),
                                              y)
    print(f'ORIGINAL CLASS BALANCE: {class_weighting}')

    # save original dimensions (except nr of samples) for reshaping later
    X_train_shape = list(X_train.shape[1:])
    X_test_shape = list(X_test.shape[1:])

    # Do the oversampling (ratio= was renamed sampling_strategy= in 0.4)
    ros = RandomOverSampler(sampling_strategy='auto')
    X_train_balanced, Y_train_balanced = ros.fit_resample(
                              X=np.reshape(X_train, [X_train.shape[0], -1]),
                              y=Y_train)
    X_test_balanced, Y_test_balanced = ros.fit_resample(
                              X=np.reshape(X_test, [X_test.shape[0], -1]),
                              y=Y_test)

    # Reshape into original dimensions
    X_train_balanced = np.reshape(X_train_balanced,
                                 [len(X_train_balanced)] + X_train_shape)
    X_test_balanced = np.reshape(X_test_balanced,
                                 [len(X_test_balanced)] + X_test_shape)

    class_weight = cw.compute_class_weight('balanced',
                                           np.unique(Y_train_balanced),
                                           Y_train_balanced)
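The flatten/resample/reshape dance above is needed because the sampler only accepts 2-D input. A condensed sketch of the same pattern, using hypothetical image-shaped data (note also that resampling a test set, as the excerpt does, is unusual; most workflows balance the training data only):

import numpy as np
from imblearn.over_sampling import RandomOverSampler

# Hypothetical batch of 100 RGB images, 90/10 class split
X_img = np.random.rand(100, 32, 32, 3)
y_img = np.array([0] * 90 + [1] * 10)

ros = RandomOverSampler(random_state=0)
# Flatten to 2-D for the sampler, then restore the original shape
X_flat, y_res = ros.fit_resample(X_img.reshape(len(X_img), -1), y_img)
X_res = X_flat.reshape((-1,) + X_img.shape[1:])
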
github HunterMcGushion / hyperparameter_hunter / examples / feature_engineering_examples / imblearn_resampling_example.py View on Github external
def over_sample_random(train_inputs, train_targets):
    sampler = RandomOverSampler(random_state=32)
    train_inputs, train_targets = _sampler_helper(sampler, train_inputs, train_targets)
    return train_inputs, train_targets
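_sampler_helper is defined elsewhere in that project; a plausible minimal stand-in (the body below is our guess, not the project's code) would simply delegate to fit_resample:

def _sampler_helper(sampler, train_inputs, train_targets):
    # Hypothetical stand-in: resample and hand the arrays back
    train_inputs, train_targets = sampler.fit_resample(train_inputs, train_targets)
    return train_inputs, train_targets
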
github pierre-chaville / automlk / automlk / solutions_pp.py View on Github external
PpSolution('FR-SVD', 'Truncated SVD', TransformerTruncatedSVD, default_truncated_svd, space_truncated_svd,
               'feature',
               limit_size=50),
    PpSolution('FR-ICA', 'Fast ICA', TransformerFastICA, default_fast_ica, space_fast_ica, 'feature', limit_size=50),
    PpSolution('FR-PCA', 'PCA', TransformerPCA, default_pca, space_pca, 'feature', limit_size=50),

    # feature selection from model
    PpSolution('FR-RFR', 'Selection RF', TransformerSelectionRfR, default_sel_rf, space_sel_rf, 'feature',
               problem_type='regression'),
    PpSolution('FR-RFC', 'Selection RF', TransformerSelectionRfC, default_sel_rf, space_sel_rf, 'feature',
               problem_type='classification'),
    PpSolution('FR-LR', 'Selection LSVR', TransformerSelectionLinearSVR, {}, {}, 'feature', problem_type='regression'),

    # sampling solutions
    PpSolution('SP-PASS', 'No re-sampling', NoSampling, {}, {}, 'sampling'),
    PpSolution('SP-ROS', 'Random Over', RandomOverSampler, {}, {}, 'sampling'),
    PpSolution('SP-SMOTE', 'SMOTE', SMOTE, {}, {}, 'sampling'),
]

# mapping table
pp_solutions_map = {s.ref: s for s in pp_solutions}

# default pre-processing lists
pp_def_lgbm = ['MS-FIXED', 'FL-PASS', 'DT-DT', 'CE-LAB', 'TX-W2V', 'SC-PASS', 'FR-PASS']
pp_def_trees = ['MS-FIXED', 'FL-PASS', 'DT-DT', 'CE-LAB', 'TX-W2V', 'SC-PASS', 'FR-PASS']
pp_def_knn = ['MS-FIXED', 'FL-PASS', 'DT-DT', 'CE-HOT', 'TX-W2V', 'SC-STD', 'FR-PASS']
pp_def_linear = ['MS-FIXED', 'FL-LOG', 'DT-DT', 'CE-HOT', 'TX-W2V', 'SC-ROBUST', 'FR-PASS']
pp_def_NN = ['MS-FIXED', 'FL-LOG', 'DT-DT', 'CE-HOT', 'TX-W2V', 'SC-MINMAX', 'FR-PASS']

pp_list_lgbm = ['CE-LAB', 'CE-HOT', 'CE-BASE', 'CE-HASH',
                'FL-PASS', 'FL-LOG', 'FL-SQRT',
                'DT-DT', 'DT-YMD', 'DT-MD',
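In a registry like this one, RandomOverSampler is just one interchangeable sampling step. imblearn's own Pipeline gives it the same plug-in role, applying resampling during fit only and never at predict time (a sketch, assuming scikit-learn is installed alongside imbalanced-learn):

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression

# Swap the 'sampling' step for SMOTE() or drop it entirely to compare
pipe = Pipeline([
    ('sampling', RandomOverSampler(random_state=0)),
    ('model', LogisticRegression(max_iter=1000)),
])
# pipe.fit(X_train, y_train); pipe.predict(X_test) applies no resampling
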
github IBM / lale / lale / lib / imblearn / random_over_sampler.py View on Github external
def __init__(self, operator = None, sampling_strategy='auto', random_state=None):
        if operator is None:
            raise ValueError("Operator is a required argument.")

        self._hyperparams = {
            'sampling_strategy': sampling_strategy,
            'random_state': random_state}

        resampler_instance = OrigModel(**self._hyperparams)
        super(RandomOverSamplerImpl, self).__init__(
            operator = operator,
            resampler = resampler_instance)
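The wrapper above forwards exactly two hyperparameters. For reference, sampling_strategy accepts 'auto' (equivalent to 'not majority' for over-samplers), a float giving the desired minority/majority ratio for binary problems, or a dict mapping class label to the target sample count (the labels and counts below are made up):

from imblearn.over_sampling import RandomOverSampler

# Request exactly 500 samples of each class after resampling
ros = RandomOverSampler(sampling_strategy={0: 500, 1: 500}, random_state=42)
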
github scikit-learn-contrib / imbalanced-learn / examples / over-sampling / plot_random_over_sampling.py View on Github external
print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=200, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply the random over-sampling
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
                 alpha=0.5)
c1 = ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
                 alpha=0.5)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5)
github HealthCatalyst / healthcareai-py / healthcareai / common / transformers.py View on Github external
def transform(self, X, y=None):
        """Transform the dataframe."""
        # TODO how do we validate this happens before train/test split? Or do we need to? Can we implement it in the
        # TODO      simple trainer in the correct order and leave this to advanced users?

        # Extract predicted column
        y = np.squeeze(X[[self.predicted_column]])

        # Copy the dataframe without the predicted column
        temp_dataframe = X.drop([self.predicted_column], axis=1)

        # Initialize and fit the over-sampler
        over_sampler = RandomOverSampler(random_state=self.random_seed)
        x_over_sampled, y_over_sampled = over_sampler.fit_resample(temp_dataframe, y)

        # Build the resulting over-sampled dataframe
        result = pd.DataFrame(x_over_sampled)

        # Restore the column names
        result.columns = temp_dataframe.columns

        # Restore the y values
        y_over_sampled = pd.Series(y_over_sampled)
        result[self.predicted_column] = y_over_sampled

        return result
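The manual column restoration above predates "pandas in, pandas out" support; with a recent imbalanced-learn (0.6 or later, to the best of our knowledge), RandomOverSampler accepts a DataFrame and returns one with columns and dtypes intact, so the transform collapses to a few lines (a sketch with made-up column names):

import pandas as pd
from imblearn.over_sampling import RandomOverSampler

df = pd.DataFrame({'age': [25, 32, 47, 51], 'bmi': [21.0, 27.5, 30.1, 24.8]})
target = pd.Series([0, 0, 0, 1], name='label')

ros = RandomOverSampler(random_state=0)
df_res, target_res = ros.fit_resample(df, target)
# Reattach the target column, mirroring what the transform above returns
result = df_res.assign(label=target_res.values)
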
github ustunb / dcptree / dcptree / group_helper.py View on Github external
group_names = []
    group_labels = []
    group_values = []
    for n, g in groups.items():
        group_names.append(n)
        group_values.append(g['indices'])
        group_labels.append(g['labels'][g['indices']])
    group_values = np.transpose(np.vstack(group_values))
    group_labels = np.transpose(np.vstack(group_labels))

    # get unique ids for each combination of group attributes
    _, profile_idx = np.unique(group_values, axis = 0, return_inverse = True)
    profile_labels = range(0, np.max(profile_idx) + 1)

    # oversample labels
    ros = RandomOverSampler(**kwargs)
    X = np.array(data['X'])
    Y = np.array(data['Y'])
    X_res = []
    Y_res = []
    G_res = []
    assert np.isin((-1, 1), Y).all()

    for i in profile_labels:
        row_idx = np.isin(profile_idx, i)
        profile_values = group_labels[row_idx, :][0]
        Xg = X[row_idx, :]
        Yg = Y[row_idx]
        if np.isin((-1, 1), Yg).all():
            Xs, Ys = ros.fit_resample(Xg, Yg)
            X_res.append(Xs)
            Y_res.append(Ys)
github salan668 / FAE / FAE / FeatureAnalysis / DataBalance.py View on Github external
def __init__(self):
        super(UpSampling, self).__init__(RandomOverSampler(random_state=RANDOM_SEED[BALANCE_UP_SAMPLING]),
                                         BALANCE_UP_SAMPLING)