How to use the sklearn.preprocessing.StandardScaler function in sklearn

To help you get started, we’ve selected a few examples of sklearn.preprocessing.StandardScaler, based on popular ways it is used in public projects.
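
Before diving into the project snippets, here is a minimal, self-contained sketch of the core pattern: fit the scaler on training data only, then apply the same learned statistics to held-out data. The array values are illustrative.

import numpy as np
from sklearn.preprocessing import StandardScaler

X_train = np.array([[1.0, 200.0], [2.0, 300.0], [3.0, 400.0]])
X_test = np.array([[1.5, 250.0]])

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # learn mean/std, then transform
X_test_scaled = scaler.transform(X_test)        # reuse the training statistics

print(scaler.mean_)   # per-feature means learned from X_train
print(scaler.scale_)  # per-feature standard deviations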


github mne-tools / mne-python / examples / realtime / offline_testing / test_pipeline.py View on Github external
y = np.concatenate(y)

from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ShuffleSplit  # sklearn.cross_validation was removed in 0.20

cv = ShuffleSplit(n_splits=10, test_size=0.2)

pipe = True  # use pipeline?

for train_idx, test_idx in cv.split(X):
    y_train, y_test = y[train_idx], y[test_idx]

    # define transformer objects
    scaler = preprocessing.StandardScaler()
    concatenator = ConcatenateChannels()
    clf = SVC(C=1, kernel='linear')

    if not pipe:

        # Concatenate channels
        concatenator = concatenator.fit(X[train_idx, :, :], y_train)
        X_train = concatenator.transform(X[train_idx, :, :])

        # Scale data across trials
        X_train = scaler.fit_transform(X_train)

        X_test = concatenator.transform(X[test_idx, :, :])
        X_test = scaler.transform(X_test)  # transform only: reuse training stats, never refit on test data

        clf = clf.fit(X_train, y_train)
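
    The excerpt cuts off before the pipe-is-True branch. A sketch of what that branch presumably builds (not the repo's exact code), chaining the same three steps into one Pipeline so concatenation, scaling, and fitting happen together per fold:

    pipe_clf = Pipeline([
        ('concat', ConcatenateChannels()),           # project-specific transformer
        ('scaler', preprocessing.StandardScaler()),
        ('svm', SVC(C=1, kernel='linear')),
    ])
    pipe_clf.fit(X[train_idx, :, :], y_train)
    score = pipe_clf.score(X[test_idx, :, :], y_test)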
github onnx / sklearn-onnx / tests / test_sklearn_pipeline.py View on Github external
def test_combine_inputs_floats_ints(self):
        data = [[0, 0.0], [0, 0.0], [1, 1.0], [1, 1.0]]
        scaler = StandardScaler()
        scaler.fit(data)
        model = Pipeline([("scaler1", scaler), ("scaler2", scaler)])

        model_onnx = convert_sklearn(
            model,
            "pipeline",
            [
                ("input1", Int64TensorType([None, 1])),
                ("input2", FloatTensorType([None, 1])),
            ],
        )
        self.assertIsNotNone(model_onnx)
        self.assertEqual(len(model_onnx.graph.node[-1].output), 1)
        data = numpy.array(data)
        data = {
            "input1": data[:, 0].reshape((-1, 1)).astype(numpy.int64),
github mouradmourafiq / pandas2sklearn / pandas_sklearn / preprocessing.py View on Github external
def standard_scale(data, data_mean=None, data_std=None):
    """
    Does a standardization over data.
    Sometimes data do not fit in memory and we need to process chunks of data,
    in this case, `data_mean` and `data_std` are required to be calculated before
    scaling.
    :param data (array):
    :param data_mean (array):
    :param data_std (array):
    :return (array): standardized data.
    """
    data = data.astype('float')
    if data_mean is None or data_std is None:
        # no precomputed stats: fall back to sklearn's StandardScaler
        std_scale = preprocessing.StandardScaler().fit(data)
        return std_scale.transform(data)

    # Custom standardization: the data are likely spread across several
    # chunks and the statistics were collected beforehand.
    return (data - data_mean) / data_std
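
For the out-of-memory case the docstring describes, StandardScaler can also accumulate statistics incrementally via partial_fit, which avoids computing data_mean and data_std by hand. A sketch, with chunks standing in for whatever batching the caller uses:

def standard_scale_chunked(chunks):
    """Fit a StandardScaler incrementally over a sequence of 2-D arrays,
    then yield each chunk transformed with the pooled statistics."""
    chunks = list(chunks)
    scaler = preprocessing.StandardScaler()
    for chunk in chunks:
        scaler.partial_fit(chunk)      # running mean/variance update
    for chunk in chunks:
        yield scaler.transform(chunk)  # apply the global statistics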
github bentruitt / MF_MBS_Default_Risk / mf_mrtg_default_model.py View on Github external
def plot_roc_curve(X, y, plot_dir, trial, cv, model):

    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    thresh_plt = 0.0
    thresh_mean = 0.0
    model_nm = str(model).split("(")[0]
    ### Create StratifiedKFold generator (note: this overwrites the cv argument)
    cv = StratifiedKFold(n_splits=5, shuffle=True)
    ### Initialize StandardScaler
    scaler = StandardScaler()

    for i, (train, test) in enumerate(cv.split(X, y)):
        X_train = scaler.fit_transform(X[train])
        X_test = scaler.transform(X[test])
        probas_ = model.fit(X_train, y[train]).predict_proba(X_test)
        # Compute ROC curve and area the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += np.interp(mean_fpr, fpr, tpr)  # scipy.interp was removed; use np.interp
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i+1, roc_auc))
        thresholds[0] = min(1.0, thresholds[0])
        thresholds[-1] = max(0.0, thresholds[-1])
        thresh_mean += np.interp(mean_fpr, np.linspace(0, 1, len(thresholds)), thresholds)
        # plt.plot(fpr, thresholds, lw=1, label='Thresholds %d (%0.2f - %0.2f)' % (i+1, thresholds.max(), thresholds.min())) # np.linspace(0,1,len(thresholds))
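
    Note the leak-free pattern above: fit_transform on each training fold, transform on the matching test fold. Wrapping the scaler and model in a Pipeline makes cross-validation utilities do that refitting automatically. A sketch with the same X, y, and model (make_pipeline and cross_val_score imported here for the example):

    from sklearn.pipeline import make_pipeline
    from sklearn.model_selection import cross_val_score

    pipe = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(pipe, X, y, cv=StratifiedKFold(n_splits=5, shuffle=True))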
github robertmartin8 / udemyML / 05_classification / naivebayes.py View on Github external
import pandas as pd  # needed for pd.read_csv below
import matplotlib.pyplot as plt
plt.style.use('seaborn-deep')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix


# Reading in data
ds = pd.read_csv("Social_Network_Ads.csv")
X = ds.iloc[:, 2:4].values
y = ds.iloc[:,4].values

# Splitting and scaling
X_train, X_test, y_train, y_test = train_test_split(X,y)

sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)  # transform only: reuse the training mean/std

# Classifier
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Plot
from matplotlib.colors import ListedColormap

X_set, y_set = X_train, y_train
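
The excerpt stops at the plotting setup. A typical continuation (illustrative, not the repo's exact code) scores the classifier on a fine mesh over the two scaled features and shades the decision regions:

import numpy as np

X1, X2 = np.meshgrid(
    np.arange(X_set[:, 0].min() - 1, X_set[:, 0].max() + 1, 0.01),
    np.arange(X_set[:, 1].min() - 1, X_set[:, 1].max() + 1, 0.01),
)
grid = np.array([X1.ravel(), X2.ravel()]).T
plt.contourf(X1, X2, clf.predict(grid).reshape(X1.shape),
             alpha=0.5, cmap=ListedColormap(('salmon', 'lightgreen')))
for label, color in zip(np.unique(y_set), ('red', 'green')):
    plt.scatter(X_set[y_set == label, 0], X_set[y_set == label, 1],
                color=color, label=label)
plt.legend()
plt.show()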
github pagea / unstyle / unstyle / controller.py View on Github external
print("np.array(userDocFeatures):", np.array(userDocFeatures))
    print("Initial cosine similarity between doc and means: ", initCosineSim)
    # Basic sanity check to make sure cosine threshold correctly identifies
    # authorship of user's document.
    print("Cosine similarity below threshold? ", str(
        initCosineSim < unstyle.controller.t))

    # Combine documents and labels. This creates the training set.
    X = np.vstack((userOtherFeatures, otherAuthorFeatures))
    y = []
    y.extend(userLabels)
    y.extend(otherAuthorLabels)
    print("Training labels: ", y)

    # Instantiate classifier; train and predict on scaled data.
    scaler = preprocessing.StandardScaler().fit(X)
    clf = svm.SVC(probability=True, kernel='linear', C=1.0,
                  class_weight='balanced')  # 'auto' was renamed 'balanced' in sklearn 0.17
    clf.fit(scaler.transform(X), y)
    print("Predicted author of doc: " +
          str(clf.predict(scaler.transform(userDocFeatures))))
    print("Certainty: ", clf.predict_proba(scaler.transform(userDocFeatures)))
    print("Classifier internal label rep: ", clf.classes_)

    # Get feature ranks
    unstyle.controller.feature_ranks = rank_features_rfe(
        scaler.transform(X), y, featset)
    print(str(feature_ranks))

    # Get target values for features.
    authors = unstyle.controller.numAuthors
    unstyle.controller.targets = unstyle.adversarial.compute_target_vals(
        userDocFeatures,
github weaponsjtu / Kaggle_xBle / model_library.py View on Github external
def preprocess_data(x_train, x_test):
    # log(x+1)
    x_train = np.array(x_train)
    x_test = np.array(x_test)
    x_train = np.log(x_train.astype(int)+1)
    x_test = np.log(x_test.astype(int)+1)

    # standardization
    sc = StandardScaler(copy=True, with_mean=True, with_std=True)
    sc.fit(x_train)
    x_train = sc.transform(x_train)
    x_test = sc.transform(x_test)
    return x_train, x_test
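
    One aside on the log step: NumPy's log1p computes log(x + 1) with better accuracy near zero, so the first block can be written more directly (keeping the original int cast, which assumes count-like features):

    x_train = np.log1p(x_train.astype(int))
    x_test = np.log1p(x_test.astype(int))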
github nsu-ai-team / noise_supression / noise_supression / prepare_data.py View on Github external
"""
    workspace = args.workspace
    data_type = args.data_type
    snr = args.snr
    
    # Load data. 
    t1 = time.time()
    hdf5_path = os.path.join(workspace, "packed_features", "spectrogram", data_type, "%ddb" % int(snr), "data.h5")
    with h5py.File(hdf5_path, 'r') as hf:
        x = hf.get('x')     
        x = np.array(x)     # (n_segs, n_concat, n_freq)
    
    # Compute scaler. 
    (n_segs, n_concat, n_freq) = x.shape
    x2d = x.reshape((n_segs * n_concat, n_freq))
    scaler = preprocessing.StandardScaler(with_mean=True, with_std=True).fit(x2d)
    print(scaler.mean_)
    print(scaler.scale_)
    
    # Write out scaler. 
    out_path = os.path.join(workspace, "packed_features", "spectrogram", data_type, "%ddb" % int(snr), "scaler.p")
    create_folder(os.path.dirname(out_path))
    with open(out_path, 'wb') as f:
        pickle.dump(scaler, f)  # close the file handle deterministically
    
    print("Save scaler to %s" % out_path)
    print("Compute scaler finished! %s s" % (time.time() - t1,))
github deephyper / deephyper / deephyper / benchmarks / nas / candleP1B3 / p1b3.py View on Github external
"""

    df = df.dropna(axis=1, how='all')

    # sklearn's Imputer was removed in 0.22; SimpleImputer (from sklearn.impute)
    # imputes column-wise, matching the old axis=0 behavior.
    imputer = SimpleImputer(strategy='mean')
    mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)

    df = pd.DataFrame(mat, columns=df.columns)

    return df
github mne-tools / mne-python / mne / decoding / transformer.py View on Github external
        self.with_std = with_std
        self.scalings = scalings

        if not (scalings is None or isinstance(scalings, (dict, str))):
            raise ValueError('scalings type should be dict, str, or None, '
                             'got %s' % type(scalings))
        if isinstance(scalings, str):
            _check_option('scalings', scalings, ['mean', 'median'])
        if scalings is None or isinstance(scalings, dict):
            if info is None:
                raise ValueError('Need to specify "info" if scalings is '
                                 '%s' % type(scalings))
            self._scaler = _ConstantScaler(info, scalings, self.with_std)
        elif scalings == 'mean':
            from sklearn.preprocessing import StandardScaler
            self._scaler = StandardScaler(with_mean=self.with_mean,
                                          with_std=self.with_std)
        else:  # scalings == 'median':
            if not check_version('sklearn', '0.17'):
                raise ValueError("median requires version 0.17 of "
                                 "sklearn library")
            from sklearn.preprocessing import RobustScaler
            self._scaler = RobustScaler(with_centering=self.with_mean,
                                        with_scaling=self.with_std)