import numpy as np
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ShuffleSplit

# X is assumed to be an (n_trials, n_channels, n_times) array and y a list of label
# arrays produced earlier in the example.
y = np.concatenate(y)
cv = ShuffleSplit(n_splits=10, test_size=0.2)
pipe = True  # use pipeline?
for train_idx, test_idx in cv.split(X):
    y_train, y_test = y[train_idx], y[test_idx]
    # define transformer objects
    scaler = preprocessing.StandardScaler()
    concatenator = ConcatenateChannels()  # custom transformer that flattens (channels, times) into features
    clf = SVC(C=1, kernel='linear')
    if pipe is not True:
        # Concatenate channels
        concatenator = concatenator.fit(X[train_idx, :, :], y_train)
        X_train = concatenator.transform(X[train_idx, :, :])
        # Scale data across trials
        X_train = scaler.fit_transform(X_train)
        X_test = concatenator.transform(X[test_idx, :, :])
        # Reuse the scaler fitted on the training data; do not refit on the test set.
        X_test = scaler.transform(X_test)
        clf = clf.fit(X_train, y_train)
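    # A minimal sketch of the pipeline branch (pipe is True); it is not part of the
    # original snippet and assumes the same custom ConcatenateChannels transformer.
    else:
        clf_pipe = Pipeline([
            ('concat', ConcatenateChannels()),           # flatten (channels, times)
            ('scaler', preprocessing.StandardScaler()),  # scale the flattened features
            ('svm', SVC(C=1, kernel='linear')),
        ])
        clf_pipe.fit(X[train_idx, :, :], y_train)
        print('Fold accuracy: %.3f' % clf_pipe.score(X[test_idx, :, :], y_test))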
# Imports this test relies on (assumed to sit at module level in the original
# sklearn-onnx test file).
import numpy
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType, Int64TensorType

def test_combine_inputs_floats_ints(self):
    data = [[0, 0.0], [0, 0.0], [1, 1.0], [1, 1.0]]
    scaler = StandardScaler()
    scaler.fit(data)
    model = Pipeline([("scaler1", scaler), ("scaler2", scaler)])
    model_onnx = convert_sklearn(
        model,
        "pipeline",
        [
            ("input1", Int64TensorType([None, 1])),
            ("input2", FloatTensorType([None, 1])),
        ],
    )
    self.assertTrue(model_onnx is not None)
    self.assertTrue(len(model_onnx.graph.node[-1].output) == 1)
    data = numpy.array(data)
    data = {
        "input1": data[:, 0].reshape((-1, 1)).astype(numpy.int64),
from sklearn import preprocessing

def standard_scale(data, data_mean=None, data_std=None):
    """
    Standardize data (zero mean, unit variance per feature).

    Sometimes the data do not fit in memory and must be processed in chunks;
    in that case `data_mean` and `data_std` must be computed beforehand so that
    every chunk is scaled with the same statistics.

    :param data (array): data to standardize.
    :param data_mean (array): precomputed per-feature mean (optional).
    :param data_std (array): precomputed per-feature standard deviation (optional).
    :return (array): standardized data.
    """
    data = data.astype('float')
    if (data_mean is None) and (data_std is None):
        # Use sklearn's default StandardScaler on the full data.
        std_scale = preprocessing.StandardScaler().fit(data)
        return std_scale.transform(data)
    # Custom standardization: the data are probably split across several
    # chunks and the statistics were collected beforehand.
    return (data - data_mean) / data_std
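# A brief usage sketch (not from the original): compute the statistics once on the
# full dataset, then standardize each chunk with the same precomputed values.
import numpy as np

full = np.random.rand(10000, 32)
full_mean = full.mean(axis=0)
full_std = full.std(axis=0)
scaled_chunks = [standard_scale(chunk, data_mean=full_mean, data_std=full_std)
                 for chunk in np.array_split(full, 10)]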
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc

def plot_roc_curve(X, y, plot_dir, trial, cv, model):
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    thresh_plt = 0.0
    thresh_mean = 0.0
    model_nm = str(model).split("(")[0]
    ### Create StratifiedKFold generator (this overrides the `cv` argument)
    cv = StratifiedKFold(n_splits=5, shuffle=True)
    ### Initialize StandardScaler
    scaler = StandardScaler()
    for i, (train, test) in enumerate(cv.split(X, y)):
        X_train = scaler.fit_transform(X[train])
        X_test = scaler.transform(X[test])
        probas_ = model.fit(X_train, y[train]).predict_proba(X_test)
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += np.interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i + 1, roc_auc))
        thresholds[0] = min(1.0, thresholds[0])
        thresholds[-1] = max(0.0, thresholds[-1])
        thresh_mean += np.interp(mean_fpr, np.linspace(0, 1, len(thresholds)), thresholds)
        # plt.plot(fpr, thresholds, lw=1, label='Thresholds %d (%0.2f - %0.2f)' % (i + 1, thresholds.max(), thresholds.min()))
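    # A hedged sketch of how such a function typically finishes (not in the original
    # snippet): average the folds, plot the mean ROC, and save the figure using the
    # plot_dir / trial / model_nm arguments.
    n_splits = cv.get_n_splits(X, y)
    mean_tpr /= n_splits
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--', lw=2, label='Mean ROC (area = %0.2f)' % mean_auc)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right')
    plt.savefig('%s/roc_%s_%s.png' % (plot_dir, model_nm, trial))
    plt.close()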
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('seaborn-deep')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
# Reading in data
ds = pd.read_csv("Social_Network_Ads.csv")
X = ds.iloc[:, 2:4].values
y = ds.iloc[:, 4].values
# Splitting and scaling
X_train, X_test, y_train, y_test = train_test_split(X, y)
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
# Reuse the scaler fitted on the training set; do not refit it on the test set.
X_test = sc_X.transform(X_test)
# Classifier
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
# Plot
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
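# A plausible continuation (not in the original snippet): the usual meshgrid /
# ListedColormap decision-boundary plot that this kind of tutorial template ends with.
import numpy as np
X1, X2 = np.meshgrid(
    np.arange(X_set[:, 0].min() - 1, X_set[:, 0].max() + 1, 0.01),
    np.arange(X_set[:, 1].min() - 1, X_set[:, 1].max() + 1, 0.01))
plt.contourf(X1, X2,
             clf.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha=0.3, cmap=ListedColormap(('red', 'green')))
for j, label in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == label, 0], X_set[y_set == label, 1],
                color=ListedColormap(('red', 'green'))(j), label=label)
plt.legend()
plt.show()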
print("np.array(userDocFeatures):", np.array(userDocFeatures))
print("Initial cosine similarity between doc and means: ", initCosineSim)
# Basic sanity check to make sure cosine threshold correctly identifies
# authorship of user's document.
print("Cosine similarity below threshold? ", str(
initCosineSim < unstyle.controller.t))
# Combine documents and labels. This creates the training set.
X = np.vstack((userOtherFeatures, otherAuthorFeatures))
y = []
y.extend(userLabels)
y.extend(otherAuthorLabels)
print("Training labels: ", y)
# Instantiate classifier; train and predict on scaled data.
scaler = preprocessing.StandardScaler().fit(X)
clf = svm.SVC(probability=True, kernel='linear', C=1.0, class_weight='auto')
clf.fit(scaler.transform(X), y)
print("Predicted author of doc: " +
str(clf.predict(scaler.transform(userDocFeatures))))
print("Certainty: ", clf.predict_proba(scaler.transform(userDocFeatures)))
print("Classifier internal label rep: ", clf.classes_)
# Get feature ranks
unstyle.controller.feature_ranks = rank_features_rfe(
scaler.transform(X), y, featset)
print(str(feature_ranks))
# Get target values for features.
authors = unstyle.controller.numAuthors
unstyle.controller.targets = unstyle.adversarial.compute_target_vals(
userDocFeatures,
import numpy as np
from sklearn.preprocessing import StandardScaler

def preprocess_data(x_train, x_test):
    # log(x + 1) transform of the raw counts
    x_train = np.array(x_train)
    x_test = np.array(x_test)
    x_train = np.log(x_train.astype(int) + 1)
    x_test = np.log(x_test.astype(int) + 1)
    # standardization: fit on the training data, apply to both splits
    sc = StandardScaler(copy=True, with_mean=True, with_std=True)
    sc.fit(x_train)
    x_train = sc.transform(x_train)
    x_test = sc.transform(x_test)
    return x_train, x_test
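# A short usage sketch (not from the original): count-like features are log-transformed
# and standardized with statistics fitted on the training split only.
counts_train = np.random.randint(0, 100, size=(200, 5))
counts_test = np.random.randint(0, 100, size=(50, 5))
x_train_scaled, x_test_scaled = preprocess_data(counts_train, counts_test)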
"""
workspace = args.workspace
data_type = args.data_type
snr = args.snr
# Load data.
t1 = time.time()
hdf5_path = os.path.join(workspace, "packed_features", "spectrogram", data_type, "%ddb" % int(snr), "data.h5")
with h5py.File(hdf5_path, 'r') as hf:
    x = hf.get('x')
    x = np.array(x)  # (n_segs, n_concat, n_freq)
# Compute scaler.
(n_segs, n_concat, n_freq) = x.shape
x2d = x.reshape((n_segs * n_concat, n_freq))
scaler = preprocessing.StandardScaler(with_mean=True, with_std=True).fit(x2d)
print(scaler.mean_)
print(scaler.scale_)
# Write out scaler.
out_path = os.path.join(workspace, "packed_features", "spectrogram", data_type, "%ddb" % int(snr), "scaler.p")
create_folder(os.path.dirname(out_path))
with open(out_path, 'wb') as f:
    pickle.dump(scaler, f)
print("Save scaler to %s" % out_path)
print("Compute scaler finished! %s s" % (time.time() - t1,))
"""
df = df.dropna(axis=1, how='all')
# SimpleImputer (sklearn.impute) replaces the removed sklearn.preprocessing.Imputer.
imputer = SimpleImputer(strategy='mean')
mat = imputer.fit_transform(df)
if scaling is None or scaling.lower() == 'none':
    return pd.DataFrame(mat, columns=df.columns)
if scaling == 'maxabs':
    scaler = MaxAbsScaler()
elif scaling == 'minmax':
    scaler = MinMaxScaler()
else:
    scaler = StandardScaler()
mat = scaler.fit_transform(mat)
df = pd.DataFrame(mat, columns=df.columns)
return df
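# A standalone sketch (not from the original) of the same impute-then-scale flow on a
# small DataFrame, using the 'minmax' option as an example.
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler

demo = pd.DataFrame({"a": [1.0, None, 3.0], "b": [10.0, 20.0, None]})
demo_mat = SimpleImputer(strategy='mean').fit_transform(demo)
demo_scaled = pd.DataFrame(MinMaxScaler().fit_transform(demo_mat), columns=demo.columns)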
self.with_std = with_std
self.scalings = scalings
if not (scalings is None or isinstance(scalings, (dict, str))):
    raise ValueError('scalings type should be dict, str, or None, '
                     'got %s' % type(scalings))
if isinstance(scalings, str):
    _check_option('scalings', scalings, ['mean', 'median'])
if scalings is None or isinstance(scalings, dict):
    if info is None:
        raise ValueError('Need to specify "info" if scalings is '
                         '%s' % type(scalings))
    self._scaler = _ConstantScaler(info, scalings, self.with_std)
elif scalings == 'mean':
    from sklearn.preprocessing import StandardScaler
    # Keyword arguments keep with_mean/with_std from being bound to `copy`.
    self._scaler = StandardScaler(with_mean=self.with_mean,
                                  with_std=self.with_std)
else:  # scalings == 'median'
    if not check_version('sklearn', '0.17'):
        raise ValueError("median requires version 0.17 of "
                         "the sklearn library")
    from sklearn.preprocessing import RobustScaler
    self._scaler = RobustScaler(with_centering=self.with_mean,
                                with_scaling=self.with_std)
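# A hedged usage sketch (not from the original), assuming this fragment is the __init__
# of an mne.decoding.Scaler-style class: scalings='mean' routes to StandardScaler,
# 'median' to RobustScaler, and None/dict to the channel-type based _ConstantScaler.
import numpy as np
from mne.decoding import Scaler

epochs_data = np.random.randn(10, 32, 100)  # (n_epochs, n_channels, n_times)
scaler = Scaler(scalings='mean')            # no `info` needed for 'mean'/'median'
X_scaled = scaler.fit_transform(epochs_data)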