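The first snippet opens mid-way through an image-loading loop. A hedged reconstruction of the missing preamble follows; the imports, function name, image size, and CLASS_NUM value are assumptions inferred from the body below, not part of the original.
import os
import cv2
import numpy as np
from imutils import paths
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.utils import to_categorical

CLASS_NUM = 62  # assumption: class count encoded in the directory names

def load_data(dataset_path):
    data, labels = [], []
    # loop over the input images
    for imagePath in sorted(paths.list_images(dataset_path)):
        # load each image and resize it to a fixed size (size is an assumption)
        image = cv2.imread(imagePath)
        image = cv2.resize(image, (32, 32))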
        image = img_to_array(image)
        data.append(image)
        # extract the class label from the image path and update the
        # labels list
        label = int(imagePath.split(os.path.sep)[-2])
        labels.append(label)
    # scale the raw pixel intensities to the range [0, 1]
    data = np.array(data, dtype="float") / 255.0
    labels = np.array(labels)
    # partition the data into training and testing splits using 75% of
    # the data for training and the remaining 25% for testing
    (trainX, testX, trainY, testY) = train_test_split(data,
        labels, test_size=0.25, random_state=42)
    # convert the labels from integers to vectors
    trainY = to_categorical(trainY, num_classes=CLASS_NUM)
    testY = to_categorical(testY, num_classes=CLASS_NUM)
    return trainX, trainY, testX, testY
"""
TensorFlow feedforward net on German Traffic Sign Dataset
"""
import tensorflow as tf
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
with open('data/train.p', mode='rb') as f:
    train = pickle.load(f)
with open('data/test.p', mode='rb') as f:
    test = pickle.load(f)
X_train, X_val, y_train, y_val = train_test_split(train['features'], train['labels'], test_size=0.33, random_state=0)
X_test, y_test = test['features'], test['labels']
X_train = X_train.astype('float32')
X_val = X_val.astype('float32')
X_test = X_test.astype('float32')
# 0-255 -> 0-1
X_train /= 255
X_val /= 255
X_test /= 255
batch_size = 64
n_classes = 43 # number of traffic signs
epochs = 10
input_shape = X_train.shape[1:]
feature_size = np.prod(input_shape)
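The snippet defines hyperparameters but stops before the network itself. Below is a minimal sketch of what such a feedforward net could look like with the tf.keras API, assuming integer class labels; the hidden-layer width and optimizer are assumptions:
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=input_shape),        # -> feature_size
    tf.keras.layers.Dense(512, activation='relu'),           # width is a guess
    tf.keras.layers.Dense(n_classes, activation='softmax'),
])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',        # integer labels
              metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
          validation_data=(X_val, y_val))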
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

def check_multi_label_classification(X, Y, test_ratio=0.9):
    def small_trick(y_test, y_pred):
        # for each sample, keep the top-k scored labels, where k is that
        # sample's true label count
        y_pred_new = np.zeros(y_pred.shape, bool)
        sort_index = np.flip(np.argsort(y_pred, axis=1), 1)
        for i in range(y_test.shape[0]):
            num = sum(y_test[i])
            for j in range(num):
                y_pred_new[i][sort_index[i][j]] = True
        return y_pred_new

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_ratio)
    clf = OneVsRestClassifier(LogisticRegression())
    clf.fit(x_train, y_train)
    y_pred = clf.predict_proba(x_test)
    # small trick: we assume we know how many labels to predict per sample
    y_pred = small_trick(y_test, y_pred)
    micro = f1_score(y_test, y_pred, average="micro")
    macro = f1_score(y_test, y_pred, average="macro")
    return "micro_f1: %.4f macro_f1: %.4f" % (micro, macro)
#############################################
def load_extract(test_size=0.3):
    # combine load_data and extract_feature
    x, y = load_data()
    # train/test split
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=0)
    # extract features from the training set
    train_data, x_train, y_train = extract_feature(x=x_train, y=y_train, is_train=True)
    # extract features from the test set
    x_test, y_test = extract_feature(x=x_test, y=y_test)
    return train_data, x_train, y_train, x_test, y_test
import argparse
import os

import numpy as np
from azureml.core import Run
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

os.makedirs('./outputs', exist_ok=True)
parser = argparse.ArgumentParser()
parser.add_argument('--data-folder', type=str,
                    dest='data_folder', help='data folder')
args = parser.parse_args()
print('Data folder is at:', args.data_folder)
print('List all files:', os.listdir(args.data_folder))
X = np.load(os.path.join(args.data_folder, 'features.npy'))
y = np.load(os.path.join(args.data_folder, 'labels.npy'))
run = Run.get_context()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)
data = {"train": {"X": X_train, "y": y_train},
        "test": {"X": X_test, "y": y_test}}
# alphas from 0.0 to 0.95 in 0.05 steps (np.arange excludes the stop value)
alphas = np.arange(0.0, 1.0, 0.05)
for alpha in alphas:
    # use the Ridge algorithm to create a regression model
    reg = Ridge(alpha=alpha)
    reg.fit(data["train"]["X"], data["train"]["y"])
    preds = reg.predict(data["test"]["X"])
    mse = mean_squared_error(data["test"]["y"], preds)
    run.log('alpha', alpha)
    run.log('mse', mse)
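    # Hedged addition (not in the original snippet): persist each fitted model
    # to ./outputs, which the earlier os.makedirs('./outputs', ...) call
    # suggests is the intent; the filename pattern is an assumption.
    import joblib  # inline import to keep this sketch self-contained
    joblib.dump(value=reg, filename='outputs/ridge_{0:.2f}.pkl'.format(alpha))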
def train(self):
    """
    Train an SVC classification pipeline with grid search
    :return: pipeline, best_param, best_estimator, perf
    """
    logging.info('Splitting train and test set. Test set size: 25%')
    # split into training and test set
    X_train, X_test, y_train, y_test = train_test_split(self.X, self.y,
                                                        test_size=0.25,
                                                        random_state=0,
                                                        stratify=self.y)
    logging.info('Train set size: {0}. Test set size: {1}'.format(y_train.size, y_test.size))
    pipeline = Pipeline([
        ('scl', StandardScaler()),
        # ('lda', LinearDiscriminantAnalysis()),
        ('clf', SVC(probability=True))
    ])
    # grid search over kernel, regularization strength, and RBF width
    param_grid = [{'clf__kernel': ['linear', 'rbf'],
                   'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
                   'clf__gamma': np.logspace(-2, 2, 5)}]
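    # Hedged continuation (the original snippet ends at the parameter grid):
    # fit the grid search and return the artifacts named in the docstring.
    # The scoring metric and fold count are assumptions.
    gs = GridSearchCV(estimator=pipeline, param_grid=param_grid,
                      scoring='accuracy', cv=5, n_jobs=-1)
    gs.fit(X_train, y_train)
    perf = gs.score(X_test, y_test)
    return pipeline, gs.best_params_, gs.best_estimator_, perf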
# chooser (a feature-selection step) and scaler are defined earlier in the
# enclosing class; this snippet only assembles them into a pipeline
regr = sklearn.ensemble.RandomForestRegressor(n_estimators=100, max_depth=12)
self.pipe = sklearn.pipeline.Pipeline([
    ('chooser', chooser),
    ('scaler', scaler),
    ('regr', regr)
])
test_size = 0.2
test_start = len(df_labels) - int(len(df_labels) * test_size)
print(test_start, len(df_labels))
# print("self.args.split_randomly ", self.args.split_randomly)
if ast.literal_eval(self.args.split_train_test) and ast.literal_eval(self.args.split_randomly):
    tr_features, ev_features, tr_labels, ev_labels = sklearn.model_selection.train_test_split(df_features, df_labels, test_size=test_size)
    print("splitting randomly")
elif ast.literal_eval(self.args.split_train_test):
    tr_features, tr_labels, ev_features, ev_labels = df_features[:test_start], df_labels[:test_start], df_features[test_start:], df_labels[test_start:]
    print("splitting non-randomly")
else:
    tr_features, tr_labels, ev_features, ev_labels = df_features, df_labels, df_features, df_labels
    print("not splitting")
print("fitting the model...")
self.pipe.fit(tr_features, tr_labels)
ev_pred = self.pipe.predict(ev_features)
# un-log the runtimes (they were log-transformed earlier, in clean_data())
cq = pd.DataFrame()
cq["labels"] = np.expm1(ev_labels)
# Hedged reconstruction: the snippet resumes mid-helper inside a chain of
# split strategies, so the branch header and helper signature are assumptions.
if test_method == 'utfo':  # per-user temporal split
    def time_split(grp):
        split_idx = grp.index[0] + int(np.ceil(len(grp) * (1 - test_size)))
        end_idx = grp.index[-1]
        return list(range(split_idx, end_idx + 1))
    test_index = df.groupby('user').apply(time_split).explode().values
    test_set = df.loc[test_index, :]
    train_set = df[~df.index.isin(test_index)]
elif test_method == 'tfo':
    # temporal full-order split: hold out the newest interactions globally
    # df = df.sample(frac=1)
    df = df.sort_values(['timestamp']).reset_index(drop=True)
    split_idx = int(np.ceil(len(df) * (1 - test_size)))
    train_set, test_set = df.iloc[:split_idx, :].copy(), df.iloc[split_idx:, :].copy()
elif test_method == 'fo':
    # plain random split
    train_set, test_set = train_test_split(df, test_size=test_size, random_state=2019)
elif test_method == 'tloo':
    # temporal leave-one-out: hold out each user's most recent interaction
    # df = df.sample(frac=1)
    df = df.sort_values(['timestamp']).reset_index(drop=True)
    df['rank_latest'] = df.groupby(['user'])['timestamp'].rank(method='first', ascending=False)
    train_set, test_set = df[df['rank_latest'] > 1].copy(), df[df['rank_latest'] == 1].copy()
    del train_set['rank_latest'], test_set['rank_latest']
elif test_method == 'loo':
    # leave-one-out: hold out one random interaction per user
    # # slow method
    # test_set = df.groupby(['user']).apply(pd.DataFrame.sample, n=1).reset_index(drop=True)
    # test_key = test_set[['user', 'item']].copy()
    # train_set = df.set_index(['user', 'item']).drop(pd.MultiIndex.from_frame(test_key)).reset_index().copy()
    # # quick method
    test_index = df.groupby(['user']).apply(lambda grp: np.random.choice(grp.index))
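    # Hedged completion (not in the original snippet): derive the train/test
    # sets from test_index, mirroring the temporal branch above.
    test_set = df.loc[test_index, :].copy()
    train_set = df[~df.index.isin(test_index)].copy()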
def train_validation_split(x, y):
    '''
    Splits data into train and validation sets (test images and their names
    are loaded elsewhere from a saved array).
    Input: saved numpy array, files/columns in that array
    Output: train/validation data (X_train, X_test, y_train, y_test), test
    images, test image names (file names minus '.png')
    '''
    # encode flower categories as numerical values
    number = LabelEncoder()
    y = number.fit_transform(y.astype('str'))
    # split off the final (holdout) test data -- don't change this split
    X_training, X_test_holdout, y_training, y_test_holdout = train_test_split(
        x, y, stratify=y, random_state=42, test_size=.2)
    print('Initial split for (holdout) test data:\n \
        X_training: {} \n \
        y_training: {} \n \
        X_test_holdout: {} \n \
        y_test_holdout: {} \n'.format(X_training.shape, y_training.shape, X_test_holdout.shape, y_test_holdout.shape))
    # split train into train and validation data (different for each model);
    # `seed` is assumed to be defined at module level
    X_train, X_test, y_train, y_test = train_test_split(
        X_training, y_training, stratify=y_training, random_state=seed, test_size=.2)
    train_classes = len(np.unique(y_train))
    test_classes = len(np.unique(y_test))
    print('Train/validation split for this model:\n \
        X_train: {} \n \
        y_train: {} \n \
        X_test: {} \n \
        y_test: {} \n'.format(X_train.shape, y_train.shape, X_test.shape, y_test.shape))
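    # Hedged completion: the original snippet is truncated here. The test
    # images and names promised by the docstring are loaded in code not shown,
    # so this sketch returns only the splits.
    return X_train, X_test, y_train, y_test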