How to use the sklearn.model_selection.train_test_split function in sklearn

To help you get started, we’ve selected a few train_test_split examples based on popular ways it is used in public projects.

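Before the project snippets below, here is a minimal, self-contained sketch of the basic call (the array shapes and names are illustrative, not taken from any of the projects). Note the return order: the train/test pieces of X come first, followed by the train/test pieces of y.

import numpy as np
from sklearn.model_selection import train_test_split

# Illustrative data: 100 samples, 4 features, binary labels
X = np.random.rand(100, 4)
y = np.random.randint(0, 2, size=100)

# Hold out 25% of the rows for testing; random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

print(X_train.shape, X_test.shape)  # (75, 4) (25, 4)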

github AstarLight / Keras-image-classifer-framework / invoice-code / train.py View on Github external
        image = img_to_array(image)
        data.append(image)

        # extract the class label from the image path and update the
        # labels list
        label = int(imagePath.split(os.path.sep)[-2])       
        labels.append(label)  
        
    # scale the raw pixel intensities to the range [0, 1]
    data = np.array(data, dtype="float") / 255.0
    labels = np.array(labels)


    # partition the data into training and testing splits using 75% of
    # the data for training and the remaining 25% for testing
    (trainX, testX, trainY, testY) = train_test_split(data,
            labels, test_size=0.25, random_state=42)

    # convert the labels from integers to vectors
    trainY = to_categorical(trainY, num_classes=CLASS_NUM)
    testY = to_categorical(testY, num_classes=CLASS_NUM)   
    return trainX,trainY,testX,testY
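The split above relies on plain random shuffling. When the class labels are imbalanced, the stratify parameter of train_test_split keeps the class proportions roughly the same in both partitions; a small sketch with made-up data (not part of the project above):

import numpy as np
from sklearn.model_selection import train_test_split

# Imbalanced toy labels: 90 samples of class 0, 10 samples of class 1
X = np.random.rand(100, 8)
y = np.array([0] * 90 + [1] * 10)

# stratify=y preserves the 90/10 class ratio in both the train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y)

print(np.bincount(y_train), np.bincount(y_test))  # roughly a 9:1 ratio in each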
github tomaszkacmajor / CarND-Traffic-Sign-Classifier-P2 / tf_feedforward.py View on Github external
"""
TensorFlow feedforward net on German Traffic Sign Dataset
"""
import tensorflow as tf
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

with open('data/train.p', mode='rb') as f:
    train = pickle.load(f)
with open('data/test.p', mode='rb') as f:
    test = pickle.load(f)

X_train, X_val, y_train, y_val = train_test_split(train['features'], train['labels'], test_size=0.33, random_state=0)
X_test, y_test = test['features'], test['labels']

X_train = X_train.astype('float32')
X_val = X_val.astype('float32')
X_test = X_test.astype('float32')

# 0-255 -> 0-1
X_train /= 255
X_val /= 255
X_test /= 255

batch_size = 64
n_classes = 43 # number of traffic signs
epochs = 10
input_shape = X_train.shape[1:]
feature_size = np.prod(input_shape)
github suanrong / SDNE / utils / utils.py View on Github external
def check_multi_label_classification(X, Y, test_ratio = 0.9):
    def small_trick(y_test, y_pred):
        y_pred_new = np.zeros(y_pred.shape, dtype=bool)  # np.bool was removed in recent NumPy; use the builtin bool
        sort_index = np.flip(np.argsort(y_pred, axis = 1), 1)
        for i in range(y_test.shape[0]):
            num = sum(y_test[i])
            for j in range(num):
                y_pred_new[i][sort_index[i][j]] = True
        return y_pred_new
        
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = test_ratio)
    clf = OneVsRestClassifier(LogisticRegression())
    clf.fit(x_train, y_train)
    y_pred = clf.predict_proba(x_test)
    
    ## small trick: we assume we know how many labels to predict
    y_pred = small_trick(y_test, y_pred)
    
    micro = f1_score(y_test, y_pred, average = "micro")
    macro = f1_score(y_test, y_pred, average = "macro")
    return "micro_f1: %.4f macro_f1 : %.4f" % (micro, macro)
    #############################################
github kleinzcy / speech_signal_processing / GMM_UBM.py View on Github external
def load_extract(test_size=0.3):
    # combination load_data and extract_feature
    x, y = load_data()
    # train test split
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=0)
    # extract feature from train
    train_data, x_train, y_train = extract_feature(x=x_train, y=y_train, is_train=True)
    # extract feature from test
    x_test, y_test = extract_feature(x=x_test,y=y_test)

    return train_data, x_train, y_train, x_test, y_test
github Azure / MachineLearningNotebooks / how-to-use-azureml / training / train-on-remote-vm / train.py View on Github external
os.makedirs('./outputs', exist_ok=True)
parser = argparse.ArgumentParser()
parser.add_argument('--data-folder', type=str,
                    dest='data_folder', help='data folder')
args = parser.parse_args()

print('Data folder is at:', args.data_folder)
print('List all files: ', os.listdir(args.data_folder))

X = np.load(os.path.join(args.data_folder, 'features.npy'))
y = np.load(os.path.join(args.data_folder, 'labels.npy'))

run = Run.get_context()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)
data = {"train": {"X": X_train, "y": y_train},
        "test": {"X": X_test, "y": y_test}}

# alpha values from 0.0 to 0.95 in steps of 0.05 (np.arange excludes the stop value)
alphas = np.arange(0.0, 1.0, 0.05)

for alpha in alphas:
    # Use Ridge algorithm to create a regression model
    reg = Ridge(alpha=alpha)
    reg.fit(data["train"]["X"], data["train"]["y"])

    preds = reg.predict(data["test"]["X"])
    mse = mean_squared_error(preds, data["test"]["y"])
    run.log('alpha', alpha)
    run.log('mse', mse)
github giulbia / baby_cry_detection / baby_cry_detection / pc_methods / train_classifier.py View on Github external
def train(self):
        """
        Train Random Forest

        :return: pipeline, best_param, best_estimator, perf
        """

        logging.info('Splitting train and test set. Test set size: 25%')

        # Split into training and test set
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y,
                                                            test_size=0.25,
                                                            random_state=0,
                                                            stratify=self.y)

        logging.info('Train set size: {0}. Test set size: {1}'.format(y_train.size, y_test.size))

        pipeline = Pipeline([
            ('scl', StandardScaler()),
            # ('lda', LinearDiscriminantAnalysis()),
            ('clf', SVC(probability=True))
        ])

        # GridSearch
        param_grid = [{'clf__kernel': ['linear', 'rbf'],
                       'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
                       'clf__gamma': np.logspace(-2, 2, 5),
github atyryshkina / algorithm-performance-analysis / train_model.py View on Github external
regr = sklearn.ensemble.RandomForestRegressor(n_estimators=100, max_depth=12)

        self.pipe = sklearn.pipeline.Pipeline([
            ('chooser',chooser),
            ('scaler', scaler),
            ('regr', regr)
        ])
        
        test_size = 0.2
        test_start=len(df_labels)-int(len(df_labels)*test_size)
        print(test_start, len(df_labels))

        # print("self.args.split_randomly ", self.args.split_randomly)

        if ast.literal_eval(self.args.split_train_test) and (ast.literal_eval(self.args.split_randomly)):
            tr_features, ev_features, tr_labels, ev_labels = sklearn.model_selection.train_test_split(df_features, df_labels, test_size=test_size)
            print("splitting randomly")
        elif ast.literal_eval(self.args.split_train_test):
            tr_features, tr_labels, ev_features, ev_labels = df_features[:test_start], df_labels[:test_start], df_features[test_start:], df_labels[test_start:]
            print("splitting non-randomly")
        else:
            tr_features, tr_labels, ev_features, ev_labels = df_features,df_labels,df_features,df_labels
            print("not splitting")
    

        print("fitting the model...")
        self.pipe.fit(tr_features, tr_labels)
        ev_pred = self.pipe.predict(ev_features)

        # unlog the runtimes (they were previously log transformed in the function clean_data()) 
        cq=pd.DataFrame()
        cq["labels"] = np.expm1(ev_labels)
github AmazingDD / daisyRec / daisy / utils / loader.py View on Github external
end_idx = grp.index[-1]

            return list(range(split_idx, end_idx + 1))

        test_index = df.groupby('user').apply(time_split).explode().values
        test_set = df.loc[test_index, :]
        train_set = df[~df.index.isin(test_index)]

    elif test_method == 'tfo':
        # df = df.sample(frac=1)
        df = df.sort_values(['timestamp']).reset_index(drop=True)
        split_idx = int(np.ceil(len(df) * (1 - test_size)))
        train_set, test_set = df.iloc[:split_idx, :].copy(), df.iloc[split_idx:, :].copy()

    elif test_method == 'fo':
        train_set, test_set = train_test_split(df, test_size=test_size, random_state=2019)

    elif test_method == 'tloo':
        # df = df.sample(frac=1)
        df = df.sort_values(['timestamp']).reset_index(drop=True)
        df['rank_latest'] = df.groupby(['user'])['timestamp'].rank(method='first', ascending=False)
        train_set, test_set = df[df['rank_latest'] > 1].copy(), df[df['rank_latest'] == 1].copy()
        del train_set['rank_latest'], test_set['rank_latest']

    elif test_method == 'loo':
        # # slow method
        # test_set = df.groupby(['user']).apply(pd.DataFrame.sample, n=1).reset_index(drop=True)
        # test_key = test_set[['user', 'item']].copy()
        # train_set = df.set_index(['user', 'item']).drop(pd.MultiIndex.from_frame(test_key)).reset_index().copy()

        # # quick method
        test_index = df.groupby(['user']).apply(lambda grp: np.random.choice(grp.index))
github jw15 / wildflower-finder / src / cnn_resnet50.py View on Github external
def train_validation_split(x, y):
    '''
    Splits train and validation data and images. (Will also load test images, names from saved array).
    Input: saved numpy array, files/columns in that array
    Output: Train/validation data (e.g., X_train, X_test, y_train, y_test), test images, test image names (file names minus '.png')
    '''
    # Encode flower categories as numerical values
    number = LabelEncoder()
    y = number.fit_transform(y.astype('str'))

    # Split train and test subsets to get final test data (don't change this)
    X_training, X_test_holdout, y_training, y_test_holdout = train_test_split(x, y, stratify=y, random_state=42, test_size=.2)
    print('Initial split for (holdout) test data:\n \
    X_training: {} \n \
    y_training: {} \n \
    X_test_holdout: {} \n \
    y_test_holdout: {} \n'.format(X_training.shape, y_training.shape, X_test_holdout.shape, y_test_holdout.shape))

    # Split train into train and validation data (different for each model):
    X_train, X_test, y_train, y_test = train_test_split(X_training, y_training, stratify=y_training, random_state=seed, test_size=.2)
    train_classes = len(np.unique(y_train))
    test_classes = len(np.unique(y_test))

    print('Train/validation split for this model:\n \
    X_train: {} \n \
    y_train: {} \n \
    X_test: {} \n \
    y_test: {} \n \
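Condensed into a standalone sketch, the two-stage pattern used above (first carve off a stratified holdout test set, then split the remainder into train and validation) looks roughly like this; the array shapes and the fixed seed are illustrative assumptions, not values from the project:

import numpy as np
from sklearn.model_selection import train_test_split

# Made-up image-like data and labels; shapes are illustrative only
x = np.random.rand(500, 32, 32, 3)
y = np.random.randint(0, 5, size=500)

# First split: hold out 20% as a final (stratified) test set
X_training, X_test_holdout, y_training, y_test_holdout = train_test_split(
    x, y, stratify=y, random_state=42, test_size=0.2)

# Second split: carve a validation set out of the remaining training data
X_train, X_val, y_train, y_val = train_test_split(
    X_training, y_training, stratify=y_training, random_state=42, test_size=0.2)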