How to use the sklearn.preprocessing.LabelEncoder class in sklearn

To help you get started, we’ve selected a few sklearn examples based on popular ways LabelEncoder is used in public projects.
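
As a quick orientation before the project excerpts, here is a minimal, self-contained sketch of the core workflow (toy data invented for illustration): fit learns the sorted set of classes, transform maps each label to an integer in 0..n_classes-1, and inverse_transform maps integers back to the original labels.

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
labels = ['paris', 'tokyo', 'paris', 'amsterdam']  # toy data
codes = le.fit_transform(labels)                   # array([1, 2, 1, 0])
print(le.classes_)                                 # ['amsterdam' 'paris' 'tokyo'] (sorted)
print(le.inverse_transform(codes))                 # back to the original strings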

github feranick / SpectralMachine / Other / img_cnn / test_spectra_cnn.py View on Github
try:
    with open(learnFile, 'r') as f:
        M = np.loadtxt(f, unpack=False)
except OSError:
    print('\033[1m' + ' Learning file not found \n' + '\033[0m')

En = np.delete(np.array(M[0,:]),np.s_[0:1],0)
M = np.delete(M,np.s_[0:1],0)
Cl = ['{:.2f}'.format(x) for x in M[:,0]]
A = np.delete(M,np.s_[0:1],1)
learnFileRoot = os.path.splitext(learnFile)[0]
totA = A
totCl = Cl
numTotClasses = np.unique(totCl).size
le = preprocessing.LabelEncoder()
totCl2 = le.fit_transform(totCl)
Cl2 = le.transform(Cl)
totCl2 = keras.utils.to_categorical(totCl2, num_classes=np.unique(totCl).size)
Cl2 = keras.utils.to_categorical(Cl2, num_classes=np.unique(Cl).size+1)

df = pd.DataFrame()
for i in range(A.shape[0]):
    G = np.dstack([np.ones(En.shape[0]), En])
    G = np.dstack([G, A[0]])
    S = pd.Series(G.tolist())
    df[i] = S.values


S = np.asarray([df.iloc[:,1].values[0]])
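
The pattern to note above: the encoder is fit once on the full class list (totCl) and then reused to transform the subset Cl, which keeps the integer codes consistent across both arrays. When those codes are one-hot encoded, num_classes should normally equal len(le.classes_); the num_classes=np.unique(Cl).size+1 on the second to_categorical call differs from the totCl-based count above it and may be specific to this project. A minimal sketch of the encode-then-one-hot pairing (toy data; assuming the TensorFlow-bundled keras):

import numpy as np
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder

totCl = ['a', 'b', 'c', 'a']              # toy stand-in for the full class list
le = LabelEncoder()
codes = le.fit_transform(totCl)           # [0, 1, 2, 0]
onehot = keras.utils.to_categorical(codes, num_classes=len(le.classes_))
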
github RasaHQ / rasa / rasa / nlu / featurizers / ngram_featurizer.py View on Github
    @staticmethod
    def encode_labels(labels):
        from sklearn import preprocessing

        intent_encoder = preprocessing.LabelEncoder()
        intent_encoder.fit(labels)
        return intent_encoder.transform(labels)
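
fit followed by transform on the same array, as in encode_labels, is equivalent to a single fit_transform call; keeping the fitted encoder around only matters if you later need inverse_transform, and this method discards it. A one-step equivalent (a sketch, not Rasa's code):

from sklearn import preprocessing

def encode_labels(labels):
    # one-step equivalent of fit followed by transform on the same data
    return preprocessing.LabelEncoder().fit_transform(labels)
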
github ckbjimmy / cdc / cdc / src / MLPipeline.py View on Github
    a = alg.split('+')
    alg_list = [alg_list[alg_abbreviation.index(i)] for i in a]
    num_cores = multiprocessing.cpu_count()

    os.chdir(path + 'data/')
    
    print("--- Data preprocessing ---")
    df = pd.read_csv(data, sep='\t')
    df = df.sort_values('fname', ascending=True).reset_index(drop=True)
    df['label'] = pd.read_csv(label, sep='\t', header=None)
    #for i in xrange(len(df.fname.tolist())):
    #    df.label[i] = re.sub(r"(.*)(.*)( \([0-9]+\).xml)", r"\1", df.fname.tolist()[i])
    y_unencoded = df.label

    print("Label encoding")
    encoder = LabelEncoder()
    encoder.fit(y_unencoded)
    y = encoder.transform(y_unencoded)

    pred_table = pd.DataFrame()
    score_table = pd.DataFrame()
    best_score = 0
    
    ff = feature.split('+')
    
    for f in ff:
        
        time_start = time.time()
        
        if alg == 'dnn':
            best_model, best_feature, best_algorithm, best_coef, score_table, pred_table, metrics = \
                Dnn(df=df, y=y, encoder=encoder, class_wt=False, trained_w2v='umls', batch_size=int(df.shape[0]/100), n_epoch=25, \
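
The fitted encoder is passed into the model-building call above, presumably so integer predictions can later be mapped back to the original label names with inverse_transform. That round trip, in a self-contained toy form:

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder().fit(['cat', 'dog', 'fish'])   # toy labels
predicted_codes = [2, 0, 0, 1]                         # pretend classifier output
print(encoder.inverse_transform(predicted_codes))      # ['fish' 'cat' 'cat' 'dog']
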
github ghzhang233 / Leakage-Neutral-Learning-for-QuoraQP / quantify / leaky_predict.py View on Github
def encode_data(data):
    data['label'] = LabelEncoder().fit_transform(data['label'])
    le = LabelEncoder()
    le.fit(pd.concat([data['sen1'], data['sen2']]))
    data['tid1'] = le.transform(data['sen1'])
    data['tid2'] = le.transform(data['sen2'])
    return data
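
encode_data fits a single encoder on the concatenation of both sentence columns, so an identical sentence receives the same integer ID whether it appears in sen1 or sen2. (Series.append, which older versions of this code used, was removed in pandas 2.0; pd.concat is the current equivalent.) With toy data:

import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.DataFrame({'sen1': ['hi', 'bye'], 'sen2': ['bye', 'yo']})
le = LabelEncoder().fit(pd.concat([df['sen1'], df['sen2']]))
print(le.transform(df['sen1']))   # [1 0]  -- 'bye' is 0 in both columns
print(le.transform(df['sen2']))   # [0 2]
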
github xflows / rdm / benchmark_studies / benchmark_algorithms.py View on Github
                if en[-1] != '':
                    features_target = en[-1]
                    features_train = en[0:len(en)-1]
                    features_train = [1 if x == "+" else 0 for x in features_train]
                    entries_test.append(features_train)
                    targets_test.append(features_target)

            if learner == "aleph":
                targets_test = ['positive' if x == target_attr_value else 'negative' for x in targets_test]

        train_features = pd.DataFrame(entries)
        train_targets = pd.DataFrame(targets)
        test_features = pd.DataFrame(entries_test)
        test_targets = pd.DataFrame(targets_test)

        le = preprocessing.LabelEncoder()
        le.fit(train_targets)

        targets_train_encoded = le.transform(train_targets)

        targets_test_encoded = le.transform(test_targets)

        clf = tree.DecisionTreeClassifier()
        clf.fit(train_features,targets_train_encoded)
        preds = clf.predict(test_features)

        acc = accuracy_score(preds,targets_test_encoded)
        f1 = f1_score(preds,targets_test_encoded)
        predictions.append(acc)
        predictions_f1.append(f1)
        end = timer()
        times.append(end-start)
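
A caveat with this excerpt's pattern: the encoder is fit only on the training targets, so le.transform(test_targets) raises a ValueError if the test set contains a label never seen in training. (LabelEncoder also expects 1-D input, so single-column DataFrames as used here may need .values.ravel().) When all labels are known up front, fitting on the concatenation sidesteps the problem, as the next excerpt also does:

import pandas as pd
from sklearn.preprocessing import LabelEncoder

train_targets = pd.Series(['a', 'b'])
test_targets = pd.Series(['b', 'c'])     # 'c' never appears in training
le = LabelEncoder().fit(pd.concat([train_targets, test_targets]))
y_train = le.transform(train_targets)    # safe: every label is known to the encoder
y_test = le.transform(test_targets)
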
github feranick / SpectralMachine / Archive / 20171010c / SpectraLearnPredict.py View on Github
    else:
        dnntfDef.alwaysImprove = True
        model_directory = None
        print("\n  Training model not saved\n")

    #**********************************************
    ''' Initialize Estimator and training data '''
    #**********************************************
    print(' Initializing TensorFlow...')
    tf.reset_default_graph()

    totA = np.vstack((A, A_test))
    totCl = np.append(Cl, Cl_test)
    numTotClasses = np.unique(totCl).size
    
    le = preprocessing.LabelEncoder()
    totCl2 = le.fit_transform(totCl)
    Cl2 = le.transform(Cl)
    Cl2_test = le.transform(Cl_test)
    
    validation_monitor = skflow.monitors.ValidationMonitor(input_fn=lambda: input_fn(A_test, Cl2_test),
                                                           eval_steps=1,
                                                           every_n_steps=dnntfDef.valMonitorSecs)

    feature_columns = skflow.infer_real_valued_columns_from_input(totA.astype(np.float32))
    clf = skflow.DNNClassifier(feature_columns=feature_columns, hidden_units=dnntfDef.hidden_layers,
            optimizer=dnntfDef.optimizer, n_classes=numTotClasses,
            activation_fn=dnntfDef.activationFn, model_dir=model_directory,
            config=skflow.RunConfig(save_checkpoints_secs=dnntfDef.timeCheckpoint),
            dropout=dnntfDef.dropout_perc)
                               
    print("\n Number of global steps:",dnntfDef.trainingSteps)
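
Here the encoder is fit on the combined train and test classes, so the later transform calls cannot hit unseen labels, and numTotClasses, passed as n_classes to the classifier, matches the encoder by construction. A toy check of that invariant (the skflow/tf.contrib.learn API shown dates from TensorFlow 1.x and no longer exists in 2.x):

import numpy as np
from sklearn import preprocessing

Cl, Cl_test = ['a', 'b'], ['b', 'c']                # toy stand-ins
totCl = np.append(Cl, Cl_test)
le = preprocessing.LabelEncoder()
totCl2 = le.fit_transform(totCl)                    # [0, 1, 1, 2]
assert np.unique(totCl).size == len(le.classes_)    # numTotClasses == encoder size
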
github braindynamicslab / dyneusr / dyneusr / tools / graph_utils.py View on Github
        if len(meta) < 1:
            continue
        # process meta label
        meta_label = None
        if isinstance(labels, dict):
            # one list per column
            if meta_col in labels:
                meta_label = list(labels[meta_col])
        elif isinstance(labels, list):
            # shared list
            if not isinstance(labels[0], list):
                meta_label = list(labels)

        # process meta
        if str(meta[0]).isalpha() or type(meta[0]) is str:
            encoder = LabelEncoder()
            yi = encoder.fit_transform(meta)
            yi_bins = np.linspace(yi.min(), yi.max(), num=min(5, len(set(yi))), endpoint=True)
            meta = np.digitize(yi, yi_bins, right=True)
            meta = yi_bins[meta]
            meta_label = [list(yi_bins).index(_) for _ in sorted(set(meta))]
            #meta_label = ['Group '+str(_+1) for _ in meta_label]
            meta_bins = [_ for _ in zip(yi_bins[:-1], yi_bins[1:])] 
            meta_label = ['Group {} {}'.format(_+1, __) for _,__ in enumerate(meta_bins)]

        
        elif len(set(meta)) > 9 and zscore is True:
            # TODO: how do we decide whether to use continuous vs. discrete
            # zscore
            yi = meta.copy()
            yi_nz = yi[~np.isnan(yi)]
            zi = stats.zscore(yi_nz)
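
The str(meta[0]).isalpha() check above decides whether meta holds categorical strings before encoding and binning them. Note that isalpha() is False for strings containing digits or spaces, and the type(meta[0]) is str fallback does not match numpy string scalars, so depending on the input container the check can misclassify; a dtype-based test expresses the same intent more robustly (a sketch, not dyneusr's own code):

import numpy as np
from sklearn.preprocessing import LabelEncoder

meta = np.asarray(['group 1', 'group 2', 'group 1'])   # isalpha() would miss these
if not np.issubdtype(meta.dtype, np.number):           # categorical, by dtype
    yi = LabelEncoder().fit_transform(meta)            # [0, 1, 0]
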
github albertsl / toolkit / templates / python for data science.py View on Github
# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(low_cardinality_cols, axis=1)
num_X_val = X_val.drop(low_cardinality_cols, axis=1)
# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_val = pd.concat([num_X_val, OH_cols_val], axis=1)

#Use pandas get_dummies for categories encoded as strings
pd.get_dummies(df, columns=['col1','col2'])

#OrdinalEncoding for categories which have an order (example: low/medium/high)
map_dict = {'low': 0, 'medium': 1, 'high': 2}
df['var_oe'] = df['var'].apply(lambda x: map_dict[x])
#We can also do it with sklearn's LabelEncoder, but note that it assigns codes in sorted label order (here high=0, low=1, medium=2), not in the intended ranking
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['var_oe'] = le.fit_transform(df['var'])

#BinaryEncoder: one-hot encoding a variable with many categories creates many columns; binary encoding captures the same information in far fewer columns by using binary numbers. Use it only when the categorical variable has high cardinality.
from category_encoders.binary import BinaryEncoder
be = BinaryEncoder(cols = ['var'])
df = be.fit_transform(df)

#HashingEncoder
from category_encoders.hashing import HashingEncoder
he = HashingEncoder(cols = ['var'])
df = he.fit_transform(df)

#Feature selection: Drop attributes that provide no useful information for the task
#Unsupervised Feature selection before training a model
from sklearn.feature_selection import SelectKBest, chi2
bestfeatures = SelectKBest(score_func=chi2, k='all')
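
One point worth underlining for the ordinal example above: LabelEncoder always assigns codes in sorted order, so it cannot reproduce the low < medium < high ranking that the map_dict approach encodes. sklearn's OrdinalEncoder takes an explicit category order (note it expects a 2-D array of features rather than a 1-D array of labels):

from sklearn.preprocessing import OrdinalEncoder

oe = OrdinalEncoder(categories=[['low', 'medium', 'high']])
codes = oe.fit_transform([['medium'], ['low'], ['high']])   # [[1.], [0.], [2.]]
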
github andreasvc / disco-dop / discodop / functiontags.py View on Github
copy=False, with_mean=False))])
	# PTB has no function tags on preterminals, Negra/Tiger/Lassy do.
	posfunc = any(functions(node) for tree in trees
			for node in tree.subtrees()
			if node and isinstance(node[0], int))
	target = [functions(node) for tree in trees
			for node in tree.subtrees()
			if tree is not node and node
				and (posfunc or isinstance(node[0], Tree))]
	# PTB may have multiple tags (or 0) per node.
	# Negra/Tiger/Lassy have exactly 1 tag for every node.
	multi = any(len(a) > 1 for a in target)
	if multi:
		encoder = preprocessing.MultiLabelBinarizer()
	else:
		encoder = preprocessing.LabelEncoder()
		target = [a[0] if a else '--' for a in target]
	# binarize features (output is a sparse array)
	trainfeats = vectorizer.fit_transform(functionfeatures(node, sent)
			for tree, sent in zip(trees, sents)
				for node in tree.subtrees()
				if tree is not node
				and node and (posfunc or isinstance(node[0], Tree)))
	trainfuncs = encoder.fit_transform(target)
	classifier = linear_model.SGDClassifier(
			loss='hinge',
			penalty='elasticnet',
			max_iter=int(10 ** 6 / len(trees)))
	alphas = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6]
	if multi:
		classifier = multiclass.OneVsRestClassifier(
				classifier, n_jobs=numproc or -1)
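
The encoder switch above is the general rule of thumb: MultiLabelBinarizer when a sample can carry several labels at once (here, multiple function tags per node), LabelEncoder when there is exactly one label per sample. A side-by-side toy comparison:

from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

single = ['NP', 'VP', 'NP']                         # exactly one tag per node
print(LabelEncoder().fit_transform(single))         # [0 1 0]
multi = [['SBJ'], ['SBJ', 'TMP'], []]               # zero or more tags per node
print(MultiLabelBinarizer().fit_transform(multi))   # [[1 0] [1 1] [0 0]]
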
github kengz / aiva / lib / py / ais / ai_lib / preprocess.py View on Github
def restore(self, model_path):
    '''
    Restore a saved multiencoder from a path, reconstructing each LabelEncoder from the classes stored in an npz file.
    Restore the X header too.
    '''
    path = model_path + '/encoder.npz'
    h_path = model_path + '/header.npz'
    npzfile = np.load(path)
    h_npzfile = np.load(h_path)
    self.header = h_npzfile['header']
    self.encoders = {}
    for k,v in npzfile.items():
      le = LabelEncoder()
      le.classes_ = v
      self.encoders[k] = le
    self.columns = list(self.encoders.keys())
    return self
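
restore works because a fitted LabelEncoder is fully determined by its classes_ array, so persisting a whole set of encoders only requires saving those arrays. A sketch of the matching save side (a hypothetical method mirroring the paths and structure of restore above):

import numpy as np

def save(self, model_path):
    '''Hypothetical counterpart to restore: persist each LabelEncoder's classes_.'''
    np.savez(model_path + '/encoder.npz',
             **{k: le.classes_ for k, le in self.encoders.items()})
    np.savez(model_path + '/header.npz', header=self.header)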