import os
import numpy as np
import pandas as pd
import keras
from sklearn import preprocessing

try:
    with open(learnFile, 'r') as f:
        M = np.loadtxt(f, unpack=False)
except OSError:
    print('\033[1m' + ' Learning file not found \n' + '\033[0m')

# First row of the file is the energy axis; first column holds the class labels.
En = np.delete(np.array(M[0,:]), np.s_[0:1], 0)
M = np.delete(M, np.s_[0:1], 0)
Cl = ['{:.2f}'.format(x) for x in M[:,0]]
A = np.delete(M, np.s_[0:1], 1)
learnFileRoot = os.path.splitext(learnFile)[0]

totA = A
totCl = Cl
numTotClasses = np.unique(totCl).size
# Encode the string labels as integers, then one-hot encode them for training
le = preprocessing.LabelEncoder()
totCl2 = le.fit_transform(totCl)
Cl2 = le.transform(Cl)
totCl2 = keras.utils.to_categorical(totCl2, num_classes=np.unique(totCl).size)
Cl2 = keras.utils.to_categorical(Cl2, num_classes=np.unique(Cl).size+1)

df = pd.DataFrame()
for i in range(A.shape[0]):
    G = np.dstack([np.ones(En.shape[0]), En])
    G = np.dstack([G, A[i]])  # was A[0]: the loop variable was never used
    S = pd.Series(G.tolist())
    df[i] = S.values
S = np.asarray([df.iloc[:,1].values[0]])
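# The pattern above, a LabelEncoder followed by keras.utils.to_categorical,
# reduces to this minimal sketch (the label values are made up):
labels = ['1.00', '2.50', '1.00', '3.75']
le_demo = preprocessing.LabelEncoder()
ints = le_demo.fit_transform(labels)       # [0 1 0 2]: classes are sorted, then indexed
onehot = keras.utils.to_categorical(ints, num_classes=np.unique(labels).size)
print(onehot.shape)                        # (4, 3): one row per sample, one column per class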
@staticmethod
def encode_labels(labels):
    '''Encode a list of string labels as consecutive integers.'''
    from sklearn import preprocessing
    intent_encoder = preprocessing.LabelEncoder()
    intent_encoder.fit(labels)
    return intent_encoder.transform(labels)
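# A standalone sketch of the same round trip (the intent names are illustrative):
from sklearn import preprocessing
intents = ['greet', 'bye', 'greet', 'order']
enc = preprocessing.LabelEncoder()
print(enc.fit_transform(intents))          # [1 0 1 2]: classes_ are sorted alphabetically
print(enc.inverse_transform([0, 1, 2]))    # ['bye' 'greet' 'order']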
a = alg.split('+')
alg_list = [alg_list[alg_abbreviation.index(i)] for i in a]
num_cores = multiprocessing.cpu_count()
os.chdir(path + 'data/')

print("--- Data preprocessing ---")
df = pd.read_csv(data, sep='\t')
df = df.sort_values('fname', ascending=True).reset_index(drop=True)
df['label'] = pd.read_csv(label, sep='\t', header=None)
#for i in range(len(df.fname.tolist())):
#    df.label[i] = re.sub(r"(.*)(.*)( \([0-9]+\).xml)", r"\1", df.fname.tolist()[i])
y_unencoded = df.label

print("Label encoding")
encoder = LabelEncoder()
encoder.fit(y_unencoded)
y = encoder.transform(y_unencoded)

pred_table = pd.DataFrame()
score_table = pd.DataFrame()
best_score = 0
ff = feature.split('+')
for f in ff:
    time_start = time.time()
    if alg == 'dnn':
        best_model, best_feature, best_algorithm, best_coef, score_table, pred_table, metrics = \
            Dnn(df=df, y=y, encoder=encoder, class_wt=False, trained_w2v='umls',
                batch_size=int(df.shape[0]/100), n_epoch=25, \
def encode_data(data):
    '''Integer-encode the label column, then map both sentence columns
    through one LabelEncoder fitted on their union.'''
    data['label'] = LabelEncoder().fit_transform(data['label'])
    le = LabelEncoder()
    le.fit(pd.concat([data['sen1'], data['sen2']]))  # Series.append is deprecated
    data['tid1'] = le.transform(data['sen1'])
    data['tid2'] = le.transform(data['sen2'])
    return data
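# Minimal usage sketch for encode_data (the toy frame is illustrative):
import pandas as pd
from sklearn.preprocessing import LabelEncoder

toy = pd.DataFrame({'label': ['dup', 'new'],
                    'sen1': ['how are you', 'hi there'],
                    'sen2': ['how do you do', 'hi there']})
print(encode_data(toy)[['tid1', 'tid2']])
# Fitting on the union of sen1 and sen2 puts both columns in one id space,
# so the same sentence gets the same integer regardless of its column.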
if en[-1] != '':
    features_target = en[-1]
    features_train = en[0:len(en)-1]
    features_train = [1 if x == "+" else 0 for x in features_train]
    entries_test.append(features_train)
    targets_test.append(features_target)
if learner == "aleph":
    targets_test = ['positive' if x == target_attr_value else 'negative' for x in targets_test]

train_features = pd.DataFrame(entries)
train_targets = pd.DataFrame(targets)
test_features = pd.DataFrame(entries_test)
test_targets = pd.DataFrame(targets_test)

le = preprocessing.LabelEncoder()
le.fit(train_targets.values.ravel())  # LabelEncoder expects a 1d array, not a DataFrame
targets_train_encoded = le.transform(train_targets.values.ravel())
targets_test_encoded = le.transform(test_targets.values.ravel())

clf = tree.DecisionTreeClassifier()
clf.fit(train_features, targets_train_encoded)
preds = clf.predict(test_features)
acc = accuracy_score(targets_test_encoded, preds)  # metrics take (y_true, y_pred)
f1 = f1_score(targets_test_encoded, preds)
predictions.append(acc)
predictions_f1.append(f1)
end = timer()
times.append(end - start)
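# A self-contained sketch of the same pattern on toy data (illustrative only):
import pandas as pd
from sklearn import preprocessing, tree
from sklearn.metrics import accuracy_score, f1_score

X = pd.DataFrame([[1, 0], [0, 1], [1, 1], [0, 0]])
y = ['positive', 'negative', 'positive', 'negative']

le_demo = preprocessing.LabelEncoder()
y_enc = le_demo.fit_transform(y)                   # negative -> 0, positive -> 1
clf_demo = tree.DecisionTreeClassifier().fit(X, y_enc)
preds_demo = clf_demo.predict(X)
print(accuracy_score(y_enc, preds_demo), f1_score(y_enc, preds_demo))
print(le_demo.inverse_transform(preds_demo))       # back to the string labels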
else:
    dnntfDef.alwaysImprove = True
    model_directory = None
    print("\n Training model not saved\n")

#**********************************************
''' Initialize Estimator and training data '''
#**********************************************
print(' Initializing TensorFlow...')
tf.reset_default_graph()

# Fit the encoder on the union of training and test labels so transform()
# never sees an unknown class.
totA = np.vstack((A, A_test))
totCl = np.append(Cl, Cl_test)
numTotClasses = np.unique(totCl).size
le = preprocessing.LabelEncoder()
totCl2 = le.fit_transform(totCl)
Cl2 = le.transform(Cl)
Cl2_test = le.transform(Cl_test)

validation_monitor = skflow.monitors.ValidationMonitor(input_fn=lambda: input_fn(A_test, Cl2_test),
                                                       eval_steps=1,
                                                       every_n_steps=dnntfDef.valMonitorSecs)

feature_columns = skflow.infer_real_valued_columns_from_input(totA.astype(np.float32))

clf = skflow.DNNClassifier(feature_columns=feature_columns, hidden_units=dnntfDef.hidden_layers,
                           optimizer=dnntfDef.optimizer, n_classes=numTotClasses,
                           activation_fn=dnntfDef.activationFn, model_dir=model_directory,
                           config=skflow.RunConfig(save_checkpoints_secs=dnntfDef.timeCheckpoint),
                           dropout=dnntfDef.dropout_perc)

print("\n Number of global steps:", dnntfDef.trainingSteps)
if len(meta) < 1:
    continue

# process meta label
meta_label = None
if isinstance(labels, dict):
    # one list per column
    if meta_col in labels:
        meta_label = list(labels[meta_col])
elif isinstance(labels, list):
    # shared list
    if not isinstance(labels[0], list):
        meta_label = list(labels)

# process meta
if str(meta[0]).isalpha() or isinstance(meta[0], str):
    # string-valued meta: encode to integers, then group into at most 5 bins
    encoder = LabelEncoder()
    yi = encoder.fit_transform(meta)
    yi_bins = np.linspace(yi.min(), yi.max(), num=min(5, len(set(yi))), endpoint=True)
    meta = np.digitize(yi, yi_bins, right=True)
    meta = yi_bins[meta]
    meta_label = [list(yi_bins).index(_) for _ in sorted(set(meta))]
    #meta_label = ['Group '+str(_+1) for _ in meta_label]
    meta_bins = list(zip(yi_bins[:-1], yi_bins[1:]))
    meta_label = ['Group {} {}'.format(_+1, __) for _, __ in enumerate(meta_bins)]
elif len(set(meta)) > 9 and zscore is True:
    # TODO: how do we decide whether to use continuous vs. discrete
    # z-score the numeric column, ignoring NaNs
    yi = meta.copy()
    yi_nz = yi[~np.isnan(yi)]
    zi = stats.zscore(yi_nz)
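# What the binning in the string branch does, on toy encoded values (illustrative):
import numpy as np
yi_demo = np.array([0, 1, 2, 3, 7])
bins_demo = np.linspace(yi_demo.min(), yi_demo.max(), num=5, endpoint=True)  # [0. 1.75 3.5 5.25 7.]
idx_demo = np.digitize(yi_demo, bins_demo, right=True)                       # [0 1 2 2 4]
print(bins_demo[idx_demo])   # each value snapped to a bin edge: [0. 1.75 3.5 3.5 7.]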
# Drop the categorical columns (they will be replaced by their one-hot encoding)
num_X_train = X_train.drop(low_cardinality_cols, axis=1)
num_X_val = X_val.drop(low_cardinality_cols, axis=1)
# Add the one-hot encoded columns back to the numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_val = pd.concat([num_X_val, OH_cols_val], axis=1)  # was num_X_valid: undefined name
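# OH_cols_train / OH_cols_val are assumed to come from a step like this sketch
# (sparse_output requires scikit-learn >= 1.2; older versions use sparse=False):
from sklearn.preprocessing import OneHotEncoder
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_val = pd.DataFrame(OH_encoder.transform(X_val[low_cardinality_cols]))
# The encoder drops the row index; restore it so concat aligns rows correctly
OH_cols_train.index = X_train.index
OH_cols_val.index = X_val.index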
# Use pandas get_dummies for categories encoded as strings
pd.get_dummies(df, columns=['col1', 'col2'])

# Ordinal encoding for categories that have an order (example: low/medium/high)
map_dict = {'low': 0, 'medium': 1, 'high': 2}
df['var_oe'] = df['var'].apply(lambda x: map_dict[x])

# We can also do it with sklearn's LabelEncoder; note that it assigns codes
# alphabetically (high=0, low=1, medium=2), so it does NOT preserve the
# low/medium/high order; the mapping dict above is safer for true ordinals
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['var_oe'] = le.fit_transform(df['var'])

# BinaryEncoder: with many categories in one variable, OHE creates many columns.
# Binary encoding writes each category id as a binary number, using far fewer
# columns. Use it only when the categorical variable has high cardinality.
from category_encoders.binary import BinaryEncoder
be = BinaryEncoder(cols=['var'])
df = be.fit_transform(df)

# HashingEncoder
from category_encoders.hashing import HashingEncoder
he = HashingEncoder(cols=['var'])
df = he.fit_transform(df)

# Feature selection: drop attributes that provide no useful information for the task
# Univariate feature selection before training a model (chi2 scores each feature
# against the target y and requires non-negative features)
from sklearn.feature_selection import SelectKBest, chi2
bestfeatures = SelectKBest(score_func=chi2, k='all')
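# Completing the SelectKBest step, a sketch (X and y are assumed to be the
# feature matrix and target, which the excerpt does not define):
import pandas as pd
fit = bestfeatures.fit(X, y)
scores = pd.DataFrame({'feature': X.columns, 'score': fit.scores_})
print(scores.sort_values('score', ascending=False))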
# PTB has no function tags on preterminals; Negra/Tiger/Lassy do.
posfunc = any(functions(node) for tree in trees
        for node in tree.subtrees()
        if node and isinstance(node[0], int))
target = [functions(node) for tree in trees
        for node in tree.subtrees()
        if tree is not node and node
        and (posfunc or isinstance(node[0], Tree))]
# PTB may have multiple tags (or 0) per node.
# Negra/Tiger/Lassy have exactly 1 tag for every node.
multi = any(len(a) > 1 for a in target)
if multi:
    encoder = preprocessing.MultiLabelBinarizer()
else:
    encoder = preprocessing.LabelEncoder()
    target = [a[0] if a else '--' for a in target]
# binarize features (output is a sparse array)
trainfeats = vectorizer.fit_transform(functionfeatures(node, sent)
        for tree, sent in zip(trees, sents)
        for node in tree.subtrees()
        if tree is not node
        and node and (posfunc or isinstance(node[0], Tree)))
trainfuncs = encoder.fit_transform(target)
classifier = linear_model.SGDClassifier(
        loss='hinge',
        penalty='elasticnet',
        max_iter=int(10 ** 6 / len(trees)))
alphas = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6]
if multi:
    classifier = multiclass.OneVsRestClassifier(
            classifier, n_jobs=numproc or -1)
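# The unused alphas list is presumably swept with a grid search further down;
# a sketch of how that could look (not verbatim from the source; the parameter
# gains an 'estimator__' prefix once wrapped in OneVsRestClassifier):
from sklearn import model_selection
param_name = 'estimator__alpha' if multi else 'alpha'
grid = model_selection.GridSearchCV(classifier, {param_name: alphas})
grid.fit(trainfeats, trainfuncs)
classifier = grid.best_estimator_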
def restore(self, model_path):
    '''Restore a saved multiencoder from ``model_path``: rebuild each
    LabelEncoder from the classes stored in the npz file, and restore
    the X header too.'''
    path = model_path + '/encoder.npz'
    h_path = model_path + '/header.npz'
    npzfile = np.load(path)
    h_npzfile = np.load(h_path)
    self.header = h_npzfile['header']
    self.encoders = {}
    for k, v in npzfile.items():
        # a LabelEncoder is fully determined by its classes_ array
        le = LabelEncoder()
        le.classes_ = v
        self.encoders[k] = le
    self.columns = list(self.encoders.keys())
    return self
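# A matching save() for the restore() above, sketched under the same layout
# assumption (one array of classes per column, plus the header array):
def save(self, model_path):
    np.savez(model_path + '/encoder.npz',
             **{k: le.classes_ for k, le in self.encoders.items()})
    np.savez(model_path + '/header.npz', header=self.header)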