from sklearn.ensemble import RandomForestClassifier, RandomTreesEmbedding
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

def compare_assessors(X, y):
    n_estimator = 20
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    # It is important to train the ensemble of trees on a different subset
    # of the training data than the linear regression model, to avoid
    # overfitting, in particular if the total number of leaves is
    # similar to the number of training samples.
    X_train, X_train_lr, y_train, y_train_lr = train_test_split(
        X_train, y_train, test_size=0.1)

    # Unsupervised transformation based on totally random trees
    rt = RandomTreesEmbedding(n_estimators=n_estimator, random_state=0)
    rt_lm = LogisticRegression()
    pipeline = make_pipeline(rt, rt_lm)
    pipeline.fit(X_train, y_train)
    y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
    fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

    # Supervised transformation based on random forests
    rf = RandomForestClassifier(n_estimators=n_estimator)
    rf.fit(X_train, y_train)
    y_pred_rf = rf.predict_proba(X_test)[:, 1]
    fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)

    # RF + LR: one-hot encode the leaf index each sample reaches in each
    # tree, then fit a logistic regression on the held-out X_train_lr subset
    rf_enc = OneHotEncoder()
    rf_enc.fit(rf.apply(X_train))
    rf_lm = LogisticRegression()
    rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
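    # The snippet stops before evaluating the RF + LR combination; a minimal
    # completion sketch in the spirit of scikit-learn's feature-transformation
    # example (these lines are my addition, reusing the names above):
    y_pred_rf_lm = rf_lm.predict_proba(
        rf_enc.transform(rf.apply(X_test)))[:, 1]
    fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)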
# (This snippet starts mid-function: an outer loop over run_ix, not shown,
# repeats the random validation split below.)
X_train, X_valid, y_train, y_valid = train_test_split(
    trainsetx, trainsety, test_size=(validPercentage / 100.0),
    random_state=seed + run_ix)
for (c_ix, c) in enumerate(c_list):
    # more parametrization of the model could come from a config file eventually
    model_c = linear_model.LogisticRegression(
        penalty='l1', C=c, solver='liblinear', fit_intercept=True,
        class_weight='balanced')
    model_c.fit(X_train, y_train)
    Ypred_valid = model_c.predict_proba(X_valid)
    # the evaluation metric could also come from a config file eventually;
    # AUC is commonly used, so we use it here
    fprs, tprs, _ = roc_curve(y_valid, Ypred_valid[:, 1])
    score_c = auc(fprs, tprs)
    score_array[run_ix, c_ix] = score_c
mean_scores = score_array.mean(axis=0)
mean_scores_ix = np.argmax(mean_scores)
best_c = c_list[mean_scores_ix]
# now train on the entire train set, using the best C:
model_best_c = linear_model.LogisticRegression(
    penalty='l1', C=best_c, solver='liblinear', fit_intercept=True,
    class_weight='balanced')
model_best_c.fit(trainsetx, trainsety)
# ----
Ypred_test = model_best_c.predict_proba(testsetx)
fprs, tprs, _ = roc_curve(testsety, Ypred_test[:, 1])
Ypred_train = model_best_c.predict_proba(trainsetx)
fprt, tprt, _ = roc_curve(trainsety, Ypred_train[:, 1])
print('score on unseen test set is:', auc(fprs, tprs), file=sys.stderr)
print('training score on this set was:', auc(fprt, tprt), file=sys.stderr)
print('best average score during cross-validation was:',
      mean_scores[mean_scores_ix], 'with C =', best_c, file=sys.stderr)
# ----
print('saving the model in directory:', modeloutput, file=sys.stderr)
if not os.path.exists(modeloutput):
    os.makedirs(modeloutput)
save_name = getsavefile(modeloutput + "/reg_model_scklearn", ".pkl", overwrite)
with open(save_name, 'wb') as f:
    pickle.dump(model_best_c, f, -1)
save_name = getsavefile(modeloutput + "/reg_model_weights", ".txt", overwrite)
from sklearn.linear_model import LogisticRegression

def test_n_weights(X, y, test_params):
    """Count how many coefficients remain nonzero for each L1 strength C."""
    n_weights = []
    for c in test_params['C']:
        lr = LogisticRegression(penalty='l1', C=c, solver='liblinear',
                                fit_intercept=True)
        res = lr.fit(X, y)
        n_weights.append(res.coef_[0].astype(bool).sum(axis=0))
    return n_weights
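# A hedged usage sketch for test_n_weights; the synthetic dataset and the C
# grid below are illustrative, not from the original source:
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=200, n_features=20, random_state=0)
print(test_n_weights(X_demo, y_demo, {'C': [0.01, 0.1, 1.0, 10.0]}))
# smaller C means stronger L1 regularization and hence fewer nonzero weights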
def logreg_test_in_training(self):
    """Fast, initial method: test vectors in the training data."""
    self.good_fv_logreg = LogisticRegression(C=self.C, penalty='l2',
                                             solver='liblinear', tol=0.01)
    self.bad_fv_logreg = LogisticRegression(C=self.C, penalty='l2',
                                            solver='liblinear', tol=0.01)
    good_x_test = self.good_X_all[self.good_columns]
    good_X = self.good_X_all
    good_y = self.good_y_all
    good_w = self.good_w_all
    bad_x_test = self.bad_X_all[self.bad_columns]
    bad_X = self.bad_X_all
    bad_y = self.bad_y_all
    bad_w = self.bad_w_all
    if good_x_test.shape[0] > 0:
        self.good_fv_logreg.fit(good_X, good_y, sample_weight=good_w)
        self.good_signal = self.good_fv_logreg.decision_function(good_x_test)
    if bad_x_test.shape[0] > 0:
        # the snippet is truncated here; by symmetry with the "good" branch
        # it presumably fits bad_fv_logreg and scores bad_x_test
        self.bad_fv_logreg.fit(bad_X, bad_y, sample_weight=bad_w)
        self.bad_signal = self.bad_fv_logreg.decision_function(bad_x_test)
def readout_sk(self, X_train, X_test, y_train, y_test, **kwargs):
    # data are stored as (features, samples), hence the transposes below
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score
    lr = LogisticRegression(**kwargs)
    lr.fit(X_train.T, y_train.T)
    y_train_predictions = lr.predict(X_train.T)
    y_test_predictions = lr.predict(X_test.T)
    # accuracy_score expects (y_true, y_pred)
    return (accuracy_score(y_train.T, y_train_predictions),
            accuracy_score(y_test.T, y_test_predictions))
def LR_ROC(data):
    # We fix the random seed to a constant value; this is important if we
    # want the results we achieve from this model to be reproducible.
    # np.mean/np.std with axis=0 compute the statistics per column
    # (the default, axis=None, would use the flattened array).
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    # print('Mean:\n', mean)
    # print('Standard deviation:\n', std)
    X, Y = preparingData(data)
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20,
                                                        random_state=0)
    lr = LogisticRegression(class_weight='balanced')
    lr.fit(x_train, y_train)
    # sklearn's score function can quickly assess overall model performance,
    # but due to class imbalance we need to evaluate the model on every
    # class, i.e. find out when we classify people from the first team wrong.
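    # A hedged illustration of that per-class check (classification_report
    # is my addition, not part of the original snippet):
    from sklearn.metrics import classification_report
    print(classification_report(y_test, lr.predict(x_test)))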
    # Feature selection with RFE, which is based on the idea of repeatedly
    # constructing a model and choosing either the best- or the
    # worst-performing feature, setting that feature aside, and then
    # repeating the process with the remaining features. This continues
    # until all features in the dataset are exhausted: the goal of RFE is to
    # select features by recursively considering smaller and smaller sets.
    rfe = RFE(lr, n_features_to_select=13)
    rfe = rfe.fit(x_train, y_train)
    # rfe.support_ is an index that selects the retained features from a
    # feature vector: a boolean array of shape [# input features] in which
    # an element is True iff its corresponding feature is selected for
    # retention.
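    # A hedged sketch of inspecting the RFE result (these lines are my
    # addition): ranking_ assigns 1 to selected features and higher numbers
    # to features eliminated earlier; transform() keeps only the selected
    # columns
    print(rfe.support_)
    print(rfe.ranking_)
    x_train_selected = rfe.transform(x_train)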
def __init__(self, isTrain, isOutlierRemoval=0):
    super(ClassificationUniformBlending, self).__init__(isTrain, isOutlierRemoval)
    # data preprocessing
    self.dataPreprocessing()
    # create logistic regression object (liblinear supports the L1 penalty)
    self.logreg = linear_model.LogisticRegression(
        tol=1e-6, penalty='l1', solver='liblinear', C=0.0010985411419875584)
    # create AdaBoost object over shallow decision trees
    self.dt_stump = DecisionTreeClassifier(max_depth=10)
    self.ada = AdaBoostClassifier(
        base_estimator=self.dt_stump,
        learning_rate=1,
        n_estimators=5,
        algorithm="SAMME.R")
    # create KNN object
    self.knn = neighbors.KNeighborsClassifier(2, weights='uniform')
    # create decision tree object
    self.decisiontree = DecisionTreeClassifier(max_depth=45, max_features='log2')
    # create neural network object (the snippet is truncated here)
def call_GridParamSearch_featfilt(X, y):
    '''
    (Currently just a cut & paste from "main".)
    Calls GridParamSearch, which uses randomized CV to find model parameters.
    Used to try different ML models, then get their optimal parameters.
    '''
    print("SPARSE (L1) EXT gridparam scores:")
    # clf = Pipeline([
    #     ('feature_selection', LinearSVC(penalty="l1", loss='squared_hinge',
    #                                     dual=False, class_weight='balanced')),
    #     ('classification', ExtraTreesClassifier(n_jobs=3)),
    # ])
    # Sparse, L1-penalized feature selection prior to RF fitting/prediction
    clf_svm = LinearSVC(penalty="l1", loss='squared_hinge', dual=False,
                        class_weight='balanced')
    clf_logit = LogisticRegression(penalty="l1", solver='liblinear',
                                   dual=False, class_weight='balanced')
    # see http://scikit-learn.org/0.13/auto_examples/plot_feature_selection.html
    print('Original features matrix:')
    print(X.shape)
    # Univariate feature selection with a statistical test for feature
    # scoring; we keep the 20% most significant features
    # selector = SelectPercentile(f_classif, percentile=20)
    selector = SelectPercentile(chi2, percentile=20)
    X_anova = selector.fit_transform(X, y)
    print('New features matrix after statistical feature selection:')
    print(X_anova.shape)
    # lda = LDA(n_components=10)
    # X_lda = lda.fit_transform(X, y)
    # print('New LDA filtered features matrix:')
import sklearn.decomposition
import sklearn.ensemble
import sklearn.linear_model
from sklearn import datasets
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from baikal import Input, Model, make_step
from baikal.sklearn import SKLearnWrapper

# wrap the scikit-learn estimators as baikal steps
LogisticRegression = make_step(sklearn.linear_model.LogisticRegression)
RandomForestClassifier = make_step(sklearn.ensemble.RandomForestClassifier)
PCA = make_step(sklearn.decomposition.PCA)

random_state = 0  # assumed; defined elsewhere in the original example

def build_fn():
    x = Input()
    y_t = Input()
    h = PCA(random_state=random_state, name="pca")(x)
    y_p = LogisticRegression(random_state=random_state, name="classifier")(h, y_t)
    model = Model(x, y_p, y_t)
    return model

iris = datasets.load_iris()
x_data = iris.data
y_data = iris.target
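# The imports above suggest the intended use: wrap the baikal model so that
# scikit-learn's GridSearchCV can tune it. A minimal sketch, assuming
# SKLearnWrapper exposes step parameters through the usual double-underscore
# naming keyed on the step names given in build_fn ("pca", "classifier"):
sk_model = SKLearnWrapper(build_fn)
param_grid = {
    "pca__n_components": [2, 4],
    "classifier__C": [0.01, 0.1, 1.0],
}
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)
gscv = GridSearchCV(sk_model, param_grid, cv=cv)
gscv.fit(x_data, y_data)
print(gscv.best_params_, gscv.best_score_)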
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier, \
    RandomForestClassifier, \
    AdaBoostClassifier, \
    GradientBoostingClassifier, \
    ExtraTreesClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

Names = ['LR', 'KNN', 'DT', 'NB', 'Bagging', 'RF', 'AB', 'GB', 'SVM', 'LDA', 'ET']
Classifiers = [
    LogisticRegression(penalty='l2', C=0.10, max_iter=500, solver='sag'),  # 1
    KNeighborsClassifier(n_neighbors=7),  # 2
    DecisionTreeClassifier(),  # 3
    GaussianNB(),  # 4
    BaggingClassifier(),  # 5
    RandomForestClassifier(),  # 6
    AdaBoostClassifier(),  # 7
    GradientBoostingClassifier(),  # 8
    SVC(C=15.0, kernel='rbf', degree=3, probability=True),  # 9
    LinearDiscriminantAnalysis(),  # 10
    # ExtraTreesClassifier(),  # 11, commented out; 'ET' in Names refers to it
]

F = open('evaluationResults.txt', 'w')
F.write('Evaluation Scale:' + '\n')
F.write('0.0% <= Accuracy <= 100.0%' + '\n')
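# The snippet ends after opening the results file; a hedged sketch of a
# plausible evaluation loop (cross_val_score and the X/y feature matrix and
# labels are my assumptions, not the original code):
from sklearn.model_selection import cross_val_score

for name, clf in zip(Names, Classifiers):
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    F.write('%s: %.1f%% accuracy\n' % (name, 100.0 * scores.mean()))
F.close()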