def compare_assessors(X, y):
    """Fit several tree-based classifiers on ``X``/``y`` and compute ROC points.

    10% of the data is held out for evaluation.  The remaining data is split
    again so that the tree ensemble and the logistic regression stacked on top
    of it are trained on different samples.

    NOTE(review): several ``fit`` calls had been stripped from this function,
    leaving it syntactically invalid.  They are restored here following the
    scikit-learn "Feature transformations with ensembles of trees" example.
    The visible source ends after the RF + LR model is fitted, so nothing is
    returned yet -- confirm against version control.
    """
    n_estimator = 20
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    # It is important to train the ensemble of trees on a different subset
    # of the training data than the logistic regression model to avoid
    # overfitting, in particular if the total number of leaves is
    # similar to the number of training samples
    # NOTE(review): the arguments of this split were lost; an even split is
    # assumed -- confirm the intended test_size.
    X_train, X_train_lr, y_train, y_train_lr = train_test_split(
        X_train, y_train, test_size=0.5)

    # Unsupervised transformation based on totally random trees
    rt = RandomTreesEmbedding(n_estimators=n_estimator, random_state=0)

    rt_lm = LogisticRegression()
    pipeline = make_pipeline(rt, rt_lm)
    pipeline.fit(X_train, y_train)
    y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
    fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

    # Supervised transformation based on random forests
    rf = RandomForestClassifier(n_estimators=n_estimator)
    rf.fit(X_train, y_train)
    y_pred_rf = rf.predict_proba(X_test)[:, 1]
    fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)

    # RF + LR: one-hot encode the leaf indices produced by the forest and fit
    # the logistic regression on the held-back ``*_lr`` subset.
    rf_enc = OneHotEncoder()
    rf_lm = LogisticRegression()
    rf_enc.fit(rf.apply(X_train))
    rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
# Hold-out validation over the candidate C values in ``c_list``, followed by a
# refit with the best C, AUC reporting on stderr and persistence of the model.
# NOTE(review): this span is a fragment -- the enclosing definition and the
# outer loop that supplies ``run_ix`` are not visible, and the indentation of
# the following lines is inconsistent with the first statement. Restore the
# missing lines from version control before relying on this code.
X_train, X_valid, y_train, y_valid = cross_validation.train_test_split(trainsetx, trainsety, test_size=(validPercentage/100.0), random_state=seed+run_ix)
            for (c_ix, c) in enumerate(c_list):
                #more parametrization of model can come from some config file eventually.
                # NOTE(review): ``fit_intercept='true'`` passes a string rather than the
                # boolean True, and no ``fit`` call on ``model_c`` is visible before
                # ``predict_proba`` -- it appears to have been dropped.
                model_c  = linear_model.LogisticRegression(penalty='l1', C=c, fit_intercept='true', class_weight='auto')
                Ypred_valid = model_c.predict_proba(X_valid)
                # evaluation metric could come from a config file eventually. currently AUC is commonly used and we use here
                fprs, tprs, thresholdss = roc_curve(y_valid, Ypred_valid[:,1])
                score_c = auc(fprs,tprs)
                # One validation AUC per (run, C) pair.
                score_array [run_ix, c_ix] = score_c

        # Average the validation AUC over runs (axis 0) and keep the C with the
        # highest mean.
        mean_scores = score_array.mean(axis=0)
        mean_scores_ix = np.argmax(mean_scores)
        best_c = c_list[mean_scores_ix]
        #now train on the entire train set, using best c:
        # NOTE(review): the next statement is syntactically invalid (stray
        # ``,trainsety)``); a ``model_best_c.fit(..., trainsety)`` call appears to
        # have been dropped.
        model_best_c  = linear_model.LogisticRegression(penalty='l1', C=best_c, fit_intercept='true', class_weight='auto'),trainsety)
        # AUC on the test set and on the training set, using the probability in
        # column 1 of ``predict_proba``.
        Ypred_test = model_best_c.predict_proba(testsetx)
        fprs, tprs, thresholdss = roc_curve(testsety, Ypred_test[:,1])
        Ypred_train = model_best_c.predict_proba(trainsetx)
        fprt, tprt, thresholdst = roc_curve(trainsety, Ypred_train[:,1])
        print('score on unseen test set is: ', auc(fprs,tprs), file=sys.stderr)
        print('training score on this set was: ', auc(fprt,tprt), file=sys.stderr)
        print("best average score during cross validation was:", mean_scores[mean_scores_ix], "with c =", best_c, file=sys.stderr)
        print('saving the model in directory: ', modeloutput, file=sys.stderr)
        # NOTE(review): this ``if`` has no body in the visible source; presumably
        # the output directory was created here -- confirm.
        if not os.path.exists(modeloutput):
        save_name = getsavefile(modeloutput + "/reg_model_scklearn", ".pkl", overwrite)
        # NOTE(review): the file object passed to ``cPickle.dump`` is never closed
        # explicitly.
        cPickle.dump(model_best_c, open(save_name, 'wb'), -1)
        # ``save_name`` is rebound for the weights file; the code that writes it is
        # not visible here.
        save_name = getsavefile(modeloutput + "/reg_model_weights", ".txt", overwrite)
def test_n_weights(X,y,test_params):
    """Build an L1-penalised liblinear logistic regression per candidate ``C``.

    The candidate values are read from ``test_params['C']``.

    NOTE(review): the body is incomplete as written -- see the inline notes.
    """
    for c in test_params['C']:
        # NOTE(review): this statement is syntactically invalid (stray ``),~y)``);
        # it looks like a ``lr.fit(..., ~y)`` call was dropped -- restore it from
        # version control.
        lr = LogisticRegression(penalty='l1',C=c, solver='liblinear', fit_intercept=True),~y)
    # NOTE(review): ``n_weights`` is never assigned in the visible code, so the
    # statement(s) that populate it are missing as well.
    return n_weights
def logreg_test_in_training(self):
        """fast, initial method: test vectors in the training data

        Creates fresh L2-penalised liblinear models for the "good" and "bad"
        feature vectors.  Each model is fitted on its full weighted training
        set (``*_X_all``, ``*_y_all``, ``*_w_all``) and then scores the subset
        ``*_X_all[self.*_columns]`` with ``decision_function``.  A model is
        only fitted and scored when its test subset has at least one row.
        """

        self.good_fv_logreg = LogisticRegression(C=self.C, penalty='l2', solver='liblinear', tol=0.01)
        self.bad_fv_logreg = LogisticRegression(C=self.C, penalty='l2', solver='liblinear', tol=0.01)

        good_x_test = self.good_X_all[self.good_columns]
        good_X = self.good_X_all
        good_y = self.good_y_all
        good_w = self.good_w_all

        bad_x_test = self.bad_X_all[self.bad_columns]
        bad_X = self.bad_X_all
        bad_y = self.bad_y_all
        bad_w = self.bad_w_all

        if good_x_test.shape[0] > 0:
            # The head of this ``fit`` call had been stripped from the source.
            self.good_fv_logreg.fit(good_X, good_y, sample_weight=good_w)
            self.good_signal = self.good_fv_logreg.decision_function(good_x_test)
        if bad_x_test.shape[0] > 0:
            # NOTE(review): this branch had no body in the source; it is
            # reconstructed by symmetry with the "good" branch -- confirm the
            # ``bad_signal`` attribute name against the callers.
            self.bad_fv_logreg.fit(bad_X, bad_y, sample_weight=bad_w)
            self.bad_signal = self.bad_fv_logreg.decision_function(bad_x_test)
def readout_sk(self, X_train, X_test, y_train, y_test, **kwargs):
        """Fit a logistic-regression readout and report train/test accuracy.

        ``kwargs`` are forwarded unchanged to scikit-learn's
        ``LogisticRegression``.  Every array is transposed before it is handed
        to scikit-learn (presumably the inputs are stored features-first --
        verify against the caller).

        Returns a ``(train_accuracy, test_accuracy)`` tuple.
        """
        from sklearn.linear_model import LogisticRegression
        lr = LogisticRegression(**kwargs)
        # The ``fit`` call had been stripped from the source, leaving a syntax
        # error; without it the estimator cannot predict.
        lr.fit(X_train.T, y_train.T)
        y_train_predictions = lr.predict(X_train.T)
        y_test_predictions = lr.predict(X_test.T)
        return accuracy_score(y_train_predictions, y_train.T), accuracy_score(y_test_predictions, y_test.T)
def LR_ROC(data):
	"""Fit a class-balanced logistic regression on ``data`` and run RFE on it.

	``data`` is turned into features/labels by ``preparingData``, split 80/20
	into train and test parts, and a ``LogisticRegression`` with
	``class_weight='balanced'`` is fitted on the training part.  Recursive
	feature elimination then keeps 13 features.

	NOTE(review): both ``fit`` calls had been stripped from the source and are
	restored here.  The visible source ends after the RFE fit, so the ROC part
	suggested by the function name is not present and nothing is returned.
	"""
	#we initialize the random number generator to a const value
	#this is important if we want to ensure that the results
	#we can achieve from this model can be achieved again precisely
	# NOTE(review): no seeding call is visible after the comment above.
	#Axis or axes along which the means are computed. The default is to compute the mean of the flattened array.
	mean = np.mean(data,axis=0)
	std = np.std(data,axis=0)
	#print 'Mean: \n',mean
	#print 'Standard deviation: \n',std
	X,Y = preparingData(data)
	x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.20)
	# convert integers to dummy variables (i.e. one hot encoded)
	# NOTE(review): no encoding step is visible here.
	lr = LogisticRegression(class_weight='balanced')
	lr.fit(x_train,y_train)
	#The score function of sklearn can quickly assess the model performance
	#due to class imbalance , we need to evaluate the model performance
	#on every class. Which means to find when we classify people from the first team wrong

	#feature selection RFE is based on the idea to repeatedly construct a model and choose either the best
	#or worst performing feature, setting the feature aside and then repeating the process with the rest of the
	#features. This process is applied until all features in the dataset are exhausted. The goal of RFE is to select
	# features by recursively considering smaller and smaller sets of features
	# Passed by keyword: recent scikit-learn releases no longer accept
	# n_features_to_select positionally.
	rfe = RFE(lr,n_features_to_select=13)
	rfe = rfe.fit(x_train,y_train)
	#print rfe.support_

	#An index that selects the retained features from a feature vector. If indices is False, this is a boolean array of shape
	#[# input features], in which an element is True iff its corresponding feature is selected for retention
def __init__(self, isTrain, isOutlierRemoval=0):
        """Forward the flags to the parent class and build the base estimators.

        ``isTrain`` and ``isOutlierRemoval`` are passed unchanged to the parent
        ``__init__``.  The visible body then creates a logistic regression, a
        depth-10 decision tree, an AdaBoost classifier, a 2-nearest-neighbour
        classifier and a second decision tree as instance attributes.

        NOTE(review): the body is incomplete as written -- see the inline notes.
        """
        super(ClassificationUniformBlending, self).__init__(isTrain, isOutlierRemoval)
        # data preprocessing
        # NOTE(review): no preprocessing code is visible under the heading above.

        # create logistic regression object
        # NOTE(review): ``penalty='l1'`` is given without an explicit ``solver``;
        # confirm the installed scikit-learn's default solver supports L1.
        self.logreg = linear_model.LogisticRegression(tol=1e-6, penalty='l1', C=0.0010985411419875584)

        # create adaboost object
        self.dt_stump = DecisionTreeClassifier(max_depth=10)
        # NOTE(review): the argument list of this call is missing and the
        # parenthesis is never closed, which makes the whole method a syntax
        # error. Presumably ``self.dt_stump`` was the base estimator -- restore
        # the arguments from version control.
        self.ada = AdaBoostClassifier(

        # create knn object
        self.knn = neighbors.KNeighborsClassifier(2, weights='uniform')

        # create decision tree object
        self.decisiontree = DecisionTreeClassifier(max_depth=45, max_features='log2')

        # create neural network object
        # NOTE(review): nothing follows this heading in the visible source.
def call_GridParamSearch_featfilt(X, y) :
    """Set up sparse (L1) models and a univariate feature filter for ``X``/``y``.

    (def is currently just a cut & paste from "main".)
    Calls def GridParamSearch (which uses randomized CV to find model params).
    Used to try different ml models, then get their optimal parameters.

    NOTE(review): the docstring delimiters and the opening ``print(`` of the
    last statement had been lost from the source and are restored here.  The
    visible source ends after the feature selection, so the actual grid-search
    call is not present and nothing is returned.
    """
    print("SPARSE (L1) EXT gridparam scores:")
    #   clf = Pipeline([
    #       ('feature_selection', LinearSVC(penalty="l1", loss='l1',dual=False, class_weight='auto')),
    # ('classification', ExtraTreesClassifier(n_jobs=3)
    #   )])
    # Sparse; L1 penalized features selection prior to RF fitting/prediction
    # NOTE(review): ``loss='l2'`` and ``class_weight='auto'`` are legacy
    # scikit-learn spellings -- verify them against the installed version.
    clf_svm = LinearSVC(penalty="l1", loss='l2', dual=False, class_weight='auto')
    clf_logit = LogisticRegression(penalty="l1", dual=False, class_weight='auto')

    print('Original features matrix:')
    # Univariate feature selection: keep the 20% best-scoring features.
    # The chi2 score is used; the F-test (f_classif) variant is kept below,
    # commented out.
    # selector = SelectPercentile(f_classif, percentile=20)
    selector = SelectPercentile(chi2, percentile=20)
    X_anova = selector.fit_transform(X, y)
    # NOTE(review): the message below still mentions f_classif although chi2 is
    # the scoring function in use.
    print(
        'New (2 f_classif) Using statistical feature selection: features matrix is:')

    # lda = LDA(n_components=10)
    # X_lda = lda.fit_transform(X, y)
    # print('New LDA filtered features matrix:')
import sklearn.decomposition
import sklearn.ensemble
import sklearn.decomposition
import sklearn.linear_model
from sklearn import datasets
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from baikal import Input, Model, make_step
from baikal.sklearn import SKLearnWrapper

# Wrap the scikit-learn estimators with ``make_step`` so that instances can be
# called on ``Input`` objects when the model graph is assembled.
LogisticRegression, RandomForestClassifier, PCA = (
    make_step(sklearn.linear_model.LogisticRegression),
    make_step(sklearn.ensemble.RandomForestClassifier),
    make_step(sklearn.decomposition.PCA),
)

def build_fn():
    """Assemble and return the PCA -> logistic-regression ``Model``.

    Two ``Input`` placeholders are created, one for the features and one for
    the targets.  The features go through the ``"pca"`` step, and the
    ``"classifier"`` step is called on the PCA output together with the
    targets.  ``random_state`` is not a parameter; it is read from the
    enclosing (module) scope.
    """
    features = Input()
    targets = Input()
    reduced = PCA(random_state=random_state, name="pca")(features)
    classifier = LogisticRegression(random_state=random_state, name="classifier")
    predictions = classifier(reduced, targets)
    return Model(features, predictions, targets)

# Load the iris dataset and expose its feature matrix and target vector.
# The right-hand sides of ``x_data`` / ``y_data`` were missing from the source
# (a syntax error); they are restored to the ``data`` / ``target`` fields of the
# object returned by ``load_iris``.
iris = datasets.load_iris()
x_data = iris.data
y_data = iris.target
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
# Parenthesised form: the original backslash-continued statement ended in a
# dangling ``, \`` followed by a blank line, which is a syntax error.
# NOTE(review): a further name may have been lost from the end of this list
# (possibly ExtraTreesClassifier, which appears in a commented-out entry of
# ``Classifiers``) -- confirm against version control.
from sklearn.ensemble import (
    BaggingClassifier,
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Short labels for the classifiers below.
# NOTE(review): ``Names`` has 11 entries but ``Classifiers`` only 10, because the
# ExtraTrees entry is commented out -- confirm that the code consuming the two
# lists tolerates the trailing 'ET' label.
Names = ['LR', 'KNN', 'DT', 'NB', 'Bagging', 'RF', 'AB', 'GB', 'SVM', 'LDA', 'ET']

# Estimator instances to evaluate. The closing bracket of this list was missing
# from the source, which made the rest of the module a syntax error.
Classifiers = [
    LogisticRegression(penalty='l2', C=0.10, max_iter=500, solver='sag'),           #1
    KNeighborsClassifier(n_neighbors=7),         #2
    DecisionTreeClassifier(),       #3
    GaussianNB(),                   #4
    BaggingClassifier(),            #5
    RandomForestClassifier(),       #6
    AdaBoostClassifier(),           #7
    GradientBoostingClassifier(),   #8
    SVC(C=15.0, kernel='rbf', degree=3, probability=True),          #9
    LinearDiscriminantAnalysis(),   #10
    # ExtraTreesClassifier(),         #11
]

F = open('evaluationResults.txt', 'w')

F.write('Evaluation Scale:'+'\n')
F.write('0.0% <=Accuracy<= 100.0%'+'\n')