import numpy as np
import pandas as pd


def test_xgboost_classifier(output_margin):
    import xgboost as xgb

    # Load the credit-card fraud data; the last column holds the label.
    df = pd.read_csv("./open_data/creditcard.csv")
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')

    # Fit a default XGBoost classifier and exercise both prediction APIs.
    gbm = xgb.sklearn.XGBClassifier()
    gbm.fit(X, y)
    gbm.predict(X, output_margin=output_margin)
    gbm.predict_proba(X, output_margin=output_margin)
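When output_margin is True, predict returns the raw, untransformed margin scores instead of class labels (and predict_proba returns margins instead of probabilities). A minimal sketch of how the test above might be invoked; the parametrisation values are an assumption, not part of the original:

# Hypothetical invocation covering both settings of output_margin.
for margin in (False, True):
    test_xgboost_classifier(output_margin=margin)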
def fit_model(self, X_train, y_train, X_test, y_test):
    # Build an XGBoost classifier from the hyper-parameters stored on the instance.
    clf = XGBClassifier(
        learning_rate=self.learning_rate,
        n_estimators=self.n_estimators,
        max_depth=self.max_depth,
        min_child_weight=self.min_child_weight,
        gamma=self.gamma,
        subsample=self.subsample,
        colsample_bytree=self.colsample_bytree,
        objective=self.objective,
        nthread=self.nthread,
        scale_pos_weight=self.scale_pos_weight,
        reg_alpha=self.reg_alpha,
        reg_lambda=self.reg_lambda,
        seed=self.seed)
    clf.fit(X_train, y_train)
    y_pre = clf.predict(X_test)               # hard class predictions
    y_pro = clf.predict_proba(X_test)[:, 1]   # probability of the positive class
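The method stops after producing predictions; a hedged sketch of how they are typically scored against the held-out labels inside the same method (roc_auc_score and accuracy_score are standard scikit-learn metrics, and their use here is an assumption rather than part of the original code):

    from sklearn.metrics import accuracy_score, roc_auc_score

    # Assumed evaluation step inside the same method.
    print('accuracy: %.4f' % accuracy_score(y_test, y_pre))
    print('AUC:      %.4f' % roc_auc_score(y_test, y_pro))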
def perform_prediction(training, labels, testing, xgb_votes, rf_votes):
    """ Perform prediction using a combination of XGB and RandomForests. """
    predictions = np.zeros((len(testing), len(set(labels))))
    # Predictions using xgboost.
    for i in range(xgb_votes):
        print('XGB vote %d' % i)
        xgb = XGBClassifier(
            max_depth=DEPTH_XGB, learning_rate=LEARNING_XGB,
            n_estimators=ESTIMATORS_XGB, objective='multi:softprob',
            subsample=SUBSAMPLE_XGB, colsample_bytree=COLSAMPLE_XGB)
        xgb.fit(training, labels)
        predictions += xgb.predict_proba(testing)
    # Predictions using RandomForestClassifier.
    for i in range(rf_votes):
        print('RandomForest vote %d' % i)
        rand_forest = RandomForestClassifier(
            n_estimators=ESTIMATORS_RF, criterion=CRITERION_RF, n_jobs=JOBS_RF,
            max_depth=DEPTH_RF, min_samples_leaf=MIN_LEAF_RF, bootstrap=True)
        rand_forest.fit(training, labels)
        predictions += rand_forest.predict_proba(testing)
    return predictions
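Each vote adds a full class-probability matrix, so the summed matrix can be turned into final labels with an argmax over classes. A hedged usage sketch; the data names and vote counts are placeholders, and the argmax assumes the labels are the integers 0..n_classes-1 so that column order matches label value:

# Hypothetical call site for perform_prediction.
summed = perform_prediction(X_train, y_train, X_test, xgb_votes=5, rf_votes=5)
final_labels = np.argmax(summed, axis=1)  # class with the highest summed probability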
# for i in range(9):
#     q, _ = np.histogram(y[:, i].ravel(), bins=[0, 0.5, 1])
#     print(i + 1, q, q[0] / q[1])
clf1 = sklearn.linear_model.LogisticRegression(C=200)
clf1vlad = sklearn.linear_model.LogisticRegression(C=1)
clf2 = sklearn.svm.LinearSVR(C=5)
#clf2vlad = sklearn.svm.LinearSVR(C=1)
#clf2 = sklearn.svm.SVR(C=0.1, kernel='linear')
#clf1 = sklearn.linear_model.LogisticRegressionCV(Cs=100)
#clf1 = sklearn.ensemble.RandomForestClassifier(n_estimators=100)
#clf1 = sklearn.neighbors.KNeighborsClassifier(n_neighbors=50)
#clf1 = sklearn.svm.SVC(C=10, gamma=0.03, kernel='linear', probability=True)
clf3 = xgb.sklearn.XGBClassifier(learning_rate=0.1, n_estimators=200, nthread=8,
                                 max_depth=5, subsample=0.9, colsample_bytree=0.9)
clf3vlad = xgb.sklearn.XGBClassifier(learning_rate=0.1, n_estimators=200, nthread=8,
                                     max_depth=5, subsample=0.9, colsample_bytree=0.9)
# kf = cross_validation.KFold(x.shape[0], n_folds=5, shuffle=True, random_state=0)
# res = 0
# for i in range(9):
#     res = 0
#     for train_index, test_index in kf:
#         X_train, X_val = x[train_index], x[test_index]
#         y_train, y_val = y[train_index], y[test_index]
#         rrr = np.zeros((X_val.shape[0], 9), dtype=np.int32)
#
#         clf.fit(X_train, y_train[:, i])
#         preds = clf.predict(X_val)
#         rrr[:, i] = preds
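The commented-out loop above uses the old sklearn.cross_validation.KFold signature, which took the sample count and n_folds. In current scikit-learn the same split comes from sklearn.model_selection; a minimal sketch of the equivalent call, reusing the variable names from the comment:

from sklearn.model_selection import KFold

# Modern equivalent of cross_validation.KFold(x.shape[0], n_folds=5, ...):
# the splitter no longer takes the sample count, and split() is called on the data.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
for train_index, test_index in kf.split(x):
    X_train, X_val = x[train_index], x[test_index]
    y_train, y_val = y[train_index], y[test_index]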
                       intercept_scaling=1, random_state=None, max_iter=3000),
"knn": KNeighborsClassifier(n_neighbors=100, weights='distance', leaf_size=30, n_jobs=n_jobs),
"random_forests": RandomForestClassifier(n_estimators=350, criterion='entropy', min_samples_split=2,
                                         min_samples_leaf=1, max_leaf_nodes=600, n_jobs=n_jobs),
"logistic_regression": LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=2.4, fit_intercept=True, intercept_scaling=1,
                                          random_state=None, solver='liblinear', max_iter=1000, multi_class='ovr',
                                          warm_start=False, n_jobs=n_jobs),
"decision_trees": DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2,
                                         min_samples_leaf=100, min_weight_fraction_leaf=0.0, max_features=None,
                                         random_state=None, max_leaf_nodes=None, presort=False),
"sgd": SGDClassifier(alpha=.0001, n_iter=500, penalty="elasticnet", n_jobs=n_jobs),
"neural_network": Classifier(layers=[Layer("Sigmoid", units=14), Layer("Sigmoid", units=13), Layer("Sigmoid", units=12),
                                     Layer("Sigmoid", units=10), Layer("Softmax")], learning_rate=0.01, n_iter=200,
                             batch_size=10, regularize='L1', n_stable=50, dropout_rate=0, verbose=True),
"GBC": GradientBoostingClassifier(max_depth=10, max_leaf_nodes=850, min_samples_leaf=15, learning_rate=0.1),
"XGB": XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
                     max_depth=10, min_child_weight=2, missing=None, n_estimators=100, nthread=n_jobs, reg_alpha=0,
                     objective='binary:logistic', reg_lambda=1, scale_pos_weight=1, seed=0, silent=True, subsample=1)}
return classifier_list[classifier_name].fit(train_set, train_labels)
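The fragment above is the tail of a factory that maps a name in classifier_list to an estimator, fits it, and returns it. A hedged usage sketch; the enclosing function name get_fitted_classifier is a hypothetical stand-in for whatever the original function is called, and only the dictionary keys come from the snippet:

# Hypothetical call site; "XGB" is one of the keys defined in classifier_list above.
model = get_fitted_classifier("XGB", train_set, train_labels)
test_predictions = model.predict(test_set)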
# ('21k_1024.npy', sklearn.linear_model.LogisticRegression(C=100)),
# ('v3_2048.npy', sklearn.linear_model.LogisticRegression(C=100)),
# ('res_full_l2.npy', sklearn.linear_model.LogisticRegression(C=1)),
('21k_50k_2048.npy', sklearn.linear_model.LogisticRegression(C=100)),
('21k_v3_3072.npy', sklearn.linear_model.LogisticRegression(C=100)),
('21k_v3_128.npy', sklearn.linear_model.LogisticRegression(C=50)),
# ('21k.npy', sklearn.linear_model.LogisticRegression(C=50)),
('fisher.npy', sklearn.linear_model.LogisticRegression(C=2)),
('v3_full.npy', sklearn.linear_model.LogisticRegression(C=100)),
('21k_full.npy', sklearn.linear_model.LogisticRegression(C=100)),
# ('vlad_2_21k_full.npy', sklearn.linear_model.LogisticRegression(C=1)),
# ('21k_v3_128.npy', xgb_wrapper()),
# ('fisher_21k_1024.npy', sklearn.linear_model.LogisticRegression(C=2))
# ('v3.npy', sklearn.linear_model.LogisticRegression(C=100)),
('vlad_2_21k_full.npy', xgb.sklearn.XGBClassifier(learning_rate=0.1, n_estimators=100, nthread=8,
                                                  max_depth=3, subsample=0.8, colsample_bytree=0.8)),
# ('jo.npy', xgb.sklearn.XGBClassifier(learning_rate=0.1, n_estimators=100, nthread=8,
#                                      max_depth=4, subsample=0.9, colsample_bytree=0.9))
]
def f_real(weights):
    kf = cross_validation.KFold(2000, n_folds=10, shuffle=True, random_state=0)
    re = np.array([])
    fold = 0
    for train_index, test_index in kf:
        y = np.load('y_train.npy')
        y_val = y[test_index]
        preds = np.array([])
        for feature, clf in features:
def grid_search(model_type, X, y, num_of_folds, verbose, first_dim, second_dim=None, third_dim=None, return_auc_values=False):
    best_auc = 0
    best_auc_setting = None
    if model_type == 'XGB':
        auc_matrix = np.zeros((len(first_dim), len(second_dim), len(third_dim)))
        for max_depth_index, max_depth in enumerate(first_dim):
            for n_estimator_index, n_estimator in enumerate(second_dim):
                for learning_rate_index, learning_rate in enumerate(third_dim):
                    model = XGBClassifier(max_depth=int(max_depth), n_estimators=int(n_estimator), learning_rate=learning_rate)
                    auc = auc_calculator(model, X, y, num_of_folds)
                    auc_matrix[max_depth_index, n_estimator_index, learning_rate_index] = auc
                    if auc > best_auc:
                        best_auc = auc
                        best_auc_setting = [max_depth, n_estimator, learning_rate]
                    if verbose:
                        sys.stdout.write('\rGRID SEARCHING XGB: progress: {0:.3f} % ...'.format(
                            (max_depth_index * (len(second_dim) * len(third_dim)) +
                             n_estimator_index * (len(third_dim)) +
                             learning_rate_index
                             + 1) / (len(first_dim) * len(second_dim) * len(third_dim)) * 100))
    if model_type == 'LR+LASSO':
        auc_matrix = np.zeros(len(first_dim))
        for index, regularization_strength in enumerate(first_dim):
            model = LogisticRegression(penalty='l1', C=regularization_strength)
            auc = auc_calculator(model, X, y, num_of_folds)
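grid_search delegates scoring to an auc_calculator helper that is not shown in the snippet. A minimal sketch of what such a helper might look like, assuming it returns the mean cross-validated ROC AUC; the fold strategy and metric are assumptions:

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np

def auc_calculator(model, X, y, num_of_folds):
    # Assumed helper: average ROC AUC of `model` over stratified folds of (X, y).
    aucs = []
    for train_idx, test_idx in StratifiedKFold(n_splits=num_of_folds).split(X, y):
        model.fit(X[train_idx], y[train_idx])
        probs = model.predict_proba(X[test_idx])[:, 1]
        aucs.append(roc_auc_score(y[test_idx], probs))
    return np.mean(aucs)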
def __init__(self, data_train, data_test, target, predictors, cv_folds=10, scoring_metric_skl='accuracy', scoring_metric_xgb='error'):
    GenericModelClass.__init__(self, alg=XGBClassifier(), data_train=data_train,
                               data_test=data_test, target=target, predictors=predictors, cv_folds=cv_folds, scoring_metric=scoring_metric_skl)

    # Define default parameters on your own:
    self.default_parameters = {
        'max_depth': 3, 'learning_rate': 0.1,
        'n_estimators': 100, 'silent': True,
        'objective': "binary:logistic",
        'nthread': 1, 'gamma': 0, 'min_child_weight': 1,
        'max_delta_step': 0, 'subsample': 1, 'colsample_bytree': 1, 'colsample_bylevel': 1,
        'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1,
        'base_score': 0.5, 'seed': 0, 'missing': None
    }
    self.model_output = pd.Series(self.default_parameters)

    # Create a DMatrix with no missing values:
    self.xgtrain = xgb.DMatrix(self.data_train[self.predictors].values, label=self.data_train[self.target].values)
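The DMatrix built at the end of __init__ is typically fed to xgboost's built-in cross-validation to choose the number of boosting rounds. A hedged sketch of that step as it might appear in a later method of the same class; the self.cv_folds and self.scoring_metric_xgb attributes and the early-stopping setting are assumptions, not taken from the original class:

    # Assumed follow-up (illustrative only): run xgboost's native CV on the DMatrix
    # and let early stopping pick the effective number of boosting rounds.
    cv_results = xgb.cv(self.default_parameters, self.xgtrain,
                        num_boost_round=self.default_parameters['n_estimators'],
                        nfold=self.cv_folds, metrics=self.scoring_metric_xgb,
                        early_stopping_rounds=50)
    print('rounds kept by early stopping:', cv_results.shape[0])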