Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Fragment of a method body: the enclosing def is not visible and the original
# indentation was lost, so the exact extent of the `if prune:` branch is unknown.
# Fit a first forest on the full feature matrix to obtain per-feature importances.
rf = RandomForestClassifier(n_estimators=self.trees,
class_weight='balanced_subsample', n_jobs=jobs)
mod = rf.fit(x, y)
importances = mod.feature_importances_
if prune:
# Trimming the tree to the top features
# (np.argsort sorts ascending, so the last `top` indices are the most important ones)
sorted_indices = np.argsort(importances)
trimmed_indices = np.array(sorted_indices[-top:])
self.feature_indices = trimmed_indices
# Pruning the unnecessary features from the training data
X = deepcopy(x[:, trimmed_indices])
# Training a new forest on the pruned data
mod = RandomForestClassifier(n_estimators=self.trees,
class_weight='balanced_subsample',
n_jobs=jobs)
mod.fit(X, y)
# Passing attributes up to the instance
# (these importances come from the first forest, fitted on all features)
self.feature_importances = importances
self.pruned = True
# Setting the model attribute for the instance
self.mod = mod
def _get_score(feats:List[str]) -> Tuple[float,float]:
    """Score a feature subset with repeated random-forest fits.

    Trains ``n_rfs`` forests, each on a fresh subsample of ``train_df``
    restricted to ``feats``, scores every forest on ``val_df`` and returns the
    mean score with its sample standard deviation (``ddof=1``) after passing
    both through ``uncert_round``.

    All configuration (``train_df``, ``val_df``, ``objective``, ``rf_params``,
    ``wgt_name`` and so on) is read from the enclosing scope.
    """
    # Validation weights are only used when a weight column was named.
    val_weights = None if wgt_name is None else val_df[wgt_name]

    # Objectives containing "class" get a classifier, everything else a regressor.
    if 'class' in objective.lower():
        forest_cls = RandomForestClassifier
    else:
        forest_cls = RandomForestRegressor

    results = []
    while len(results) < n_rfs:
        sample_count = None
        if subsample_rate is not None:
            sample_count = int(subsample_rate*len(train_df))
        subsample = subsample_df(train_df, objective=objective, targ_name=targ_name,
                                 strat_key=strat_key, wgt_name=wgt_name,
                                 n_samples=sample_count)
        train_weights = subsample[wgt_name] if wgt_name is not None else None

        forest = forest_cls(**rf_params)
        forest.fit(X=subsample[feats], y=subsample[targ_name], sample_weight=train_weights)
        results.append(forest.score(X=val_df[feats], y=val_df[targ_name], sample_weight=val_weights))

    mean_score = np.mean(results)
    spread = np.std(results, ddof=1)
    return uncert_round(mean_score, spread)
def _get_column_importances(self):
    """Estimate feature importances with a 50-tree random forest.

    Returns an array of ones when no target (``self.y``) is available.
    Otherwise fits a regressor or classifier (chosen by ``self.is_regression``)
    on at most ``self._importance_row_limit`` rows of ``self.X_no_nan`` and
    returns the fitted forest's ``feature_importances_``.
    """
    if self.y is None:
        # NOTE(review): this yields one weight per *row* of X (shape[0]), while
        # the fitted path returns one weight per feature. Looks like it may
        # have been meant to be shape[1]; confirm against callers.
        return np.ones(self.X.shape[0])

    misc.start('_get_column_importances')
    if self.is_regression:
        rf = ensemble.RandomForestRegressor(50)
    else:
        rf = ensemble.RandomForestClassifier(50)

    row_limit = self._importance_row_limit
    rf.fit(self.X_no_nan[:row_limit], self.y[:row_limit])

    # repr() replaces the backtick syntax, which is a SyntaxError on Python 3
    # and produces the same text on Python 2.
    misc.stop('done _get_column_importances, num feats: ' + repr(len(rf.feature_importances_)))
    return rf.feature_importances_
# Script fragment (Python 2 syntax); the loop that supplies `data` is not visible
# and the original indentation was lost.
# Split one record into its leading sign token and the remaining name tokens.
temp_list = data.split(' ')
sign_list.append(temp_list[0].strip())
name_list.append(convertListToString(temp_list[1:]))
# Translate every sign into its class label through class_label_map.
# NOTE(review): len() is taken of this result below, which needs a list,
# i.e. Python 2 map() semantics.
class_labels = map(lambda x:class_label_map[x],sign_list)
# One row per label, 26 columns: column ord(ch) - ord('a') counts occurrences of ch.
# NOTE(review): assumes names contain only lowercase a-z; confirm upstream.
feature_set = np.zeros((len(class_labels),26))
c = 0
for name in name_list:
for character in name:
feature_set[c][ord(character)-ord('a')] = feature_set[c][ord(character)-ord('a')] + 1.0
c = c + 1
# Hold out 20% of the samples for testing.
train_data,test_data,train_labels,test_labels = cross_validation.train_test_split(feature_set,class_labels,test_size=0.2)
# Candidate models; `classifiers` and `classifier_names` are kept in the same order.
rf = RandomForestClassifier(n_estimators=101)
ada = AdaBoostClassifier(n_estimators=101)
grad_boost = GradientBoostingClassifier(n_estimators=101)
bagging = BaggingClassifier(n_estimators=101)
svm = SVC(kernel='rbf')
knn = KNeighborsClassifier(n_neighbors=5)
classifiers = [rf,ada,grad_boost,bagging,svm,knn]
classifier_names = ["Random Forest","AdaBoost","Gradient Boost","Bagging","SVM","KNN"]
# Fit each model on the training split, then print its test accuracy and confusion matrix.
for classifier,classifier_name in zip(classifiers,classifier_names):
classifier.fit(train_data,train_labels)
predicted_labels = classifier.predict(test_data)
print "--------------------------------\n"
print "Accuracy for ",classifier_name," : ",metrics.accuracy_score(test_labels,predicted_labels)
print "Confusion Matrix for ",classifier_name, ":\n",metrics.confusion_matrix(test_labels,predicted_labels)
def fit_logit_and_rfs(dset_names, data_dir, out_dir, classification_only=True, random_state=42):
# For every dataset name: fetch the data, split it into train and test parts,
# fit a logistic regression and a 100-tree random forest, and record each
# model's score on the test part together with a copy of the fitted forest.
# The visible part of the function ends in a truncated pkl.dump call; the
# original indentation was lost, so the extent of the
# `if classification_only:` branch cannot be determined here.
logit_test_scores = []
rf_test_scores = []
rfs = []
for dset_name in tqdm(dset_names):
# data_dir is passed as the local cache directory for the fetch.
X, y = dsets.fetch_data(dset_name, return_X_y=True,
local_cache_dir=data_dir)
# The same random_state seeds the split and both models.
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=random_state)
if classification_only:
logit = LogisticRegression(solver='liblinear', multi_class='auto', random_state=random_state) # liblinear best for small dsets, otherwise lbfgs
rf = RandomForestClassifier(n_estimators=100, random_state=random_state)
# print(dset_name, X.shape)
logit.fit(train_X, train_y)
rf.fit(train_X, train_y)
logit_test_scores.append(logit.score(test_X, test_y))
rf_test_scores.append(rf.score(test_X, test_y))
rfs.append(deepcopy(rf))
# save
# Scores become arrays; all results are gathered into one dict, with
# dset_names supplying the 'dset_name' entry.
logit_test_scores = np.array(logit_test_scores)
rf_test_scores = np.array(rf_test_scores)
classification_results = {'logit_test_score': logit_test_scores,
'rf_test_score': rf_test_scores,
'dset_name': dset_names,
'rf': rfs}
pkl.dump(classification_results,
n_neighbors=4),
n_jobs=-1,
n_estimators=10)),
('bag_svm', BaggingClassifier(
Pipeline([ ('scaling', StandardScaler()),
('rbf_svm', SVC(kernel='rbf',
probability=True,
cache_size=2000,
C=10.0,
gamma='auto',
class_weight='balanced')) ]),
n_jobs=-1,
n_estimators=10)),
('boost_rf', Pipeline([ ('scaling', StandardScaler()),
('adaboost_random_forest', AdaBoostClassifier(
RandomForestClassifier(n_jobs=-1,
n_estimators=500,
max_features='auto'),
n_estimators=100)) ])) ],
voting='soft')
},
# Include inferred class distributions in best stand-alone models of SVM, RF ##################
'expt_45': {
'note': 'add class weights to expt_32',
'name': 'Yeah I work out',
'pl': Pipeline([ ('scaling', StandardScaler()),
('random_forest', RandomForestClassifier(n_jobs=-1,
n_estimators=500,
max_features='auto',
class_weight = {0:0.098,
1:0.111,
2:0.104,
def discrimination_threshold(ax=None):
    """Draw a discrimination-threshold plot for a small random forest.

    Loads the spam dataset, wraps a 10-tree ``RandomForestClassifier`` in a
    ``DiscriminationThreshold`` visualizer bound to ``ax``, and returns the
    result of ``tts_plot`` called with scoring disabled.
    """
    features, target = load_spam(return_dataset=True).to_pandas()
    forest = RandomForestClassifier(n_estimators=10)
    visualizer = DiscriminationThreshold(forest, ax=ax)
    return tts_plot(visualizer, features, target, score=False)
# Fragment of an evaluation routine (Python 2 syntax); the enclosing def and the
# body of the final `with` are not visible, and the original indentation was lost.
num_false_predicted_true = 0.0
num_false_predicted_false = 0.0
timer = 0.0
result = {}
# Optional results file, named after the chosen method.
# NOTE(review): `file` shadows the Python 2 builtin, and no close() is visible
# in this fragment.
file = None
if accuracyresults:
file = open('dataset-accuracyresults-{0}.txt'.format(method),'w+')
# First pass over the tab-separated dataset: rows whose 'res' field is "TRUE"
# count towards num_true, every other row towards num_false.
with open( dataset ) as csvfile:
reader = csv.DictReader( csvfile, fieldnames=[ "s1" , "s2" , "res" , "c1" , "c2", "a1", "a2", "cc1", "cc2"], delimiter='\t' )
for row in reader:
if row['res'] == "TRUE": num_true += 1.0
else: num_false += 1.0
# Two identically configured models are created for the selected method.
# Both stay None when `method` matches none of the branches below.
model1 = None
model2 = None
if method == 'rf':
model1 = ensemble.RandomForestClassifier( n_estimators=600 , random_state=0 , n_jobs=2, max_depth=100)
model2 = ensemble.RandomForestClassifier( n_estimators=600 , random_state=0 , n_jobs=2, max_depth=100)
elif method == 'et':
model1 = ensemble.ExtraTreesClassifier( n_estimators=600 , random_state=0 , n_jobs=2, max_depth=100)
model2 = ensemble.ExtraTreesClassifier( n_estimators=600 , random_state=0 , n_jobs=2, max_depth=100)
elif method == 'svm':
model1 = svm.LinearSVC( random_state=0, C=1.0)
model2 = svm.LinearSVC( random_state=0, C=1.0)
elif method == 'xgboost':
model1 = xgboost.XGBClassifier( n_estimators=3000 , seed=0 )
model2 = xgboost.XGBClassifier( n_estimators=3000 , seed=0 )
# Separate feature/label containers, one pair per model.
X1 = []
Y1 = []
X2 = []
Y2 = []
print "Reading dataset..."
# Second pass over the same dataset file (its body is outside the visible fragment).
with open( dataset ) as csvfile:
def rf(data, nlabels, training, test):
    """Compare a bootstrapped and a non-bootstrapped random forest.

    Both forests are handed to ``fit_and_error`` first with the ``training``
    mask and then with the ``test`` mask, always bootstrapped forest first.

    Returns:
        A 6-tuple ``(bootstrapped_forest, plain_forest, train_error_boot,
        train_error_plain, test_error_boot, test_error_plain)``.
    """
    with_bootstrap = RandomForestClassifier(n_jobs=-1)
    without_bootstrap = RandomForestClassifier(n_jobs=-1, bootstrap=False)
    forests = (with_bootstrap, without_bootstrap)

    # Errors for the training mask, bootstrapped forest first.
    train_errors = [
        fit_and_error(model=forest, data=data, labels=nlabels, mask=training)
        for forest in forests
    ]
    # Errors for the test mask, in the same forest order.
    test_errors = [
        fit_and_error(model=forest, data=data, labels=nlabels, mask=test)
        for forest in forests
    ]

    return (
        with_bootstrap,
        without_bootstrap,
        train_errors[0],
        train_errors[1],
        test_errors[0],
        test_errors[1],
    )
def get_classifier(clf_type: ClassifierType):
    """Create the estimator that corresponds to ``clf_type``.

    Every estimator is seeded with ``random_state=42`` and has no depth limit;
    both forest variants use 100 trees and three parallel jobs.

    Raises:
        ValueError: if ``clf_type`` is none of the supported members.
    """
    if clf_type == ClassifierType.DECISION_TREE:
        return DecisionTreeClassifier(criterion="gini", max_depth=None, random_state=42)
    if clf_type == ClassifierType.RANDOM_FOREST:
        return RandomForestClassifier(n_estimators=100, max_depth=None, n_jobs=3, random_state=42)
    if clf_type == ClassifierType.RANDOM_FOREST_REGRESSOR:
        return RandomForestRegressor(n_estimators=100, max_depth=None, n_jobs=3, random_state=42)
    raise ValueError("Unknown classifier type specified")