How to use the sklearn.ensemble.RandomForestClassifier class in sklearn

To help you get started, we’ve selected a few sklearn examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github scotthlee / document_classification / rf.py View on Github external
rf = RandomForestClassifier(n_estimators=self.trees, 
                              class_weight='balanced_subsample', n_jobs=jobs)
		mod = rf.fit(x, y)
		importances = mod.feature_importances_
			
		if prune:
			# Trimming the tree to the top features
			sorted_indices = np.argsort(importances)
			trimmed_indices = np.array(sorted_indices[-top:])
			self.feature_indices = trimmed_indices
			
			# Pruning the unnecessary features from the training data
			X = deepcopy(x[:, trimmed_indices])
			
			# Training a new forest on the pruned data
			mod = RandomForestClassifier(n_estimators=self.trees, 
                                class_weight='balanced_subsample', 
                                n_jobs=jobs)
			mod.fit(X, y)
			
			# Passing attributes up to the instance			
			self.feature_importances = importances
			self.pruned = True
		
		# Setting the model attribute for the instance
		self.mod = mod
github GilesStrong / lumin / lumin / optimisation / features.py View on Github external
def _get_score(feats:List[str]) -> Tuple[float,float]:
    """Score the feature subset ``feats`` with repeatedly trained random forests.

    A forest is trained on a fresh subsample of ``train_df`` and scored on
    ``val_df`` until ``n_rfs`` scores have been collected. The mean of the
    scores and their sample standard deviation (``ddof=1``) are passed to
    ``uncert_round`` and its result is returned.

    All data and configuration (``train_df``, ``val_df``, ``objective``,
    ``rf_params``, ...) are read from the enclosing scope.
    """
    # Validation weights are only looked up when a weight column is configured.
    val_weights = None if wgt_name is None else val_df[wgt_name]

    # Objectives containing "class" get a classifier; everything else a regressor.
    if 'class' in objective.lower():
        forest_cls = RandomForestClassifier
    else:
        forest_cls = RandomForestRegressor

    collected = []
    while len(collected) < n_rfs:
        if subsample_rate is not None:
            sample_count = int(subsample_rate*len(train_df))
        else:
            sample_count = None
        subsample = subsample_df(train_df, objective=objective, targ_name=targ_name, strat_key=strat_key,
                                 wgt_name=wgt_name, n_samples=sample_count)
        trn_weights = subsample[wgt_name] if wgt_name is not None else None

        forest = forest_cls(**rf_params)
        forest.fit(X=subsample[feats], y=subsample[targ_name], sample_weight=trn_weights)
        collected.append(forest.score(X=val_df[feats], y=val_df[targ_name], sample_weight=val_weights))

    mean_score = np.mean(collected)
    spread = np.std(collected, ddof=1)
    return uncert_round(mean_score, spread)
github gatapia / py_ml_utils / pandas_extensions / describe.py View on Github external
def _get_column_importances(self):
    """Return one importance value per column of the data.

    When no target (``self.y``) is available, every column receives the same
    weight of 1.0. Otherwise a 50-tree random forest (regressor or classifier,
    depending on ``self.is_regression``) is fitted on at most
    ``self._importance_row_limit`` rows and its ``feature_importances_`` are
    returned.
    """
    if self.y is None:
        # One entry per *column* (shape[1]), matching the length of
        # ``feature_importances_`` returned below. The previous shape[0]
        # produced one entry per row instead.
        # NOTE(review): assumes self.X is 2-D with the same columns as
        # self.X_no_nan -- confirm against the caller.
        return np.ones(self.X.shape[1])

    misc.start('_get_column_importances')
    if self.is_regression:
        rf = ensemble.RandomForestRegressor(50)
    else:
        rf = ensemble.RandomForestClassifier(50)

    row_limit = self._importance_row_limit
    rf.fit(self.X_no_nan[:row_limit], self.y[:row_limit])

    importances = rf.feature_importances_
    # str() replaces the Python-2-only backtick repr; identical output for ints.
    misc.stop('done _get_column_importances, num feats: ' + str(len(importances)))
    return importances
github rupakc / UCI-Data-Analysis / Badge Dataset / badge.py View on Github external
temp_list = data.split(' ')
    sign_list.append(temp_list[0].strip())
    name_list.append(convertListToString(temp_list[1:]))

# Translate every entry of sign_list into its class label via class_label_map.
# NOTE(review): len() is applied to the result of map() below, which only works
# where map() returns a list (Python 2); the print statements further down are
# Python 2 syntax as well.
class_labels = map(lambda x:class_label_map[x],sign_list)
# One row per sample and 26 columns -- presumably one column per letter a-z
# (TODO confirm).
feature_set = np.zeros((len(class_labels),26)) 

# Build per-name character counts: each character increments the column at
# index ord(character) - ord('a') in row c.
# NOTE(review): assumes every character lies in 'a'..'z'; anything else would
# fall outside columns 0-25 (negative offsets wrap around in NumPy indexing) --
# confirm what convertListToString produces.
c = 0
for name in name_list:
    for character in name:
        feature_set[c][ord(character)-ord('a')] = feature_set[c][ord(character)-ord('a')] + 1.0
    c = c + 1

# Hold out 20% of the samples for testing (test_size=0.2).
train_data,test_data,train_labels,test_labels = cross_validation.train_test_split(feature_set,class_labels,test_size=0.2)

# Candidate models; the four ensemble classifiers each use 101 estimators.
rf = RandomForestClassifier(n_estimators=101)
ada = AdaBoostClassifier(n_estimators=101)
grad_boost = GradientBoostingClassifier(n_estimators=101)
bagging = BaggingClassifier(n_estimators=101)
svm = SVC(kernel='rbf')
knn = KNeighborsClassifier(n_neighbors=5)

# The two lists are paired positionally by zip() below, so their order must match.
classifiers = [rf,ada,grad_boost,bagging,svm,knn]
classifier_names = ["Random Forest","AdaBoost","Gradient Boost","Bagging","SVM","KNN"]

# Fit each classifier on the training split, then print its accuracy and
# confusion matrix computed on the held-out test split.
for classifier,classifier_name in zip(classifiers,classifier_names):
    classifier.fit(train_data,train_labels)
    predicted_labels = classifier.predict(test_data) 
    
    print "--------------------------------\n"
    print "Accuracy for ",classifier_name," : ",metrics.accuracy_score(test_labels,predicted_labels)
    print "Confusion Matrix for ",classifier_name, ":\n",metrics.confusion_matrix(test_labels,predicted_labels)
github csinva / disentangled-attribution-curves / pmlb_comparisons / train.py View on Github external
def fit_logit_and_rfs(dset_names, data_dir, out_dir, classification_only=True, random_state=42):
    
    logit_test_scores = []
    rf_test_scores = []
    rfs = []

    for dset_name in tqdm(dset_names):
        X, y = dsets.fetch_data(dset_name, return_X_y=True, 
                          local_cache_dir=data_dir)


        train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=random_state)
        if classification_only:
            logit = LogisticRegression(solver='liblinear', multi_class='auto', random_state=random_state) # liblinear best for small dsets, otherwise lbfgs
            rf = RandomForestClassifier(n_estimators=100, random_state=random_state)
    #     print(dset_name, X.shape)
        logit.fit(train_X, train_y)
        rf.fit(train_X, train_y)

        logit_test_scores.append(logit.score(test_X, test_y))
        rf_test_scores.append(rf.score(test_X, test_y))
        rfs.append(deepcopy(rf))

    # save
    logit_test_scores = np.array(logit_test_scores)
    rf_test_scores = np.array(rf_test_scores)
    classification_results = {'logit_test_score': logit_test_scores,
               'rf_test_score': rf_test_scores,
               'dset_name': dset_names,
               'rf': rfs}
    pkl.dump(classification_results,
github jrmontag / mnist-sklearn / models.py View on Github external
n_neighbors=4), 
                                        n_jobs=-1,
                                        n_estimators=10)),
                        ('bag_svm', BaggingClassifier( 
                                        Pipeline([ ('scaling', StandardScaler()), 
                                                    ('rbf_svm', SVC(kernel='rbf', 
                                                                    probability=True,
                                                                    cache_size=2000,
                                                                    C=10.0,
                                                                    gamma='auto',
                                                                    class_weight='balanced')) ]),    
                                        n_jobs=-1,
                                        n_estimators=10)),
                        ('boost_rf', Pipeline([ ('scaling', StandardScaler()), 
                                                ('adaboost_random_forest', AdaBoostClassifier( 
                                                                                RandomForestClassifier(n_jobs=-1,
                                                                                                n_estimators=500,
                                                                                                max_features='auto'),
                                                                                n_estimators=100)) ])) ],
                    voting='soft')
        },
    # Include inferred class distributions in best stand-alone models of SVM, RF ################## 
    'expt_45': { 
        'note': 'add class weights to expt_32',
        'name': 'Yeah I work out',
        'pl': Pipeline([ ('scaling', StandardScaler()), 
                        ('random_forest', RandomForestClassifier(n_jobs=-1,
                                                                    n_estimators=500,
                                                                    max_features='auto',
                                                                    class_weight = {0:0.098, 
                                                                                    1:0.111, 
                                                                                    2:0.104,
github DistrictDataLabs / yellowbrick / docs / images / readme / readme_imgs.py View on Github external
def discrimination_threshold(ax=None):
    """Render the DiscriminationThreshold example on the spam dataset.

    A 10-estimator random forest is wrapped in a ``DiscriminationThreshold``
    visualizer drawn on ``ax`` and handed to ``tts_plot`` with scoring
    disabled. Whatever ``tts_plot`` returns is returned unchanged.
    """
    X, y = load_spam(return_dataset=True).to_pandas()

    forest = RandomForestClassifier(n_estimators=10)
    visualizer = DiscriminationThreshold(forest, ax=ax)
    return tts_plot(visualizer, X, y, score=False)
github ruipds / Toponym-Matching / featureclassifiers.py View on Github external
num_false_predicted_true = 0.0
    num_false_predicted_false = 0.0
    timer = 0.0
    result = {}
    file = None
    if accuracyresults:
        file = open('dataset-accuracyresults-{0}.txt'.format(method),'w+')
    with open( dataset ) as csvfile:
        reader = csv.DictReader( csvfile, fieldnames=[ "s1" , "s2" , "res" , "c1" , "c2", "a1", "a2", "cc1", "cc2"], delimiter='\t' )
        for row in reader:
            if row['res'] == "TRUE": num_true += 1.0
            else: num_false += 1.0
    model1 = None
    model2 = None
    if method == 'rf':
        model1 = ensemble.RandomForestClassifier( n_estimators=600 , random_state=0 , n_jobs=2, max_depth=100)
        model2 = ensemble.RandomForestClassifier( n_estimators=600 , random_state=0 , n_jobs=2, max_depth=100)
    elif method == 'et':
        model1 = ensemble.ExtraTreesClassifier( n_estimators=600 , random_state=0 , n_jobs=2, max_depth=100)
        model2 = ensemble.ExtraTreesClassifier( n_estimators=600 , random_state=0 , n_jobs=2, max_depth=100)
    elif method == 'svm':
        model1 = svm.LinearSVC( random_state=0, C=1.0)
        model2 = svm.LinearSVC( random_state=0, C=1.0)
    elif method == 'xgboost':
        model1 = xgboost.XGBClassifier( n_estimators=3000 , seed=0 )
        model2 = xgboost.XGBClassifier( n_estimators=3000 , seed=0 )
    X1 = []
    Y1 = []
    X2 = []
    Y2 = []
    print "Reading dataset..."
    with open( dataset ) as csvfile:
github mgmacias95 / Flower-Recognition / model.py View on Github external
def rf(data, nlabels, training, test):
    """Compare a bootstrapped and a non-bootstrapped random forest.

    Both forests are passed to ``fit_and_error`` first with the ``training``
    mask and then with the ``test`` mask (bootstrapped forest first each time).

    Returns a 6-tuple: the bootstrapped forest, the non-bootstrapped forest,
    their errors for the training mask, and their errors for the test mask.
    """
    with_bootstrap = RandomForestClassifier(n_jobs=-1)
    without_bootstrap = RandomForestClassifier(n_jobs=-1, bootstrap=False)
    forests = (with_bootstrap, without_bootstrap)

    # Errors reported by fit_and_error for the training mask.
    train_errors = [
        fit_and_error(model=forest, data=data, labels=nlabels, mask=training)
        for forest in forests
    ]
    # Errors reported by fit_and_error for the test mask.
    test_errors = [
        fit_and_error(model=forest, data=data, labels=nlabels, mask=test)
        for forest in forests
    ]

    return (
        with_bootstrap,
        without_bootstrap,
        train_errors[0],
        train_errors[1],
        test_errors[0],
        test_errors[1],
    )
github PUTvision / decision_tree / decision_trees / utils / constants.py View on Github external
def get_classifier(clf_type: ClassifierType):
    """Build the estimator that corresponds to ``clf_type``.

    Every estimator is created with ``max_depth=None`` and ``random_state=42``;
    the forests additionally use 100 estimators and ``n_jobs=3``.

    Raises:
        ValueError: if ``clf_type`` is not one of the supported types.
    """
    if clf_type == ClassifierType.DECISION_TREE:
        return DecisionTreeClassifier(criterion="gini", max_depth=None, random_state=42)

    if clf_type == ClassifierType.RANDOM_FOREST:
        return RandomForestClassifier(n_estimators=100, max_depth=None, n_jobs=3, random_state=42)

    if clf_type == ClassifierType.RANDOM_FOREST_REGRESSOR:
        return RandomForestRegressor(n_estimators=100, max_depth=None, n_jobs=3, random_state=42)

    raise ValueError("Unknown classifier type specified")