How to use the pandas.get_dummies function in pandas

To help you get started, we've selected a few pandas.get_dummies examples based on popular ways it is used in public projects.
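
Before the project snippets, here is a minimal, self-contained sketch of the basic call (the frame and column names here are invented for illustration):

import pandas as pd

df = pd.DataFrame({"city": ["Oslo", "Lima", "Oslo"], "price": [10, 12, 9]})

# One-hot encode the categorical column; numeric columns pass through unchanged.
encoded = pd.get_dummies(df, columns=["city"], prefix="city")
print(encoded.columns.tolist())  # ['price', 'city_Lima', 'city_Oslo']

Passing drop_first=True drops the first level of each encoded column, which some of the snippets below use to avoid redundant (perfectly collinear) dummy columns.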

github algorithmica-repository / datascience / 2017-feb / 14.kaggle-IV(classification) / titanic-v6(fe,pre,mb).py
        if not id.isdigit() and len(id) > 0:
            return id.upper()
        else:
            return 'X'

titanic_train['TicketId'] = titanic_train['Ticket'].map(extract_id)

pd.crosstab(index=titanic_train["TicketId"], columns="count")   
pd.crosstab(index=titanic_train['Survived'], columns=titanic_train['TicketId'])
sns.factorplot(x="Survived", hue="TicketId", data=titanic_train, kind="count", size=6)  # in seaborn >= 0.9 this is sns.catplot(..., height=6)

# process fare column
# Note: preprocessing.Imputer was removed in scikit-learn 0.22; newer versions
# use sklearn.impute.SimpleImputer(strategy="median") instead.
fare_imputer = preprocessing.Imputer(strategy="median")
fare_imputer.fit(titanic_train[['Fare']])

titanic_train1 = pd.get_dummies(titanic_train, columns=['Pclass', 'FamilyType', 'Embarked', 'Sex','Title','TicketId'])
type(titanic_train1)
titanic_train1.info()
titanic_train1.drop(['PassengerId','Name','FamilySize','SibSp','Parch','Ticket','Cabin','Survived'], axis=1, inplace=True)

X_train = titanic_train1
y_train = titanic_train['Survived']

parameter_grid = dict(n_estimators=[300,400],
                      criterion=['gini','entropy'],
                      max_features=[3,4,5,6,7,8])
rf_estimator = ensemble.RandomForestClassifier(random_state=100)
rf_grid_estimator = model_selection.GridSearchCV(estimator=rf_estimator, param_grid=parameter_grid, cv=10, verbose=1, n_jobs=10, refit=True)
rf_grid_estimator.fit(X_train,y_train)
rf_grid_estimator.grid_scores_  # removed in scikit-learn 0.20; use rf_grid_estimator.cv_results_ in newer versions

titanic_test = pd.read_csv("test.csv")
github sql-machine-learning / sqlflow / doc / tutorial / apsara2019 / carprice_xgboost / carprice_preprocessing.py
def dataPrepare(X, y):
    RANDOM_STATE = 20
    TEST_SIZE = 0.3
    numList = X.dtypes[X.dtypes != 'object'].index
    objList = X.dtypes[X.dtypes == 'object'].index 
    if len(numList)>0:
        X_num = pd.DataFrame(preprocessing.StandardScaler().fit_transform(X[numList]),columns=numList)
        X_num.reset_index(drop=True, inplace=True)
    else:
        X_num = pd.DataFrame([])

    if len(objList)>0:
        X_dummy = pd.concat([pd.get_dummies(X[i],prefix = i, drop_first=True) for i in objList], axis=1)
        X_dummy.reset_index(drop=True, inplace=True)
    else:
        X_dummy = pd.DataFrame([])
    X_final = pd.concat([X_num, X_dummy], axis = 1)
    Xtrain, Xtest, ytrain, ytest = train_test_split(X_final, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)  # split the dataset into training and test sets
    trainSet = pd.concat([Xtrain, ytrain], axis=1)
    testSet = pd.concat([Xtest, ytest], axis=1)
    return trainSet, testSet
github akelleh / causality / causality / estimation / parametric.py
    def fit_WLS(self, X, assignment, outcome, confounder_types, weight_name='weights', intercept=True):
        df = X[[assignment, outcome]].copy()
        regression_confounders = []
        for confounder, var_type in confounder_types.items():
            if var_type == 'o' or var_type == 'u':
                c_dummies = pd.get_dummies(X[[confounder]], prefix=confounder)
                if len(c_dummies.columns) == 1:
                    df = pd.concat([df, c_dummies[c_dummies.columns]], axis=1)
                    regression_confounders.extend(c_dummies.columns)
                else:
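                    # Drop the first dummy level as the reference category so the
                    # remaining dummies are not collinear with the intercept.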
                    df = pd.concat([df, c_dummies[c_dummies.columns[1:]]], axis=1)
                    regression_confounders.extend(c_dummies.columns[1:])
            else:
                regression_confounders.append(confounder)
                df.loc[:, confounder] = X[confounder].copy()
        if intercept:
            df.loc[:, 'intercept'] = 1.
            regression_confounders.append('intercept')
        model = WLS(df[outcome], df[[assignment] + regression_confounders], weights=X[weight_name])
        result = model.fit()
        self.wls_model = result
github renatoosousa / GeneticAlgorithmForFeatureSelection / gaFeatureSelection.py
def getFitness(individual, X, y):
    """
    Feature subset fitness function
    """

    if(individual.count(0) != len(individual)):
        # indices of the features switched off (value 0) in the individual
        cols = [index for index in range(len(individual)) if individual[index] == 0]

        # get features subset
        X_parsed = X.drop(X.columns[cols], axis=1)
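        # With no `columns=` argument, get_dummies encodes every object/category
        # column in the subset and passes numeric columns through unchanged.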
        X_subset = pd.get_dummies(X_parsed)

        # apply classification algorithm
        clf = LogisticRegression()

        return (avg(cross_val_score(clf, X_subset, y, cv=5)),)
    else:
        return(0,)
github HealthCatalyst / healthcareai-py / healthcareai / common / transformers.py
X,
            self.excluded_columns)

        for col in columns_to_dummify:
            # Convert both object and category to object then category to force
            # unseen levels out of category type columns. Low tech hack.
            # There's probably a more elegant pandas way of doing this.
            temp_object = X[col].astype(object)

            # TODO nans are being inserted for categories that are unseen. Mode!
            # Note: astype('category', categories=...) was removed in later pandas
            # versions; the equivalent is astype(pd.CategoricalDtype(self.categorical_levels[col])).
            X[col] = temp_object.astype(
                'category',
                categories=self.categorical_levels[col])

        # Create dummy variables
        X = pd.get_dummies(
            X,
            columns=columns_to_dummify,
            drop_first=False, prefix_sep='.')

        return X
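
The transformer above stores the training-time category levels so that the dummy layout stays stable for unseen data. A lighter-weight sketch of the same idea (not the healthcareai-py API; the frames here are invented) is to reindex newly encoded data against the training columns:

import pandas as pd

train = pd.DataFrame({"color": ["red", "blue", "red"]})
new = pd.DataFrame({"color": ["blue", "green"]})  # "green" never appeared in training

train_encoded = pd.get_dummies(train, columns=["color"], prefix_sep=".")
new_encoded = pd.get_dummies(new, columns=["color"], prefix_sep=".")

# Force the new data into the training layout: unseen levels are dropped and
# levels missing from the new data become all-zero columns.
new_encoded = new_encoded.reindex(columns=train_encoded.columns, fill_value=0)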
github GaelVaroquaux / interpreting_ml_tuto / _downloads / 02_cross_validation.py
g.map_lower(sns.kdeplot)
g.map_upper(plt.scatter, s=2)
g.map_diag(sns.kdeplot, lw=3)


###############################################################
# **Prediction**: Can we predict test grades in maths from demographics
# (i.e., not from other grades)?

# A bit of feature engineering to get a numerical matrix (easily done
# with the ColumnTransformer in scikit-learn >= 0.20)
X = exams.drop(columns=['StudentID', 'Maths', 'Ravens', 'English'])
# Encode gender as an integer variable
X['Gender'] = X['Gender'] == 'Girl'
# One-hot encode social class
X = pd.get_dummies(X, drop_first=True)
y = exams['Maths']

from sklearn import ensemble
print(cross_val_score(ensemble.GradientBoostingRegressor(), X, y,
                      cv=10).mean())
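
###############################################################
# As noted above, the same encoding can be done inside a scikit-learn
# pipeline with ColumnTransformer (>= 0.20). A minimal sketch, reusing
# the `exams` frame and `y` defined above; OneHotEncoder plays the role
# of pd.get_dummies here.
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

X_raw = exams.drop(columns=['StudentID', 'Maths', 'Ravens', 'English'])
encoder = ColumnTransformer(
    [('onehot', OneHotEncoder(handle_unknown='ignore'),
      X_raw.select_dtypes(include='object').columns.tolist())],
    remainder='passthrough')
print(cross_val_score(make_pipeline(encoder, ensemble.GradientBoostingRegressor()),
                      X_raw, y, cv=10).mean())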

###############################################################
# We can predict!
#
# But there is one caveat: are we simply learning to recognize students
# across the years? There is a lot of implicit information about students:
# notably in the school ID and the class ID.
#
# **Stratification** To test for this, we can make sure that we have
# different students in the train and the test set.
from sklearn import model_selection
github ynulonger / DE_CNN / cnn.py
#offset = (k * accuracy_batch_size) % (test_y.shape[0] - accuracy_batch_size)
            test_cnn_batch = cnn_test_x[start:(offset + start), :, :, :, :]
            test_cnn_batch = test_cnn_batch.reshape(len(test_cnn_batch) * window_size, input_height, input_width, input_channel_num)
            test_batch_y = test_y[start:(offset + start), :]

            test_a, test_c, test_p, test_r = session.run([accuracy, cost, y_pred, y_posi],
                                                         feed_dict={cnn_in: test_cnn_batch,Y: test_batch_y, keep_prob: 1.0, phase_train: False})
            test_t = test_batch_y

            test_accuracy = np.append(test_accuracy, test_a)
            test_loss = np.append(test_loss, test_c)
            test_pred = np.append(test_pred, test_p)
            test_true = np.vstack([test_true, test_t])
            test_posi = np.vstack([test_posi, test_r])
        # test_true = tf.argmax(test_true, 1)
        test_pred_1_hot = np.asarray(pd.get_dummies(test_pred), dtype=np.int8)
        test_true_list = tf.argmax(test_true, 1).eval()

        print("(" + time.asctime(time.localtime(time.time())) + ") Final Test Cost: ", np.mean(test_loss),
              "Final Test Accuracy: ", np.mean(test_accuracy))
        # save result
    #    os.system("mkdir -p ./result/cnn_rnn_parallel/tune_rnn_layer/" + output_dir)
        result = pd.DataFrame(
            {'epoch': range(1, epoch + 2), "train_accuracy": train_accuracy_save, "test_accuracy": test_accuracy_save,
             "train_loss": train_loss_save, "test_loss": test_loss_save})

        ins = pd.DataFrame({'conv_1': conv_1_shape, 'conv_2': conv_2_shape,'conv_3': conv_3_shape,
                            'cnn_fc': fc_size,'accuracy': np.mean(test_accuracy),
                            'keep_prob': 1 - dropout_prob,"epoch": epoch + 1, "norm": norm_type,
                            "learning_rate": learning_rate, "regularization": regularization_method,
                            "train_sample": train_sample, "test_sample": test_sample,"batch_size":batch_size}, index=[0])
    #    summary = pd.DataFrame({'class': one_hot_labels, 'recall': test_recall, 'precision': test_precision,
github yubin-park / bonsai-dt / research / utils.py
    misc_grp = "_others"
    for cname in X_cat.columns:
        x_cname = X_cat[cname].values
        x_cname[pd.isnull(x_cname)] = na_grp
        val_cnt = Counter(x_cname).most_common()
        val_under_thr = [val for val, cnt in val_cnt 
                        if (cnt+0.0)/n < cat_ratio_thr]
        if len(val_under_thr) > 0:
            x_cname[np.in1d(x_cname, val_under_thr)] = misc_grp
        if len(val_cnt) > cat_max_k:
            val_elig = [val for val, cnt in val_cnt[:(cat_max_k-1)]]
            x_cname[~np.in1d(x_cname, val_elig)] = misc_grp
        X_cat.loc[:,cname] = x_cname
    # 1.2.
    if len(X_cat.columns) > 0:
        X_list.append(pd.get_dummies(X_cat))
    
    col_num_exc = []
    for cname in X_num.columns:
        mean = X_num[cname].mean()
        std = X_num[cname].std()
        # 2.1.
        if std < num_min_std or np.isnan(std):
            col_num_exc.append(cname)
            continue
        # 2.2.
        X_num.loc[:,cname] = (X_num[cname].values - mean)/std
    # 2.3. 
    if len(col_num_exc) > 0:
        X_list.append(X_num.drop(col_num_exc, axis=1))
    else:
        X_list.append(X_num)
github SuyashLakhotia / RossmannStoreSales / linearregression-independent3.py
# test_df["Month"] = test_df["Date"].apply(lambda x: dt.datetime.strptime(x, "%Y-%m-%d").month)

# Create "YearMonth" column
# training_df["YearMonth"] = training_df["Date"].apply(lambda x: str(dt.datetime.strptime(x, "%Y-%m-%d").year) + "-" + less_than_ten(str(dt.datetime.strptime(x, "%Y-%m-%d").month)))
# test_df["YearMonth"] = test_df["Date"].apply(lambda x: str(dt.datetime.strptime(x, "%Y-%m-%d").year) + "-" + less_than_ten(str(dt.datetime.strptime(x, "%Y-%m-%d").month)))

# "StateHoliday" has values "0" & 0
training_df["StateHoliday"].loc[training_df["StateHoliday"] == 0] = "0"
test_df["StateHoliday"].loc[test_df["StateHoliday"] == 0] = "0"

# Create "StateHolidayBinary" column
# training_df["StateHolidayBinary"] = training_df["StateHoliday"].map({0: 0, "0": 0, "a": 1, "b": 1, "c": 1})
# test_df["StateHolidayBinary"] = test_df["StateHoliday"].map({0: 0, "0": 0, "a": 1, "b": 1, "c": 1})

# One-hot encoding of "DayOfWeek" & "StateHoliday" columns
training_df = pd.get_dummies(training_df, columns=["DayOfWeek", "StateHoliday"])
test_df = pd.get_dummies(test_df, columns=["DayOfWeek", "StateHoliday"])
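# Note: encoding the training and test frames separately can yield mismatched
# dummy columns when a level (e.g. a holiday type) appears in only one of them;
# a common safeguard is to reindex the test dummies to the training columns
# (fill_value=0) after encoding.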

############################################
# store_df                                 #
############################################

# Fill NaN values in store_df for "CompetitionDistance" = 0 (since no record exists where "CD" = NaN & "COS[Y/M]" = !NaN)
# store_df["CompetitionDistance"][is_nan(store_df["CompetitionDistance"])] = 0

# Fill NaN values in store_df for "CompetitionSince[X]" with 1900-01
# store_df["CompetitionOpenSinceYear"][(store_df["CompetitionDistance"] != 0) & (is_nan(store_df["CompetitionOpenSinceYear"]))] = 1900
# store_df["CompetitionOpenSinceMonth"][(store_df["CompetitionDistance"] != 0) & (is_nan(store_df["CompetitionOpenSinceMonth"]))] = 1

# One-hot encoding of "StoreType" & "Assortment" columns
# store_df = pd.get_dummies(store_df, columns=["StoreType", "Assortment"])
github nabeel-oz / qlik-py-tools / core / _machine_learning.py
        # Set up an empty data frame for data to be scaled
        scale_df = pd.DataFrame()

        ohe_df = None
        hash_df = None
        cv_df = None
        tfidf_df = None
        text_df = None
        
        if self.ohe:
            # Get a subset of the data that requires one hot encoding
            ohe_df = X[self.ohe_meta.index.tolist()]
                
            # Apply one hot encoding to relevant columns
            ohe_df = pd.get_dummies(ohe_df, columns=ohe_df.columns)
            
            # Keep a copy of the OHE dataframe structure so we can align the transform dataset 
            self.ohe_df_structure = pd.DataFrame().reindex_like(ohe_df)
        
        # Scaling needs to be fit exclusively on the training data so as not to influence the results
        if self.scale:
            # Get a subset of the data that requires scaling
            scale_df = X[self.scale_meta.index.tolist()]
                   
        if self.hash:
            # Get a subset of the data that requires feature hashing
            hash_df = X[self.hash_meta.index.tolist()]
            hash_cols = hash_df.columns

            # Hash unique values for each relevant column and then join to a dataframe for hashed data
            for c in hash_cols: