def extract_id(id):
    # Non-numeric ticket strings become an uppercased id; purely numeric tickets get 'X'
    if not id.isdigit() and len(id) > 0:
        return id.upper()
    else:
        return 'X'
titanic_train['TicketId'] = titanic_train['Ticket'].map(extract_id)
pd.crosstab(index=titanic_train["TicketId"], columns="count")
pd.crosstab(index=titanic_train['Survived'], columns=titanic_train['TicketId'])
sns.catplot(x="Survived", hue="TicketId", data=titanic_train, kind="count", height=6)
# process fare column: impute missing fares with the median
from sklearn.impute import SimpleImputer
fare_imputer = SimpleImputer(strategy="median")
titanic_train[['Fare']] = fare_imputer.fit_transform(titanic_train[['Fare']])
titanic_train1 = pd.get_dummies(titanic_train, columns=['Pclass', 'FamilyType', 'Embarked', 'Sex','Title','TicketId'])
type(titanic_train1)
titanic_train1.info()
titanic_train1.drop(['PassengerId','Name','FamilySize','SibSp','Parch','Ticket','Cabin','Survived'], axis=1, inplace=True)
X_train = titanic_train1
y_train = titanic_train['Survived']
parameter_grid = dict(n_estimators=[300, 400],
                      criterion=['gini', 'entropy'],
                      max_features=[3, 4, 5, 6, 7, 8])
rf_estimator = ensemble.RandomForestClassifier(random_state=100)
rf_grid_estimator = model_selection.GridSearchCV(estimator=rf_estimator, param_grid=parameter_grid, cv=10, verbose=1, n_jobs=10, refit=True)
rf_grid_estimator.fit(X_train,y_train)
rf_grid_estimator.cv_results_
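# Follow-up sketch (not in the original): with refit=True the tuned model is
# available directly on the fitted grid-search object.
print(rf_grid_estimator.best_params_)
print(rf_grid_estimator.best_score_)
rf_best_model = rf_grid_estimator.best_estimator_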
titanic_test = pd.read_csv("test.csv")
def dataPrepare(X, y):
    RANDOM_STATE = 20
    TEST_SIZE = 0.3
    numList = X.dtypes[X.dtypes != 'object'].index
    objList = X.dtypes[X.dtypes == 'object'].index
    if len(numList) > 0:
        X_num = pd.DataFrame(preprocessing.StandardScaler().fit_transform(X[numList]), columns=numList)
        X_num.reset_index(drop=True, inplace=True)
    else:
        X_num = pd.DataFrame([])
    if len(objList) > 0:
        X_dummy = pd.concat([pd.get_dummies(X[i], prefix=i, drop_first=True) for i in objList], axis=1)
        X_dummy.reset_index(drop=True, inplace=True)
    else:
        X_dummy = pd.DataFrame([])
    X_final = pd.concat([X_num, X_dummy], axis=1)
    Xtrain, Xtest, ytrain, ytest = train_test_split(X_final, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)  # split the dataset into training and test sets
    trainSet = pd.concat([Xtrain, ytrain], axis=1)
    testSet = pd.concat([Xtest, ytest], axis=1)
    return trainSet, testSet
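# Minimal usage sketch of dataPrepare (the toy frame below is assumed for
# illustration, not from the original):
demo_X = pd.DataFrame({'age': [22, 35, 58, 41, 30, 27],
                       'sex': ['m', 'f', 'f', 'm', 'f', 'm']})
demo_y = pd.Series([0, 1, 1, 0, 1, 0], name='label')
demo_train, demo_test = dataPrepare(demo_X, demo_y)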
def fit_WLS(self, X, assignment, outcome, confounder_types, weight_name='weights', intercept=True):
    df = X[[assignment, outcome]].copy()
    regression_confounders = []
    for confounder, var_type in confounder_types.items():
        if var_type == 'o' or var_type == 'u':
            # categorical confounder: dummify, dropping the reference level when there is more than one
            c_dummies = pd.get_dummies(X[[confounder]], prefix=confounder)
            if len(c_dummies.columns) == 1:
                df = pd.concat([df, c_dummies[c_dummies.columns]], axis=1)
                regression_confounders.extend(c_dummies.columns)
            else:
                df = pd.concat([df, c_dummies[c_dummies.columns[1:]]], axis=1)
                regression_confounders.extend(c_dummies.columns[1:])
        else:
            # continuous confounder: enter it into the regression as-is
            regression_confounders.append(confounder)
            df.loc[:, confounder] = X[confounder].copy()
    if intercept:
        df.loc[:, 'intercept'] = 1.
        regression_confounders.append('intercept')
    model = WLS(df[outcome], df[[assignment] + regression_confounders], weights=X[weight_name])
    result = model.fit()
    self.wls_model = result
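# Standalone sketch of the WLS call used above (statsmodels API; the toy
# arrays are assumed for illustration):
import numpy as np
from statsmodels.api import WLS
demo_design = np.column_stack([np.ones(5), np.arange(5.0)])  # intercept + one regressor
demo_y = np.array([0.1, 1.2, 1.9, 3.1, 4.0])
demo_w = np.array([1.0, 1.0, 0.5, 0.5, 1.0])
print(WLS(demo_y, demo_design, weights=demo_w).fit().params)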
def getFitness(individual, X, y):
    """
    Feature subset fitness function
    """
    if individual.count(0) != len(individual):
        # get indexes of features switched off (value 0) in this individual
        cols = [index for index in range(
            len(individual)) if individual[index] == 0]
        # get features subset
        X_parsed = X.drop(X.columns[cols], axis=1)
        X_subset = pd.get_dummies(X_parsed)
        # apply classification algorithm; np.mean replaces the undefined avg()
        clf = LogisticRegression()
        return (np.mean(cross_val_score(clf, X_subset, y, cv=5)),)
    else:
        return (0,)
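# Hedged usage sketch (toy data assumed): score a bit-mask individual that
# keeps the first two of three candidate features.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
demo_X = pd.DataFrame(np.random.RandomState(0).rand(30, 3), columns=['a', 'b', 'c'])
demo_y = (demo_X['a'] > 0.5).astype(int)
print(getFitness([1, 1, 0], demo_X, demo_y))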
X,
self.excluded_columns)
for col in columns_to_dummify:
    # Convert both object and category to object, then back to category with a
    # fixed set of levels, to force unseen levels out of category type columns.
    # Low tech hack; there's probably a more elegant pandas way of doing this.
    temp_object = X[col].astype(object)
    # TODO nans are being inserted for categories that are unseen. Mode!
    X[col] = temp_object.astype(
        pd.CategoricalDtype(categories=self.categorical_levels[col]))
# Create dummy variables
X = pd.get_dummies(
    X,
    columns=columns_to_dummify,
    drop_first=False, prefix_sep='.')
return X
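# Standalone illustration of the trick above (toy data assumed): pinning the
# category levels seen at fit time makes get_dummies emit the same columns for
# a transform-time frame, with unseen levels becoming NaN instead of new columns.
fit_levels = pd.CategoricalDtype(categories=['a', 'b'])
new_col = pd.Series(['a', 'c']).astype(object).astype(fit_levels)  # 'c' -> NaN
print(pd.get_dummies(new_col))  # columns: a, b (no column for the unseen 'c')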
g.map_lower(sns.kdeplot)
g.map_upper(plt.scatter, s=2)
g.map_diag(sns.kdeplot, lw=3)
###############################################################
# **Prediction**: Can we predict test grades in maths from demographics
# (i.e., not from other grades)?
# A bit of feature engineering to get a numerical matrix (easily done
# with the ColumnTransformer in scikit-learn >= 0.20)
X = exams.drop(columns=['StudentID', 'Maths', 'Ravens', 'English'])
# Encode gender as a binary variable
X['Gender'] = X['Gender'] == 'Girl'
# One-hot encode social class
X = pd.get_dummies(X, drop_first=True)
y = exams['Maths']
from sklearn import ensemble
print(cross_val_score(ensemble.GradientBoostingRegressor(), X, y,
cv=10).mean())
###############################################################
# We can predict!
#
# But there is one caveat: are we simply learning to recognize students
# across the years? There is a lot of implicit information about students,
# notably in the school ID and the class ID.
#
# **Stratification** To test for this, we can make sure that we have
# different students in the train and the test set.
from sklearn import model_selection
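# Hedged sketch of that check (grouping on StudentID is assumed from the
# context above): a group-aware split keeps each student entirely in either
# the train or the test side, so the model cannot score by re-identifying
# students across years.
cv = model_selection.GroupShuffleSplit(n_splits=10, random_state=0)
print(cross_val_score(ensemble.GradientBoostingRegressor(), X, y,
                      cv=cv, groups=exams['StudentID']).mean())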
#offset = (k * accuracy_batch_size) % (test_y.shape[0] - accuracy_batch_size)
test_cnn_batch = cnn_test_x[start:(offset + start), :, :, :, :]
test_cnn_batch = test_cnn_batch.reshape(len(test_cnn_batch) * window_size, input_height, input_width, input_channel_num)
test_batch_y = test_y[start:(offset + start), :]
test_a, test_c, test_p, test_r = session.run([accuracy, cost, y_pred, y_posi],
feed_dict={cnn_in: test_cnn_batch,Y: test_batch_y, keep_prob: 1.0, phase_train: False})
test_t = test_batch_y
test_accuracy = np.append(test_accuracy, test_a)
test_loss = np.append(test_loss, test_c)
test_pred = np.append(test_pred, test_p)
test_true = np.vstack([test_true, test_t])
test_posi = np.vstack([test_posi, test_r])
# test_true = tf.argmax(test_true, 1)
test_pred_1_hot = np.asarray(pd.get_dummies(test_pred), dtype=np.int8)
test_true_list = tf.argmax(test_true, 1).eval()
print("(" + time.asctime(time.localtime(time.time())) + ") Final Test Cost: ", np.mean(test_loss),
"Final Test Accuracy: ", np.mean(test_accuracy))
# save result
# os.system("mkdir -p ./result/cnn_rnn_parallel/tune_rnn_layer/" + output_dir)
result = pd.DataFrame(
    {'epoch': range(1, epoch + 2), "train_accuracy": train_accuracy_save, "test_accuracy": test_accuracy_save,
     "train_loss": train_loss_save, "test_loss": test_loss_save})
ins = pd.DataFrame({'conv_1': conv_1_shape, 'conv_2': conv_2_shape, 'conv_3': conv_3_shape,
                    'cnn_fc': fc_size, 'accuracy': np.mean(test_accuracy),
                    'keep_prob': 1 - dropout_prob, "epoch": epoch + 1, "norm": norm_type,
                    "learning_rate": learning_rate, "regularization": regularization_method,
                    "train_sample": train_sample, "test_sample": test_sample, "batch_size": batch_size}, index=[0])
# summary = pd.DataFrame({'class': one_hot_labels, 'recall': test_recall, 'precision': test_precision,
misc_grp = "_others"
for cname in X_cat.columns:
x_cname = X_cat[cname].values
x_cname[pd.isnull(x_cname)] = na_grp
val_cnt = Counter(x_cname).most_common()
val_under_thr = [val for val, cnt in val_cnt
if (cnt+0.0)/n < cat_ratio_thr]
if len(val_under_thr) > 0:
x_cname[np.in1d(x_cname, val_under_thr)] = misc_grp
if len(val_cnt) > cat_max_k:
val_elig = [val for val, cnt in val_cnt[:(cat_max_k-1)]]
x_cname[~np.in1d(x_cname, val_elig)] = misc_grp
X_cat.loc[:,cname] = x_cname
# 1.2.
if len(X_cat.columns) > 0:
X_list.append(pd.get_dummies(X_cat))
col_num_exc = []
for cname in X_num.columns:
    mean = X_num[cname].mean()
    std = X_num[cname].std()
    # 2.1. drop near-constant columns (std below num_min_std or undefined)
    if std < num_min_std or np.isnan(std):
        col_num_exc.append(cname)
        continue
    # 2.2. standardize the column to zero mean and unit variance
    X_num.loc[:, cname] = (X_num[cname].values - mean) / std
# 2.3. append the surviving numeric columns
if len(col_num_exc) > 0:
    X_list.append(X_num.drop(col_num_exc, axis=1))
else:
    X_list.append(X_num)
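# Hedged completion (assumed from context, not in the original fragment): the
# per-type blocks gathered in X_list are typically joined column-wise into the
# final design matrix.
X_all = pd.concat(X_list, axis=1)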
# test_df["Month"] = test_df["Date"].apply(lambda x: dt.datetime.strptime(x, "%Y-%m-%d").month)
# Create "YearMonth" column
# training_df["YearMonth"] = training_df["Date"].apply(lambda x: str(dt.datetime.strptime(x, "%Y-%m-%d").year) + "-" + less_than_ten(str(dt.datetime.strptime(x, "%Y-%m-%d").month)))
# test_df["YearMonth"] = test_df["Date"].apply(lambda x: str(dt.datetime.strptime(x, "%Y-%m-%d").year) + "-" + less_than_ten(str(dt.datetime.strptime(x, "%Y-%m-%d").month)))
# "StateHoliday" has values "0" & 0
training_df["StateHoliday"].loc[training_df["StateHoliday"] == 0] = "0"
test_df["StateHoliday"].loc[test_df["StateHoliday"] == 0] = "0"
# Create "StateHolidayBinary" column
# training_df["StateHolidayBinary"] = training_df["StateHoliday"].map({0: 0, "0": 0, "a": 1, "b": 1, "c": 1})
# test_df["StateHolidayBinary"] = test_df["StateHoliday"].map({0: 0, "0": 0, "a": 1, "b": 1, "c": 1})
# One-hot encoding of "DayOfWeek" & "StateHoliday" columns
training_df = pd.get_dummies(training_df, columns=["DayOfWeek", "StateHoliday"])
test_df = pd.get_dummies(test_df, columns=["DayOfWeek", "StateHoliday"])
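# Hedged safeguard (not in the original): get_dummies on train and test
# separately can produce different column sets when a level appears in only
# one frame; add any missing dummy columns to test_df so the frames align.
dummy_cols = [c for c in training_df.columns
              if c.startswith("DayOfWeek_") or c.startswith("StateHoliday_")]
for c in dummy_cols:
    if c not in test_df.columns:
        test_df[c] = 0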
############################################
# store_df #
############################################
# Fill NaN values in store_df for "CompetitionDistance" = 0 (since no record exists where "CD" = NaN & "COS[Y/M]" = !NaN)
# store_df["CompetitionDistance"][is_nan(store_df["CompetitionDistance"])] = 0
# Fill NaN values in store_df for "CompetitionSince[X]" with 1900-01
# store_df["CompetitionOpenSinceYear"][(store_df["CompetitionDistance"] != 0) & (is_nan(store_df["CompetitionOpenSinceYear"]))] = 1900
# store_df["CompetitionOpenSinceMonth"][(store_df["CompetitionDistance"] != 0) & (is_nan(store_df["CompetitionOpenSinceMonth"]))] = 1
# One-hot encoding of "StoreType" & "Assortment" columns
# store_df = pd.get_dummies(store_df, columns=["StoreType", "Assortment"])
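# Hedged restatement of the commented-out fills above using non-chained
# indexing (column names and fill values come from the comments):
store_df["CompetitionDistance"] = store_df["CompetitionDistance"].fillna(0)
open_year_na = (store_df["CompetitionDistance"] != 0) & store_df["CompetitionOpenSinceYear"].isnull()
store_df.loc[open_year_na, "CompetitionOpenSinceYear"] = 1900
open_month_na = (store_df["CompetitionDistance"] != 0) & store_df["CompetitionOpenSinceMonth"].isnull()
store_df.loc[open_month_na, "CompetitionOpenSinceMonth"] = 1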
# Set up an empty data frame for data to be scaled
scale_df = pd.DataFrame()
ohe_df = None
hash_df = None
cv_df = None
tfidf_df = None
text_df = None
if self.ohe:
    # Get a subset of the data that requires one hot encoding
    ohe_df = X[self.ohe_meta.index.tolist()]
    # Apply one hot encoding to relevant columns
    ohe_df = pd.get_dummies(ohe_df, columns=ohe_df.columns)
    # Keep a copy of the OHE dataframe structure so we can align the transform dataset
    self.ohe_df_structure = pd.DataFrame().reindex_like(ohe_df)
# Scaling needs to be fit exclusively on the training data so as not to influence the results
if self.scale:
    # Get a subset of the data that requires scaling
    scale_df = X[self.scale_meta.index.tolist()]
if self.hash:
    # Get a subset of the data that requires feature hashing
    hash_df = X[self.hash_meta.index.tolist()]
    hash_cols = hash_df.columns
    # Hash unique values for each relevant column and then join to a dataframe for hashed data
    for c in hash_cols: