Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_TargetEncoder(generate_data):
    """Check that TargetEncoder emits one output column per input column.

    Three scenarios are exercised on the frame returned by the
    ``generate_data`` fixture: encoding without cross-validation, encoding
    with a ``KFold`` splitter through ``fit_transform()``, and encoding with
    the same splitter through separate ``fit()`` / ``transform()`` calls.

    Args:
        generate_data: callable returning the data frame under test; it must
            contain the ``TARGET_COL`` column.
    """
    frame = generate_data()

    # Only non-target columns with fewer than 100 distinct values are encoded.
    low_card_cols = [
        name for name in frame.columns
        if name != TARGET_COL and frame[name].nunique() < 100
    ]
    expected_width = len(low_card_cols)

    # Scenario 1: no cross-validation.
    encoder = TargetEncoder()
    encoded = encoder.fit_transform(frame[low_card_cols], frame[TARGET_COL])
    print('Without CV:\n{}'.format(encoded.head()))
    assert encoded.shape[1] == expected_width

    # Scenario 2: cross-validated encoding in a single fit_transform() call.
    splitter = KFold(n_splits=N_FOLD, shuffle=True, random_state=RANDOM_SEED)
    encoder = TargetEncoder(cv=splitter)
    encoded = encoder.fit_transform(frame[low_card_cols], frame[TARGET_COL])
    print('With CV (fit_transform()):\n{}'.format(encoded.head()))
    assert encoded.shape[1] == expected_width

    # Scenario 3: same splitter, but fit() and transform() called separately.
    encoder = TargetEncoder(cv=splitter)
    encoder.fit(frame[low_card_cols], frame[TARGET_COL])
    encoded = encoder.transform(frame[low_card_cols])
    print('With CV (fit() and transform() separately):\n{}'.format(encoded.head()))
    assert encoded.shape[1] == expected_width
cat_cols = [x for x in feature_cols if df[x].nunique() < 100]
te = TargetEncoder()
X_cat = te.fit_transform(df[cat_cols], df[TARGET_COL])
print('Without CV:\n{}'.format(X_cat.head()))
assert X_cat.shape[1] == len(cat_cols)
cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=RANDOM_SEED)
te = TargetEncoder(cv=cv)
X_cat = te.fit_transform(df[cat_cols], df[TARGET_COL])
print('With CV (fit_transform()):\n{}'.format(X_cat.head()))
assert X_cat.shape[1] == len(cat_cols)
te = TargetEncoder(cv=cv)
te.fit(df[cat_cols], df[TARGET_COL])
X_cat = te.transform(df[cat_cols])
print('With CV (fit() and transform() separately):\n{}'.format(X_cat.head()))
assert X_cat.shape[1] == len(cat_cols)
def test_TargetEncoder(generate_data):
    """Verify the output width of TargetEncoder with and without CV.

    The encoder is applied to every non-target column that has fewer than
    100 distinct values, and each scenario asserts that the encoded frame
    has exactly one column per encoded input column.

    Args:
        generate_data: callable returning the data frame under test; it must
            contain the ``TARGET_COL`` column.
    """
    df = generate_data()
    feature_cols = [x for x in df.columns if x != TARGET_COL]
    cat_cols = [x for x in feature_cols if df[x].nunique() < 100]

    # Plain target encoding, no cross-validation.
    te = TargetEncoder()
    X_cat = te.fit_transform(df[cat_cols], df[TARGET_COL])
    print('Without CV:\n{}'.format(X_cat.head()))
    assert X_cat.shape[1] == len(cat_cols)

    # Cross-validated encoding through fit_transform().
    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=RANDOM_SEED)
    te = TargetEncoder(cv=cv)
    X_cat = te.fit_transform(df[cat_cols], df[TARGET_COL])
    print('With CV (fit_transform()):\n{}'.format(X_cat.head()))
    assert X_cat.shape[1] == len(cat_cols)

    # Cross-validated encoder used through separate fit() / transform().
    te = TargetEncoder(cv=cv)
    te.fit(df[cat_cols], df[TARGET_COL])
    X_cat = te.transform(df[cat_cols])
    print('With CV (fit() and transform() separately):\n{}'.format(X_cat.head()))
    # This scenario was previously printed but never checked; assert it the
    # same way as the two scenarios above.
    assert X_cat.shape[1] == len(cat_cols)
def test_automl():
    """Check that AutoLGB and AutoXGB beat a uniform-random baseline on MAE.

    A synthetic regression problem is generated with ``make_regression``,
    split 80/20, and each auto-tuned model is tuned, fitted and scored on
    the held-out part. The baseline is uniform noise drawn over the range
    of the training target.
    """
    X, y = make_regression(n_samples=N_OBS,
                           n_features=N_FEATURE,
                           n_informative=N_IMP_FEATURE,
                           random_state=RANDOM_SEED)
    X = pd.DataFrame(X, columns=['x{}'.format(i) for i in range(X.shape[1])])
    y = pd.Series(y)
    # logging's first argument is the format string; passing the shapes as
    # (msg, arg) made the record fail to format whenever INFO was enabled.
    logging.info('X: %s, y: %s', X.shape, y.shape)

    X_trn, X_tst, y_trn, y_tst = train_test_split(X, y, test_size=.2, random_state=RANDOM_SEED)

    # Each model is built only when its turn comes, so construction, tuning
    # and random draws happen in the same order as a sequential test would.
    candidates = (
        ('LGB', AutoLGB, {'objective': 'regression', 'metric': 'l1'}),
        ('XGB', AutoXGB, {'objective': 'reg:linear', 'metric': 'rmse'}),
    )
    for label, model_cls, model_kwargs in candidates:
        model = model_cls(**model_kwargs)
        model.tune(X_trn, y_trn)
        model.fit(X_trn, y_trn)
        p = model.predict(X_tst)

        # Baseline: uniform noise scaled to [min(y_trn), max(y_trn)).
        r = (np.random.rand(X_tst.shape[0]) * (y_trn.max() - y_trn.min()) + y_trn.min())

        model_mae = mae(y_tst, p)
        logging.info('MAE (%s): %.4f', label, model_mae)
        assert model_mae < mae(y_tst, r)
random_state=RANDOM_SEED)
X = pd.DataFrame(X, columns=['x{}'.format(i) for i in range(X.shape[1])])
y = pd.Series(y)
logging.info(X.shape, y.shape)
X_trn, X_tst, y_trn, y_tst = train_test_split(X, y, test_size=.2, random_state=RANDOM_SEED)
model = AutoLGB(objective='regression', metric='l1')
model.tune(X_trn, y_trn)
model.fit(X_trn, y_trn)
p = model.predict(X_tst)
r = (np.random.rand(X_tst.shape[0]) * (y_trn.max() - y_trn.min()) + y_trn.min())
logging.info('MAE (LGB): {:.4f}'.format(mae(y_tst, p)))
assert mae(y_tst, p) < mae(y_tst, r)
model = AutoXGB(objective='reg:linear', metric='rmse')
model.tune(X_trn, y_trn)
model.fit(X_trn, y_trn)
p = model.predict(X_tst)
r = (np.random.rand(X_tst.shape[0]) * (y_trn.max() - y_trn.min()) + y_trn.min())
logging.info('MAE (XGB): {:.4f}'.format(mae(y_tst, p)))
assert mae(y_tst, p) < mae(y_tst, r)
X_emb = ee.fit_transform(X=df[feature_cols], y=df[TARGET_COL])
assert X_emb.shape[1] == sum(ee.n_emb)
print('Test with the binary classification target')
df[TARGET_COL] = (df[TARGET_COL] > df[TARGET_COL].mean()).astype(int)
ee = EmbeddingEncoder(cat_cols=cat_cols,
num_cols=num_cols,
random_state=RANDOM_SEED)
X_emb = ee.fit_transform(X=df[feature_cols], y=df[TARGET_COL])
assert X_emb.shape[1] == sum(ee.n_emb)
print('Test with the binary classification target with cross validation')
cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=RANDOM_SEED)
ee = EmbeddingEncoder(cat_cols=cat_cols,
num_cols=num_cols,
cv=cv,
random_state=RANDOM_SEED)
X_emb = ee.fit_transform(X=df[feature_cols], y=df[TARGET_COL])
assert X_emb.shape[1] == sum(ee.n_emb)
feature_cols = [x for x in df.columns if x != TARGET_COL]
cat_cols = [x for x in feature_cols if df[x].nunique() < 100]
num_cols = [x for x in feature_cols if x not in cat_cols]
print('Test with the regression target')
ee = EmbeddingEncoder(cat_cols=cat_cols,
num_cols=num_cols,
random_state=RANDOM_SEED)
X_emb = ee.fit_transform(X=df[feature_cols], y=df[TARGET_COL])
assert X_emb.shape[1] == sum(ee.n_emb)
print('Test with the binary classification target')
df[TARGET_COL] = (df[TARGET_COL] > df[TARGET_COL].mean()).astype(int)
ee = EmbeddingEncoder(cat_cols=cat_cols,
num_cols=num_cols,
random_state=RANDOM_SEED)
X_emb = ee.fit_transform(X=df[feature_cols], y=df[TARGET_COL])
assert X_emb.shape[1] == sum(ee.n_emb)
print('Test with the binary classification target with cross validation')
cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=RANDOM_SEED)
ee = EmbeddingEncoder(cat_cols=cat_cols,
num_cols=num_cols,
cv=cv,
random_state=RANDOM_SEED)
X_emb = ee.fit_transform(X=df[feature_cols], y=df[TARGET_COL])
assert X_emb.shape[1] == sum(ee.n_emb)
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
n_est=100, batch_size=1024, retrain=True):
model_name = os.path.splitext(os.path.splitext(os.path.basename(predict_test_file))[0])[0]
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
level=logging.DEBUG,
filename='{}.log'.format(model_name))
logging.info('Loading training and test data...')
X, y = load_data(train_file)
X_tst, _ = load_data(test_file)
dims = X.shape[1]
logging.info('{} dims'.format(dims))
logging.info('Loading CV Ids')
cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
p = np.zeros_like(y)
p_tst = np.zeros((X_tst.shape[0],))
for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
logging.info('Training model #{}'.format(i))
clf = nn_model(dims)
clf.fit_generator(generator=batch_generator(X[i_trn],
y[i_trn],
batch_size,
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
n_est=100, batch_size=1024, retrain=True):
model_name = os.path.splitext(os.path.splitext(os.path.basename(predict_test_file))[0])[0]
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
level=logging.DEBUG,
filename='{}.log'.format(model_name))
logging.info('Loading training and test data...')
X, y = load_data(train_file)
X_tst, _ = load_data(test_file)
dims = X.shape[1]
logging.info('{} dims'.format(dims))
logging.info('Loading CV Ids')
cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
p = np.zeros_like(y)
p_tst = np.zeros((X_tst.shape[0],))
for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
logging.info('Training model #{}'.format(i))
clf = nn_model(dims)
clf.fit_generator(generator=batch_generator(X[i_trn],
y[i_trn],
batch_size,
True),
algo=tpe.suggest, max_evals=n_eval, verbose=1,
rstate=self.random_state)
hyperparams = space_eval(self.space, best)
return hyperparams, trials
def fit(self, X, y):
    """Build a new ``XGBModel`` and fit it on the selected feature columns.

    The model is created with ``n_estimators=self.n_best`` plus the keyword
    arguments in ``self.params`` and stored on ``self.model`` before fitting.

    Args:
        X: feature container indexable by ``self.features``.
        y: target values passed through to the model's ``fit``.

    Returns:
        ``self``, so calls can be chained.
    """
    self.model = XGBModel(n_estimators=self.n_best, **self.params)

    # Train only on the columns listed in self.features.
    selected = X[self.features]
    self.model.fit(X=selected, y=y, eval_metric='mae', verbose=False)
    return self
def predict(self, X):
    """Return predictions from ``self.model`` for the selected feature columns.

    Args:
        X: feature container indexable by ``self.features``.

    Returns:
        Whatever ``self.model.predict`` returns for ``X[self.features]``.
    """
    model = self.model
    selected = X[self.features]
    return model.predict(selected)
class AutoLGB(BaseAutoML):
params = {
"bagging_freq": 1,
"verbosity": -1,
"seed": RANDOM_SEED,
"num_threads": -1,
}
space = {
"learning_rate": hp.loguniform("learning_rate", np.log(0.01), np.log(0.3)),
"num_leaves": hp.choice("num_leaves", [15, 31, 63, 127, 255]),
"max_depth": hp.choice("max_depth", [-1, 4, 6, 8, 10]),
"feature_fraction": hp.quniform("feature_fraction", .5, .9, 0.1),
"bagging_fraction": hp.quniform("bagging_fraction", .5, .9, 0.1),
"min_child_samples": hp.choice('min_child_samples', [10, 25, 100]),
"lambda_l1": hp.choice('lambda_l1', [0, .1, 1, 10]),