Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Strip spaces, tabs and periods from both ends of every cell. The `.str`
# accessor needs string-typed columns -- presumably df was read with
# dtype='str'; confirm against the loading code.
df = df.apply(lambda x: x.str.strip(' \t.'))
# Drop the 'url' and ' timedelta' columns (note the leading space in the
# second label).
df.drop(['url', ' timedelta'], axis=1, inplace=True)

# Describe every remaining column. Names are stripped in both branches so the
# metadata is consistent regardless of the column's type.
meta = []
for col_name in df.columns:
    clean_name = col_name.strip()
    if "is_" in col_name:
        # Columns whose name contains "is_" are recorded as two-valued
        # categoricals.
        meta.append({
            "name": clean_name,
            "type": CATEGORICAL,
            "size": 2,
            "i2s": ['0', '1']
        })
    else:
        # Convert once and reuse for both bounds.
        col_values = df[col_name].values.astype('float')
        meta.append({
            # The "shares" column is exported under the name "label".
            "name": "label" if clean_name == "shares" else clean_name,
            "type": CONTINUOUS,
            "min": np.min(col_values),
            "max": np.max(col_values)
        })

tdata = df.values.astype('float32')

# Fixed seed so the in-place row shuffle, and therefore the split, is
# reproducible.
np.random.seed(0)
np.random.shuffle(tdata)

# The last 8000 shuffled rows form the test set; everything before is train.
t_train = tdata[:-8000]
t_test = tdata[-8000:]

# Write the column metadata as JSON and the two splits as a single .npz.
name = "news"
with open("{}/{}.json".format(output_dir, name), 'w') as f:
    json.dump(meta, f, sort_keys=True, indent=4, separators=(',', ': '))
np.savez("{}/{}.npz".format(output_dir, name), train=t_train, test=t_test)
# The file has no header row. header=None is the supported way to say so;
# negative header values are rejected by current pandas.
df = pd.read_csv("data/raw/adult/adult.data", dtype='str', header=None)
# Trim spaces, tabs and periods from both ends of every cell.
df = df.apply(lambda x: x.str.strip(' \t.'))

# Column schema in file order: (name, type), with ORDINAL entries carrying a
# third element listing their values.
col_type = [
    ("age", CONTINUOUS),
    ("workclass", CATEGORICAL),
    ("fnlwgt", CONTINUOUS),
    ("education", ORDINAL, [
        "Preschool", "1st-4th", "5th-6th", "7th-8th", "9th", "10th", "11th",
        "12th", "HS-grad", "Prof-school", "Assoc-voc", "Assoc-acdm",
        "Some-college", "Bachelors", "Masters", "Doctorate"
    ]),
    ("education-num", CONTINUOUS),
    ("marital-status", CATEGORICAL),
    ("occupation", CATEGORICAL),
    ("relationship", CATEGORICAL),
    ("race", CATEGORICAL),
    ("sex", CATEGORICAL),
    ("capital-gain", CONTINUOUS),
    ("capital-loss", CONTINUOUS),
    ("hours-per-week", CONTINUOUS),
    ("native-country", CATEGORICAL),
    ("label", CATEGORICAL)
]

meta = []
# Build one metadata entry per schema column; id_ doubles as the positional
# column index into df.
for id_, info in enumerate(col_type):
if info[1] == CONTINUOUS:
# Continuous columns record the min and max of their values cast to float.
meta.append({
"name": info[0],
"type": info[1],
"min": np.min(df.iloc[:, id_].values.astype('float')),
"max": np.max(df.iloc[:, id_].values.astype('float'))
})
else:
if info[1] == CATEGORICAL:
# Drop the column labelled 19. Counting the entries below from 0, that slot
# is num_outbound_cmds, which the commented-out entry records as all zero
# (assumes integer column labels from a header-less read -- TODO confirm).
df.drop([19], axis=1, inplace=True)
# Schema for the remaining columns: (name, type) pairs, with ORDINAL entries
# also carrying a list of their values.
col_type = [
("duration", CONTINUOUS),
("protocol_type", CATEGORICAL),
("service", CATEGORICAL),
("flag", CATEGORICAL),
("src_bytes", CONTINUOUS),
("dst_bytes", CONTINUOUS),
("land", CATEGORICAL),
("wrong_fragment", ORDINAL, ['0', '1', '2', '3']),
("urgent", ORDINAL, ['0', '1', '2', '3']),
("hot", CONTINUOUS),
("num_failed_logins", ORDINAL, ['0', '1', '2', '3', '4', '5']),
("logged_in", CATEGORICAL),
("num_compromised", CONTINUOUS),
("root_shell", CATEGORICAL),
("su_attempted", ORDINAL, ['0', '1', '2', '3']),
("num_root", CONTINUOUS),
("num_file_creations", CONTINUOUS),
("num_shells", ORDINAL, ['0', '1', '2']),
("num_access_files", ORDINAL, ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']),
# ("num_outbound_cmds", CONTINUOUS), # all zero, removed
("is_host_login", CATEGORICAL),
("is_guest_login", CATEGORICAL),
("count", CONTINUOUS),
("srv_count", CONTINUOUS),
("serror_rate", CONTINUOUS),
("srv_serror_rate", CONTINUOUS),
("rerror_rate", CONTINUOUS),
("srv_rerror_rate", CONTINUOUS),
("same_srv_rate", CONTINUOUS),
pass
# Create temp_dir if it is missing. An existing directory is fine; any other
# failure (e.g. permissions) now surfaces here instead of being swallowed by
# a bare except.
os.makedirs(temp_dir, exist_ok=True)
# Load the credit card table and discard its 'Time' column.
df = pd.read_csv("data/raw/creditcard.csv")
df.drop(columns=['Time'], inplace=True)
values = df.values

# The first 28 columns are continuous features, each recorded with its
# observed min and max.
# NOTE(review): the generated names run V0..V27; if the CSV's own headers are
# V1..V28 these labels are off by one -- confirm against the file.
meta = [
    {
        "name": "V%d" % idx,
        "type": CONTINUOUS,
        "min": np.min(values[:, idx]),
        "max": np.max(values[:, idx])
    }
    for idx in range(28)
]

# Column 28 is described as the continuous "Amount" feature.
amount = values[:, 28]
meta.append({
    "name": "Amount",
    "type": CONTINUOUS,
    "min": np.min(amount),
    "max": np.max(amount)
})

# The final entry describes the label as a two-valued categorical.
meta.append({
    "name": "label",
    "type": CATEGORICAL,
    "size": 2,
    "i2s": ["0", "1"]
})
# Create the output and temp directories if they are missing. Existing
# directories are fine; any other failure (e.g. permissions) now surfaces here
# instead of being swallowed by a bare except.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(temp_dir, exist_ok=True)
# The file has no header row. header=None is the supported way to say so;
# negative header values are rejected by current pandas.
df = pd.read_csv("data/raw/covtype/covtype.data", dtype='str', header=None)

# Column schema in file order: ten continuous measurements, then 4
# Wilderness_Area and 40 Soil_Type categorical columns, then the label.
col_type = [
    ("Elevation", CONTINUOUS),
    ("Aspect", CONTINUOUS),
    ("Slope", CONTINUOUS),
    ("Horizontal_Distance_To_Hydrology", CONTINUOUS),
    ("Vertical_Distance_To_Hydrology", CONTINUOUS),
    ("Horizontal_Distance_To_Roadways", CONTINUOUS),
    ("Hillshade_9am", CONTINUOUS),
    ("Hillshade_Noon", CONTINUOUS),
    ("Hillshade_3pm", CONTINUOUS),
    ("Horizontal_Distance_To_Fire_Points", CONTINUOUS)
] + [
    ("Wilderness_Area_{}".format(i), CATEGORICAL) for i in range(4)
] + [
    ("Soil_Type{}".format(i), CATEGORICAL) for i in range(40)
] + [
    ("label", CATEGORICAL)
]

meta = []
("detailed industry recode", CATEGORICAL),
("detailed occupation recode", CATEGORICAL),
("education", CATEGORICAL),
("wage per hour", CONTINUOUS),
("enroll in edu inst last wk", CATEGORICAL),
("marital stat", CATEGORICAL),
("major industry code", CATEGORICAL),
("major occupation code", CATEGORICAL),
("race", CATEGORICAL),
("hispanic origin", CATEGORICAL),
("sex", CATEGORICAL),
("member of a labor union", CATEGORICAL),
("reason for unemployment", CATEGORICAL),
("full or part time employment stat", CATEGORICAL),
("capital gains", CONTINUOUS),
("capital losses", CONTINUOUS),
("dividends from stocks", CONTINUOUS),
("tax filer stat", CATEGORICAL),
("region of previous residence", CATEGORICAL),
("state of previous residence", CATEGORICAL),
("detailed household and family stat", CATEGORICAL),
("detailed household summary in household", CATEGORICAL),
("migration code-change in msa", CATEGORICAL),
("migration code-change in reg", CATEGORICAL),
("migration code-move within reg", CATEGORICAL),
("live in this house 1 year ago", CATEGORICAL),
("migration prev res in sunbelt", CATEGORICAL),
("num persons worked for employer", CONTINUOUS),
("family members under 18", CATEGORICAL),
("country of birth father", CATEGORICAL),
("country of birth mother", CATEGORICAL),
("country of birth self", CATEGORICAL),