How to use the sdgym.utils.data.utils.CONTINUOUS constant in sdgym

To help you get started, we’ve selected a few sdgym examples based on popular ways CONTINUOUS is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

GitHub: DAI-Lab / SDGym / sdgym / utils / data / real / news.py (View on GitHub, external link)
df = df.apply(lambda x: x.str.strip(' \t.'))
    df.drop(['url', ' timedelta'], axis=1, inplace=True)

    meta = []
    for col_name in df.columns:
        if "is_" in col_name:
            meta.append({
                "name": col_name,
                "type": CATEGORICAL,
                "size": 2,
                "i2s": ['0', '1']
            })
        else:
            meta.append({
                "name": "label" if col_name.strip() == "shares" else col_name.strip(),
                "type": CONTINUOUS,
                "min": np.min(df[col_name].values.astype('float')),
                "max": np.max(df[col_name].values.astype('float'))
            })

    tdata = df.values.astype('float32')

    np.random.seed(0)
    np.random.shuffle(tdata)

    t_train = tdata[:-8000]
    t_test = tdata[-8000:]

    name = "news"
    with open("{}/{}.json".format(output_dir, name), 'w') as f:
        json.dump(meta, f, sort_keys=True, indent=4, separators=(',', ': '))
    np.savez("{}/{}.npz".format(output_dir, name), train=t_train, test=t_test)
GitHub: DAI-Lab / SDGym / sdgym / utils / data / real / adult.py (View on GitHub, external link)
df = pd.read_csv("data/raw/adult/adult.data", dtype='str', header=-1)
    df = df.apply(lambda x: x.str.strip(' \t.'))

    col_type = [
        ("age", CONTINUOUS),
        ("workclass", CATEGORICAL),
        ("fnlwgt", CONTINUOUS),
        ("education", ORDINAL, ["Preschool", "1st-4th", "5th-6th", "7th-8th", "9th", "10th", "11th", "12th", "HS-grad", "Prof-school", "Assoc-voc", "Assoc-acdm", "Some-college", "Bachelors", "Masters", "Doctorate"]),
        ("education-num", CONTINUOUS),
        ("marital-status", CATEGORICAL),
        ("occupation", CATEGORICAL),
        ("relationship", CATEGORICAL),
        ("race", CATEGORICAL),
        ("sex", CATEGORICAL),
        ("capital-gain", CONTINUOUS),
        ("capital-loss", CONTINUOUS),
        ("hours-per-week", CONTINUOUS),
        ("native-country", CATEGORICAL),
        ("label", CATEGORICAL)
    ]

    meta = []
    for id_, info in enumerate(col_type):
        if info[1] == CONTINUOUS:
            meta.append({
                "name": info[0],
                "type": info[1],
                "min": np.min(df.iloc[:, id_].values.astype('float')),
                "max": np.max(df.iloc[:, id_].values.astype('float'))
            })
        else:
            if info[1] == CATEGORICAL:
GitHub: DAI-Lab / SDGym / sdgym / utils / data / real / intrusion.py (View on GitHub, external link)
df.drop([19], axis=1, inplace=True)

    col_type = [
        ("duration", CONTINUOUS),
        ("protocol_type", CATEGORICAL),
        ("service", CATEGORICAL),
        ("flag", CATEGORICAL),
        ("src_bytes", CONTINUOUS),
        ("dst_bytes", CONTINUOUS),
        ("land", CATEGORICAL),
        ("wrong_fragment", ORDINAL, ['0', '1', '2', '3']),
        ("urgent", ORDINAL, ['0', '1', '2', '3']),
        ("hot", CONTINUOUS),
        ("num_failed_logins", ORDINAL, ['0', '1', '2', '3', '4', '5']),
        ("logged_in", CATEGORICAL),
        ("num_compromised", CONTINUOUS),
        ("root_shell", CATEGORICAL),
        ("su_attempted", ORDINAL, ['0', '1', '2', '3']),
        ("num_root", CONTINUOUS),
        ("num_file_creations", CONTINUOUS),
        ("num_shells", ORDINAL, ['0', '1', '2']),
        ("num_access_files", ORDINAL, ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']),
        # ("num_outbound_cmds", CONTINUOUS), # all zero, removed
        ("is_host_login", CATEGORICAL),
        ("is_guest_login", CATEGORICAL),
        ("count", CONTINUOUS),
        ("srv_count", CONTINUOUS),
        ("serror_rate", CONTINUOUS),
        ("srv_serror_rate", CONTINUOUS),
        ("rerror_rate", CONTINUOUS),
        ("srv_rerror_rate", CONTINUOUS),
        ("same_srv_rate", CONTINUOUS),
GitHub: DAI-Lab / SDGym / sdgym / utils / data / real / credit.py (View on GitHub, external link)
pass

    try:
        os.mkdir(temp_dir)
    except:
        pass

    df = pd.read_csv("data/raw/creditcard.csv")
    df.drop(columns=['Time'], inplace=True)
    values = df.values

    meta = []
    for i in range(28):
        meta.append({
            "name": "V%d" % i,
            "type": CONTINUOUS,
            "min": np.min(values[:, i]),
            "max": np.max(values[:, i])
        })
    meta.append({
        "name": "Amount",
        "type": CONTINUOUS,
        "min": np.min(values[:, 28]),
        "max": np.max(values[:, 28])
    })
    meta.append({
        "name": "label",
        "type": CATEGORICAL,
        "size": 2,
        "i2s": ["0", "1"]
    })
GitHub: DAI-Lab / SDGym / sdgym / utils / data / real / covtype.py (View on GitHub, external link)
try:
        os.mkdir(output_dir)
    except:
        pass

    try:
        os.mkdir(temp_dir)
    except:
        pass

    df = pd.read_csv("data/raw/covtype/covtype.data", dtype='str', header=-1)

    col_type = [
        ("Elevation", CONTINUOUS),
        ("Aspect", CONTINUOUS),
        ("Slope", CONTINUOUS),
        ("Horizontal_Distance_To_Hydrology", CONTINUOUS),
        ("Vertical_Distance_To_Hydrology", CONTINUOUS),
        ("Horizontal_Distance_To_Roadways", CONTINUOUS),
        ("Hillshade_9am", CONTINUOUS),
        ("Hillshade_Noon", CONTINUOUS),
        ("Hillshade_3pm", CONTINUOUS),
        ("Horizontal_Distance_To_Fire_Points", CONTINUOUS)
    ] + [
        ("Wilderness_Area_{}".format(i), CATEGORICAL) for i in range(4)
    ] + [
        ("Soil_Type{}".format(i), CATEGORICAL) for i in range(40)
    ] + [
        ("label", CATEGORICAL)
    ]

    meta = []
GitHub: DAI-Lab / SDGym / sdgym / utils / data / real / census.py (View on GitHub, external link)
("detailed industry recode", CATEGORICAL),
        ("detailed occupation recode", CATEGORICAL),
        ("education", CATEGORICAL),
        ("wage per hour", CONTINUOUS),
        ("enroll in edu inst last wk", CATEGORICAL),
        ("marital stat", CATEGORICAL),
        ("major industry code", CATEGORICAL),
        ("major occupation code", CATEGORICAL),
        ("race", CATEGORICAL),
        ("hispanic origin", CATEGORICAL),
        ("sex", CATEGORICAL),
        ("member of a labor union", CATEGORICAL),
        ("reason for unemployment", CATEGORICAL),
        ("full or part time employment stat", CATEGORICAL),
        ("capital gains", CONTINUOUS),
        ("capital losses", CONTINUOUS),
        ("dividends from stocks", CONTINUOUS),
        ("tax filer stat", CATEGORICAL),
        ("region of previous residence", CATEGORICAL),
        ("state of previous residence", CATEGORICAL),
        ("detailed household and family stat", CATEGORICAL),
        ("detailed household summary in household", CATEGORICAL),
        ("migration code-change in msa", CATEGORICAL),
        ("migration code-change in reg", CATEGORICAL),
        ("migration code-move within reg", CATEGORICAL),
        ("live in this house 1 year ago", CATEGORICAL),
        ("migration prev res in sunbelt", CATEGORICAL),
        ("num persons worked for employer", CONTINUOUS),
        ("family members under 18", CATEGORICAL),
        ("country of birth father", CATEGORICAL),
        ("country of birth mother", CATEGORICAL),
        ("country of birth self", CATEGORICAL),