How to use the matminer.datasets.dataset_retrieval.load_dataset function in matminer

To help you get started, we’ve selected a few matminer examples, based on popular ways load_dataset is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github hackingmaterials / automatminer / automatminer / automl / adaptors.py View on Github external
not_in_model))
        else:
            X = df[self._features].values  # rectify feature order
            y_pred = self._backend.predict(X)
            df[target + " predicted"] = y_pred
            self._logger.debug("Prediction finished successfully.")
            return df


# Demo entry point: fits the featurizer, cleaner, reducer and a TPOT learner
# on a small slice of the elastic_tensor_2015 dataset.
if __name__ == "__main__":
    from matminer.datasets.dataset_retrieval import load_dataset
    from automatminer.featurization import AutoFeaturizer
    from automatminer.preprocessing import DataCleaner, FeatureReducer

    # Load a dataset, rename "formula" to "composition", and keep only the
    # input column ("composition") and the target column ("K_VRH").
    df = load_dataset("elastic_tensor_2015").rename(
        columns={"formula": "composition"})[["composition", "K_VRH"]]
    # Positional slices: rows 501-549 are held out, rows 0-99 are used to fit.
    testdf = df.iloc[501:550]
    traindf = df.iloc[:100]
    target = "K_VRH"

    # Get top-level transformers
    autofeater = AutoFeaturizer()
    cleaner = DataCleaner()
    reducer = FeatureReducer()
    learner = TPOTAdaptor("regression", max_time_mins=5)

    # Fit transformers on training data; each stage consumes the dataframe
    # produced by the previous one, so the order matters.
    traindf = autofeater.fit_transform(traindf, target)
    traindf = cleaner.fit_transform(traindf, target)
    traindf = reducer.fit_transform(traindf, target)
    learner.fit(traindf, target)
github hackingmaterials / automatminer / dev_scripts / evaluation / benchmarker.py View on Github external
"""
This file will eventually hold a function that tests a mslearn
pipeline on a set of datasets for predictive power.
"""

from matminer.datasets.dataset_retrieval import load_dataset, get_available_datasets
# from matminer.datasets.convenience_loaders import

# Script entry point: loads four matminer datasets by name. Nothing further
# is done with the loaded objects in the visible portion of this script.
if __name__ == "__main__":
    df_piezo = load_dataset("piezoelectric_tensor")
    df_exgap = load_dataset("expt_gap")
    df_elastic = load_dataset("elastic_tensor_2015")
    df_glass = load_dataset("glass_binary")
github hackingmaterials / automatminer / automatminer_dev / matbench / glass.py View on Github external
"""

from matminer.datasets.dataset_retrieval import load_dataset
from matminer.utils.io import store_dataframe_as_json
from matminer.featurizers.conversions import StrToComposition
from tqdm import tqdm

import pandas as pd

# pd.set_option('display.height', 1000)
# Raise pandas' display limits (rows, columns, line width) for printed output.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)


df = load_dataset("glass_ternary_landolt")

# Rename "formula" to "composition" and keep only the composition and the
# "gfa" column.
df = df.rename(columns={"formula": "composition"})
df = df[["composition", "gfa"]]

# Featurize the "composition" column; the result is read back from the
# temporary "composition_obj" column below.
# NOTE(review): presumably these are pymatgen Composition objects - confirm.
df = StrToComposition(target_col_id="composition_obj").featurize_dataframe(
    df, "composition"
)
# Overwrite each composition string with the reduced_formula of its object,
# then discard the temporary column.
df["composition"] = [c.reduced_formula for c in df["composition_obj"]]
df = df.drop(columns=["composition_obj"])

# print("Ground truth")
# print(df[df["composition"]=="ZrTi9"])  # should be False in final dataframe also!!
# print(df[df["composition"]=="ZrVCo8"]) # should be True in final dataframe also!
# print(df["gfa"].value_counts())    # proportion is about 5000 GFA 2054 no GFA
# raise ValueError
github hackingmaterials / automatminer / dev_scripts / evaluation / benchmarker.py View on Github external
"""
This file will eventually hold a function that tests a mslearn
pipeline on a set of datasets for predictive power.
"""

from matminer.datasets.dataset_retrieval import load_dataset, get_available_datasets
# from matminer.datasets.convenience_loaders import

# Script entry point: loads four matminer datasets by name. Nothing further
# is done with the loaded objects in the visible portion of this script.
if __name__ == "__main__":
    df_piezo = load_dataset("piezoelectric_tensor")
    df_exgap = load_dataset("expt_gap")
    df_elastic = load_dataset("elastic_tensor_2015")
    df_glass = load_dataset("glass_binary")
github hackingmaterials / automatminer / dev_scripts / evaluation / benchmarker.py View on Github external
"""
This file will eventually hold a function that tests a mslearn
pipeline on a set of datasets for predictive power.
"""

from matminer.datasets.dataset_retrieval import load_dataset, get_available_datasets
# from matminer.datasets.convenience_loaders import

# Script entry point: loads four matminer datasets by name. Nothing further
# is done with the loaded objects in the visible portion of this script.
if __name__ == "__main__":
    df_piezo = load_dataset("piezoelectric_tensor")
    df_exgap = load_dataset("expt_gap")
    df_elastic = load_dataset("elastic_tensor_2015")
    df_glass = load_dataset("glass_binary")
github hackingmaterials / automatminer / automatminer / examples / mse_example.py View on Github external
def test_mse_example(self):
        """Benchmark the default pipeline on elastic_tensor_2015 and check the MSE.

        Loads the dataset, renames "formula" to "composition", keeps the
        "composition", "structure" and "K_VRH" columns, and runs
        ``MatPipe.benchmark`` with ``test_spec=0.2``. The test passes when the
        returned dataframe is non-empty and the mean squared error between
        "K_VRH" and "K_VRH predicted" lies strictly between 0 and 500.
        """
        df = load_dataset("elastic_tensor_2015")
        default_config = get_preset_config("default")
        pipe = MatPipe(**default_config)
        df = df.rename(columns={"formula": "composition"})[["composition", "structure", "K_VRH"]]
        predicted = pipe.benchmark(df, "K_VRH", test_spec=0.2)
        self.assertFalse(predicted.empty)

        y_true = predicted["K_VRH"]
        y_pred = predicted["K_VRH predicted"]
        mse = mean_squared_error(y_true, y_pred)
        print("MSE: " + str(mse))
        # Dedicated comparison assertions report both operands on failure,
        # unlike assertTrue(<comparison>), which only reports "False is not true".
        self.assertLess(mse, 500)
        self.assertGreater(mse, 0)
github hackingmaterials / automatminer / automatminer_dev / matbench / expt_gap.py View on Github external
"""
from matminer.datasets.dataset_retrieval import load_dataset
from matminer.utils.io import store_dataframe_as_json
from matminer.featurizers.conversions import StrToComposition
from tqdm import tqdm
import numpy as np


import pandas as pd

# Raise pandas' display limits (rows, columns, line width) for printed output.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
# Spell out the full option key: the bare "precision" shorthand is not
# accepted by recent pandas releases, while "display.precision" works on
# old and new versions alike.
pd.set_option("display.precision", 8)

# Load the "expt_gap" dataset and rename its "formula" column to "composition".
df = load_dataset("expt_gap")
df = df.rename(columns={"formula": "composition"})


# Disabled manual spot-check of two known entries (kept for reference).
# print("Ground Truth")
# print(df[df["composition"] == "ZrW2"])  # should be 0.00
# print(df[df["composition"] == "ZrSe2"]) # should be 2.00
# raise ValueError


# Starts empty; it is not filled in the visible portion of this script.
excluded_compositions = []


# Prevent differences in order of formula symbols from corrupting the actual number of unique compositions
# NOTE(review): the featurized result is presumably written to the
# "composition_obj" column (target_col_id) - confirm.
df = StrToComposition(target_col_id="composition_obj").featurize_dataframe(
    df, "composition"
)
github hackingmaterials / automatminer / automatminer_dev / matbench / castelli.py View on Github external
"""

from matminer.datasets.dataset_retrieval import load_dataset
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval


import pandas as pd

# pd.set_option('display.height', 1000)
# Raise pandas' display limits (rows, columns, line width) for printed output.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

# NOTE(review): mpdr is not used in the visible portion of this script -
# confirm whether it is still needed.
mpdr = MPDataRetrieval()

# Load the "castelli_perovskites" dataset, keep only the "structure" and
# "e_form" columns, and replace the index with a fresh one (drop=True discards
# the old index instead of keeping it as a column).
df = load_dataset("castelli_perovskites")
df = df[["structure", "e_form"]]
df = df.reset_index(drop=True)

# Show the result and write it to "castelli.pickle.gz".
print(df)
df.to_pickle("castelli.pickle.gz")
github materialsproject / MPContribs / mpcontribs-portal / notebooks / ml.materialsproject.cloud / matbench_upload.py View on Github external
col.replace("_", "|")
            .replace("-", "|")
            .replace(" ", "||")
            .replace("(", " ")
            .replace(")", "")
        )
        colmap[col] = k
    return colmap


if __name__ == "__main__":

    # Just trying it out with a single dataset, Dielectric from MP...
    for config in [DIELECTRIC]:
        project = config["data_file"].replace(".json.gz", "")
        df = load_dataset(project)
        pinput = "structure" if "structure" in df.columns else "composition"
        column_map_pretty = pretty_column_map(df.columns.tolist())
        df = df.rename(columns=column_map_pretty)
        target = column_map_pretty[config["target"]]

        # print(pinput)
        # raise ValueError

        # print(df)
        # raise ValueError

        # clean up
        has_more = True
        while has_more:
            resp = client.contributions.delete_entries(
                project=project, _limit=250
github hackingmaterials / automatminer / dev_scripts / evaluation / benchmark.py View on Github external
raise ValueError("{} is an unknown learner name!"
                             "".format(self["learner_name"]))

        # Set up the pipeline and data
        pipe_config_dict = fw_spec["pipe_config"]
        pipe_config = {"learner": learner(**pipe_config_dict["learner_kwargs"]),
                       "reducer": FeatureReducer(
                           **pipe_config_dict["reducer_kwargs"]),
                       "cleaner": DataCleaner(
                           **pipe_config_dict["cleaner_kwargs"]),
                       "autofeaturizer_kwargs":
                           AutoFeaturizer(
                               **pipe_config_dict["autofeaturizer_kwargs"])}
        pipe = MatPipe(**pipe_config)
        dataset = fw_spec["dataset"]
        df = load_dataset(dataset)
        df = df.rename(columns=REWRITE_COLS[dataset])[RELEVANT_COLS[dataset]]
        target = TARGETS[dataset]

        # Run the benchmark
        t1 = time.time()
        predicted_test_df = pipe.benchmark(df, target, test_spec=0.2)
        elapsed_time = time.time() - t1

        # Save everything
        savedir = fw_spec["save_dir"]
        pipe.save(os.path.join(savedir, "pipe.p"))
        pipe.digest(os.path.join(savedir, "digest.txt"))
        predicted_test_df.to_csv(os.path.join(savedir, "test_df.csv"))
        pipe.post_fit_df.to_csv(os.path.join(savedir, "fitted_df.csv"))

        # Evaluate model