How to use the openml.datasets module in openml

To help you get started, we’ve selected a few openml.datasets examples based on popular ways it is used in public projects.

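For orientation, here is a minimal sketch of the module's most common entry points (dataset 68 is the test-server example used in the snippets below; a recent openml-python where list_datasets accepts output_format='dataframe' is assumed):

import openml

# Browse the datasets available on the server as a pandas DataFrame.
datalist = openml.datasets.list_datasets(output_format='dataframe')

# Fetch a single dataset by ID and extract its data.
dataset = openml.datasets.get_dataset(68)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute
)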

github openml / openml-python / tests / test_datasets / test_dataset_functions.py
# Excerpt from a test case: df is a pandas DataFrame built earlier in the
# test, and self._get_sentinel() produces a unique marker for the test suite.
df['play'] = df['play'].astype('category')

# meta-information
name = '%s-pandas_testing_dataset' % self._get_sentinel()
description = 'Synthetic dataset created from a Pandas DataFrame'
creator = 'OpenML tester'
collection_date = '01-01-2018'
language = 'English'
licence = 'MIT'
default_target_attribute = 'play'
citation = 'None'
original_data_url = 'http://openml.github.io/openml-python'
paper_url = 'http://openml.github.io/openml-python'

# pass a list to ignore_attribute
ignore_attribute = ['outlook', 'windy']
dataset = openml.datasets.functions.create_dataset(
    name=name,
    description=description,
    creator=creator,
    contributor=None,
    collection_date=collection_date,
    language=language,
    licence=licence,
    default_target_attribute=default_target_attribute,
    row_id_attribute=None,
    ignore_attribute=ignore_attribute,
    citation=citation,
    attributes='auto',
    data=df,
    version_label='test',
    original_data_url=original_data_url,
    paper_url=paper_url,
)
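
# The full test goes on to publish the created dataset; a minimal sketch of
# that step (publishing requires an API key, and the tests target the test
# server; publish() fills in dataset.dataset_id):
dataset.publish()
print('Dataset uploaded with ID %d' % dataset.dataset_id)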
github openml / openml-python / develop / _downloads / 9e0617073c8209f15abf91f273871776 / flows_and_runs_tutorial.py
import openml
from sklearn import compose, neighbors, preprocessing

openml.config.start_using_configuration_for_example()

# NOTE: We are using dataset 68 from the test server: https://test.openml.org/d/68
dataset = openml.datasets.get_dataset(68)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format='array',
    target=dataset.default_target_attribute
)
clf = neighbors.KNeighborsClassifier(n_neighbors=1)
clf.fit(X, y)

############################################################################
# You can also ask for meta-data to automatically preprocess the data.
#
# * e.g. categorical features -> do feature encoding
dataset = openml.datasets.get_dataset(17)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format='array',
    target=dataset.default_target_attribute
)
print(f"Categorical features: {categorical_indicator}")
transformer = compose.ColumnTransformer(
    [('one_hot_encoder', preprocessing.OneHotEncoder(categories='auto'), categorical_indicator)])
X = transformer.fit_transform(X)
clf.fit(X, y)

############################################################################
# Runs: Easily explore models
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^
# We can run (many) scikit-learn algorithms on (many) OpenML tasks.

# Get a task
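# The snippet is cut off here; a minimal sketch of how it typically
# continues (task_id is a placeholder, not a value from the original):
task = openml.tasks.get_task(task_id)
run = openml.runs.run_model_on_task(clf, task)
print(run)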
github openml / openml-python / master / _downloads / cb73f5fc41065b327cef9d079472032b / simple_flows_and_runs_tutorial.py
import openml
from sklearn import ensemble, neighbors

############################################################################
# Train a machine learning model
# ==============================
#
# .. warning:: This example uploads data. For that reason, this example
#   connects to the test server at test.openml.org. This prevents the main
#   server from crowding with example datasets, tasks, runs, and so on.

openml.config.start_using_configuration_for_example()

# NOTE: We are using dataset 20 from the test server: https://test.openml.org/d/20
dataset = openml.datasets.get_dataset(20)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format='array',
    target=dataset.default_target_attribute
)
clf = neighbors.KNeighborsClassifier(n_neighbors=3)
clf.fit(X, y)

############################################################################
# Running a model on a task
# =========================

task = openml.tasks.get_task(119)
clf = ensemble.RandomForestClassifier()
run = openml.runs.run_model_on_task(clf, task)
print(run)
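
# Because the example is configured for the test server, the run can also be
# published without crowding the main server; a short sketch (requires an
# API key):
myrun = run.publish()
print('Run published: https://test.openml.org/r/%d' % myrun.run_id)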
github alan-turing-institute / mlaut / download_openml.py
    # Regression datasets have NumberOfClasses == -1; classification datasets
    # specify the number of classes.
    if all_datasets[id]['NumberOfClasses'] == -1:
        print(f"Skipping dataset {id}, {all_datasets[id]['name']}. This is a regression dataset.")
        continue
    if all_datasets[id]['NumberOfMissingValues'] > 0:
        print(f"Skipping dataset {id}, {all_datasets[id]['name']} due to missing values.")
        continue
    if all_datasets[id]['NumberOfInstances'] > NUMBER_OF_INSTANCES_CUTOFF_NUMBER:
        print(f"Skipping dataset {id}, {all_datasets[id]['name']}. It has more than {NUMBER_OF_INSTANCES_CUTOFF_NUMBER} instances.")
        continue

    print(f"Trying to download dataset {id}, {all_datasets[id]['name']}")

    try:
        dataset = openml.datasets.get_dataset(id)
        X, names = dataset.get_data(return_attribute_names=True)

        metadata = {
            'class_name': dataset.__dict__['default_target_attribute'],
            'source': 'OpenML',
            'dataset_name': dataset.__dict__['name'],
            'dataset_id': id
        }
        class_name_index = names.index(metadata['class_name'])

        # Normalize the data
        # scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)
        # scaler.fit(X)
        # x_transformed  = scaler.transform(X)
        # x_transformed[:,class_name_index] = X[:, class_name_index]
github automl / auto-sklearn / scripts / update_metadata_util.py
import numpy as np

import openml


def load_task(task_id):
    task = openml.tasks.get_task(task_id)
    X, y = task.get_X_and_y()
    train_indices, test_indices = task.get_train_test_split_indices()
    X_train = X[train_indices]
    y_train = y[train_indices]
    X_test = X[test_indices]
    y_test = y[test_indices]
    dataset = openml.datasets.get_dataset(task.dataset_id)
    _, _, cat, _ = dataset.get_data(target=task.target_name)
    del _
    del dataset
    cat = ['categorical' if c else 'numerical' for c in cat]

    unique = np.unique(y_train)
    mapping = {unique_value: i for i, unique_value in enumerate(unique)}
    y_train = np.array([mapping[value] for value in y_train])
    y_test = np.array([mapping[value] for value in y_test])

    return X_train, y_train, X_test, y_test, cat
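

# A hypothetical call (task_id is a placeholder for a real OpenML task ID):
X_train, y_train, X_test, y_test, cat = load_task(task_id)
print('%d train / %d test instances' % (X_train.shape[0], X_test.shape[0]))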
github openml / openml-python / master / _downloads / 911f16d4db6b665d864c4483331b062a / introduction_tutorial.py
#
# * Add the line **cachedir = 'MYDIR'** to the config file, replacing
#   'MYDIR' with the path to the cache directory. By default, OpenML
#   will use **~/.openml/cache** as the cache directory.
# * Run the code below, replacing 'YOURDIR' with the path to the cache directory.

# Uncomment and set your OpenML cache directory
# import os
# openml.config.cache_directory = os.path.expanduser('YOURDIR')

############################################################################
# Simple Example
# ^^^^^^^^^^^^^^
# Download the OpenML task for the eeg-eye-state dataset.
task = openml.tasks.get_task(403)
data = openml.datasets.get_dataset(task.dataset_id)
clf = neighbors.KNeighborsClassifier(n_neighbors=5)
run = openml.runs.run_model_on_task(clf, task, avoid_duplicate_runs=False)
# Publish the experiment on OpenML (optional, requires an API key).
# For this tutorial, our configuration publishes to the test server
# so as not to crowd the main server with runs created by examples.
myrun = run.publish()
print("kNN on %s: http://test.openml.org/r/%d" % (data.name, myrun.run_id))

############################################################################
openml.config.stop_using_configuration_for_example()
github openml / openml-python / openml / utils.py
def _get_rest_api_type_alias(oml_object: 'OpenMLBase') -> str:
    """ Return the alias of the openml entity as it is defined for the REST API. """
    rest_api_mapping = [
        (openml.datasets.OpenMLDataset, 'data'),
        (openml.flows.OpenMLFlow, 'flow'),
        (openml.tasks.OpenMLTask, 'task'),
        (openml.runs.OpenMLRun, 'run'),
        ((openml.study.OpenMLStudy, openml.study.OpenMLBenchmarkSuite), 'study')
    ]  # type: List[Tuple[Union[Type, Tuple], str]]
    _, api_type_alias = [(python_type, api_alias)
                         for (python_type, api_alias) in rest_api_mapping
                         if isinstance(oml_object, python_type)][0]
    return api_type_alias
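

# Usage sketch (dataset 68 is the test-server example used elsewhere on this
# page); for an OpenMLDataset the helper resolves to the alias 'data':
dataset = openml.datasets.get_dataset(68)
print(_get_rest_api_type_alias(dataset))  # prints 'data'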
github openml / openml-python / openml / tasks / task.py
def get_dataset(self) -> datasets.OpenMLDataset:
    """Download the dataset associated with this task."""
    return datasets.get_dataset(self.dataset_id)
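
# Usage sketch (task 403, the eeg-eye-state task used elsewhere on this page):
task = openml.tasks.get_task(403)
dataset = task.get_dataset()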
github openml / openml-python / circle_drop / _downloads / introduction_tutorial.py
# ^^^^^^^
# When downloading datasets, tasks, runs, and flows, they are cached locally
# so that later retrievals do not need to call the server. As with the API
# key, the cache directory can be set either through the config file or
# through the API:
#
# * Add the line **cachedir = 'MYDIR'** to the config file, replacing 'MYDIR' with the path to the cache directory. By default, OpenML will use **~/.openml/cache** as the cache directory.
# * Run the code below, replacing 'YOURDIR' with the path to the cache directory.

import os
# Uncomment and set your OpenML cache directory
# openml.config.cache_directory = os.path.expanduser('YOURDIR')

############################################################################
# Simple Example
# ^^^^^^^^^^^^^^
# Download the OpenML task for the eeg-eye-state dataset.
task = openml.tasks.get_task(403)
data = openml.datasets.get_dataset(task.dataset_id)
clf = neighbors.KNeighborsClassifier(n_neighbors=5)
flow = openml.flows.sklearn_to_flow(clf)
run = openml.runs.run_flow_on_task(flow, task, avoid_duplicate_runs=False)
# Publish the experiment on OpenML (optional, requires an API key).
# For this tutorial, our configuration publishes to the test server
# so as not to pollute the main server.
myrun = run.publish()
print("kNN on %s: http://test.openml.org/r/%d" % (data.name, myrun.run_id))
github openml / openml-python / examples / datasets_tutorial.py
# * Find datasets with more than 10000 examples.
# * Find a dataset called 'eeg-eye-state'.
# * Find all datasets with more than 50 classes.
datalist[datalist.NumberOfInstances > 10000].sort_values(
    ['NumberOfInstances']).head(n=20)
############################################################################
datalist.query('name == "eeg-eye-state"')
############################################################################
datalist.query('NumberOfClasses > 50')

############################################################################
# Download datasets
# =================

# This is done based on the dataset ID ('did').
dataset = openml.datasets.get_dataset(68)
# NOTE: Dataset 68 exists on the test server https://test.openml.org/d/68

# Print a summary
print("This is dataset '%s', the target feature is '%s'" %
      (dataset.name, dataset.default_target_attribute))
print("URL: %s" % dataset.url)
print(dataset.description[:500])

############################################################################
# Get the actual data.
#
# The data can be returned in several representations: a NumPy array, a SciPy
# sparse matrix, or a pandas DataFrame (or SparseDataFrame). The format is
# controlled with the parameter ``dataset_format``, which can be either 'array'
# (default) or 'dataframe'; with 'array', dense data comes back as a NumPy
# array and sparse data as a SciPy sparse matrix. Let's first build our
# dataset from a NumPy array and manually create a dataframe.
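
# A sketch of that step (not from the original file; the target column name
# 'class' is an assumption made here for illustration):
import pandas as pd

X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format='array',
    target=dataset.default_target_attribute
)
eeg = pd.DataFrame(X, columns=attribute_names)
eeg['class'] = y
print(eeg.head())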