How to use the openml.datasets.get_dataset function in openml

To help you get started, we’ve selected a few openml examples based on popular ways it is used in public projects.

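For orientation, here is a minimal sketch of the call itself before the project examples below. The dataset id 61 (iris) is only an illustration, and the four-value return of get_data assumes a recent openml-python release; older releases returned fewer values, as some of the snippets further down show.

import openml

# Fetch a dataset by id (a name such as 'iris' also works).
dataset = openml.datasets.get_dataset(61)

# In recent openml-python versions, get_data returns
# (X, y, categorical_indicator, attribute_names).
X, y, categorical_indicator, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute
)
print(dataset.name, X.shape)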

github openml / openml-python / tests / test_datasets / test_dataset_functions.py
def test_get_online_dataset_format(self):

        # Phoneme dataset
        dataset_id = 77
        dataset = openml.datasets.get_dataset(dataset_id, download_data=False)

        self.assertEqual(
            dataset.format.lower(),
            _get_online_dataset_format(dataset_id),
            "The format of the ARFF files is different"
        )
github openml / openml-python / tests / test_datasets / test_dataset_functions.py
def test_get_dataset_lazy(self):
        dataset = openml.datasets.get_dataset(1, download_data=False)
        self.assertEqual(type(dataset), OpenMLDataset)
        self.assertEqual(dataset.name, 'anneal')
        self._datasets_retrieved_successfully([1], metadata_only=True)

        self.assertGreater(len(dataset.features), 1)
        self.assertGreater(len(dataset.qualities), 4)

        dataset.get_data()
        self._datasets_retrieved_successfully([1], metadata_only=False)

        # Issue324 Properly handle private datasets when trying to access them
        openml.config.server = self.production_server
        self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45, False)
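Outside of tests, the same lazy pattern keeps the data on the server until it is actually needed; a minimal sketch, assuming the download_data keyword and the four-value get_data return of recent openml-python releases:

import openml

# Only the metadata is fetched here; the data file is not downloaded yet.
dataset = openml.datasets.get_dataset(1, download_data=False)
print(dataset.name, len(dataset.features))

# Accessing the data triggers the actual download.
X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)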
github openml / openml-python / tests / test_datasets / test_dataset_functions.py
def test_get_dataset_by_name(self):
        dataset = openml.datasets.get_dataset('anneal')
        self.assertEqual(type(dataset), OpenMLDataset)
        self.assertEqual(dataset.dataset_id, 1)
        self._datasets_retrieved_successfully([1], metadata_only=False)

        self.assertGreater(len(dataset.features), 1)
        self.assertGreater(len(dataset.qualities), 4)

        # Issue324 Properly handle private datasets when trying to access them
        openml.config.server = self.production_server
        self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45)
github openml / openml-python / tests / test_datasets / test_dataset.py
def setUp(self):
        super(OpenMLDatasetTest, self).setUp()
        openml.config.server = self.production_server

        # Load dataset id 2 - dataset 2 is interesting because it contains
        # missing values, categorical features etc.
        self.dataset = openml.datasets.get_dataset(2, download_data=False)
        # titanic has missing values, categorical features, and string features
        self.titanic = openml.datasets.get_dataset(40945, download_data=False)
        # these datasets have some boolean features
        self.pc4 = openml.datasets.get_dataset(1049, download_data=False)
        self.jm1 = openml.datasets.get_dataset(1053, download_data=False)
        self.iris = openml.datasets.get_dataset(61, download_data=False)
github openml / openml-python / examples / flows_and_runs_tutorial.py
# Train machine learning models
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# Train a scikit-learn model on the data manually.

dataset = openml.datasets.get_dataset(68)
X, y = dataset.get_data(
    target=dataset.default_target_attribute
)
clf = neighbors.KNeighborsClassifier(n_neighbors=1)
clf.fit(X, y)

############################################################################
# You can also ask for meta-data to automatically preprocess the data.
#
# * e.g. categorical features -> do feature encoding
dataset = openml.datasets.get_dataset(17)
X, y, categorical = dataset.get_data(
    target=dataset.default_target_attribute,
    return_categorical_indicator=True,
)
print("Categorical features: %s" % categorical)
enc = preprocessing.OneHotEncoder(categorical_features=categorical)
X = enc.fit_transform(X)
clf.fit(X, y)
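
# Note: the categorical_features argument used above was deprecated and
# later removed from scikit-learn's OneHotEncoder. The same encoding step
# can be written with ColumnTransformer instead (a sketch, assuming a
# modern scikit-learn and the boolean `categorical` indicator from above):
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
    [("onehot", preprocessing.OneHotEncoder(),
      [i for i, c in enumerate(categorical) if c])],
    remainder="passthrough",
)
X_enc = ct.fit_transform(X)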

############################################################################
# Runs: Easily explore models
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^
# We can run (many) scikit-learn algorithms on (many) OpenML tasks.

# Get a task
task = openml.tasks.get_task(403)
github openml / openml-python / circle_drop / _downloads / flows_and_runs_tutorial.py
# Challenge
# ^^^^^^^^^
#
# Try to build the best possible models on several OpenML tasks,
# compare your results with the rest of the class and learn from
# them. Some tasks you could try (or browse openml.org):
#
# * EEG eye state: data_id 1471, task_id 14951
# * Volcanoes on Venus: data_id 1527, task_id 10103
# * Walking activity: data_id 1509, task_id 9945, 150k instances.
# * Covertype (Satellite): data_id 150, task_id 218, 500k instances.
# * Higgs (Physics): data_id 23512, task_id 52950, 100k instances, missing values.

# Easy benchmarking:
for task_id in [115, ]:  # Add further tasks. Disclaimer: they might take some time
    task = openml.tasks.get_task(task_id)
    data = openml.datasets.get_dataset(task.dataset_id)
    clf = neighbors.KNeighborsClassifier(n_neighbors=5)
    flow = openml.flows.sklearn_to_flow(clf)

    run = openml.runs.run_flow_on_task(flow, task, avoid_duplicate_runs=False)
    myrun = run.publish()
    print("kNN on %s: http://test.openml.org/r/%d" % (data.name, myrun.run_id))
github udellgroup / oboe / automl / convex_opt_c.py
"""
    Get the numbers of data points and features of the datasets.
    
    Args:
        default_error_matrix (pandas.core.frame.DataFrame): The default error matrix DataFrame.
    Returns:
        dataset_sizes (np.ndarray): The dataset sizes; each row is [dataset_index, number_of_data_points, number_of_features].
        
    """
    openml_datasets = openml.datasets.list_datasets()
    openml_datasets = pd.DataFrame.from_dict(openml_datasets, orient='index')
    dataset_sizes = openml_datasets[['NumberOfInstances', 'NumberOfFeatures']]
    dataset_sizes = np.concatenate((np.array([dataset_sizes.index]).T, dataset_sizes.values), axis=1)
    indices = default_error_matrix.index.tolist()
    for i in set(indices).difference(set(dataset_sizes[:, 0])):
        dataset = openml.datasets.get_dataset(i)
        data_numeric, data_labels, categorical = dataset.get_data(
            target=dataset.default_target_attribute,
            return_categorical_indicator=True,
        )
        dataset_sizes = np.concatenate((dataset_sizes, np.array([[i, data_numeric.shape[0], data_numeric.shape[1]]])))
    return dataset_sizes
github automl / auto-sklearn / scripts / 2015_nips_paper / run / run_without_metalearning.py
def load_task(task_id):
    """Function used for loading data."""
    task = openml.tasks.get_task(task_id)
    X, y = task.get_X_and_y()
    train_indices, test_indices = task.get_train_test_split_indices()
    X_train = X[train_indices]
    y_train = y[train_indices]
    X_test = X[test_indices]
    y_test = y[test_indices]
    dataset = openml.datasets.get_dataset(task.dataset_id)
    _, _, cat = dataset.get_data(return_categorical_indicator=True,
                                 target=task.target_name)
    del _
    del dataset
    cat = ['categorical' if c else 'numerical' for c in cat]

    unique = np.unique(y_train)
    mapping = {unique_value: i for i, unique_value in enumerate(unique)}
    y_train = np.array([mapping[value] for value in y_train])
    y_test = np.array([mapping[value] for value in y_test])

    return X_train, y_train, X_test, y_test, cat
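
A hypothetical caller of load_task, reusing task id 115 from the benchmarking loop above:

X_train, y_train, X_test, y_test, cat = load_task(115)
print(X_train.shape, X_test.shape, cat[:5])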