How to use openml - 10 common examples

To help you get started, we've selected a few openml examples based on popular ways it is used in public projects. Each snippet below is taken verbatim from a public test suite, so some excerpts start or stop mid-function.

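Most of the snippets below publish flows, runs, or datasets, which requires an OpenML API key. As a minimal configuration sketch (the test-server URL is an assumption; copy your own key from your OpenML profile page):

import openml

# Point the client at the OpenML test server so that experiments do not
# pollute the production server (URL assumed; adjust if it has changed).
openml.config.server = 'https://test.openml.org/api/v1/xml'

# Your personal API key, shown on your OpenML profile page.
openml.config.apikey = 'YOUR_API_KEY'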

github openml / openml-python / tests / test_extensions / test_sklearn_extension / test_sklearn_extension.py View on Github
def test_openml_param_name_to_sklearn(self):
        scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
        boosting = sklearn.ensemble.AdaBoostClassifier(
            base_estimator=sklearn.tree.DecisionTreeClassifier())
        model = sklearn.pipeline.Pipeline(steps=[
            ('scaler', scaler), ('boosting', boosting)])
        flow = self.extension.model_to_flow(model)
        task = openml.tasks.get_task(115)
        run = openml.runs.run_flow_on_task(flow, task)
        run = run.publish()
        TestBase._mark_entity_for_removal('run', run.run_id)
        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], run.run_id))
        run = openml.runs.get_run(run.run_id)
        setup = openml.setups.get_setup(run.setup_id)

        # make sure to test enough parameters
        self.assertGreater(len(setup.parameters), 15)

        for parameter in setup.parameters.values():
            sklearn_name = self.extension._openml_param_name_to_sklearn(parameter, flow)

            # test the inverse. Currently, OpenML stores the hyperparameter
            # fullName as flow.name + flow.version + parameter.name on the
            # server (but this behaviour is not documented and might or might
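The excerpt above exercises the internal mapping between OpenML parameter names and scikit-learn parameter names. For ordinary use, the public setup API is enough to inspect which hyperparameter values a run used; a rough sketch (the run id is hypothetical, and the OpenMLParameter attribute names are assumed from this test):

import openml

run = openml.runs.get_run(12345)               # hypothetical run id
setup = openml.setups.get_setup(run.setup_id)  # stored hyperparameter settings

for parameter in setup.parameters.values():
    # parameter_name / value are assumed attribute names of OpenMLParameter
    print(parameter.parameter_name, '=', parameter.value)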
github automl / auto-sklearn / test / test_pipeline / implementations / test_OneHotEncoder.py View on Github
def test_classification_workflow(self):
        task = openml.tasks.get_task(254)
        X, y = task.get_X_and_y()

        ohe = OneHotEncoder(categorical_features=[True]*22)
        tree = sklearn.tree.DecisionTreeClassifier(random_state=1)
        pipeline = sklearn.pipeline.Pipeline((('ohe', ohe), ('tree', tree)))

        # train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
        # the old sklearn.cross_validation module has been removed.
        X_train, X_test, y_train, y_test = \
            sklearn.model_selection.train_test_split(X, y, random_state=3,
                                                      train_size=0.5,
                                                      test_size=0.5)
        pipeline.fit(X_train, y_train)
        self.assertEqual(np.mean(y_train == pipeline.predict(X_train)), 1)
        # With an incorrect copy operation the OneHotEncoder would rearrange
        # the data in such a way that the accuracy would drop to 66%
        self.assertEqual(np.mean(y_test == pipeline.predict(X_test)), 1)
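The test above draws its own random split with scikit-learn. If you want results that are comparable across users, OpenML tasks also ship with server-defined splits; a small sketch of retrieving them (task id reused from the test, availability assumed):

import openml

task = openml.tasks.get_task(254)
X, y = task.get_X_and_y()

# Every OpenML task carries an estimation procedure (e.g. 10-fold CV);
# these indices reproduce the exact splits the server evaluates against.
train_idx, test_idx = task.get_train_test_split_indices(repeat=0, fold=0)
print(X[train_idx].shape, X[test_idx].shape)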
github openml / openml-python / tests / test_utils / test_conditionalimputer.py View on Github
def test_impute_with_constant(self):
        task_ids = [2]

        for task_id in task_ids:
            task = openml.tasks.get_task(task_id)
            dataset = task.get_dataset()
            X, _ = dataset.get_data(target=task.target_name)
            nominal_indices = dataset.get_features_by_type('nominal', exclude=[task.target_name])
            fill_empty = -1
            clf = ConditionalImputer(strategy="median",
                                     strategy_nominal="most_frequent",
                                     categorical_features=None,
                                     verbose=True,
                                     fill_empty=fill_empty)

            self._do_test(dataset, X, nominal_indices, clf, fill_empty=fill_empty)
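Recent openml-python releases return the data together with a categorical-feature mask, which simplifies the nominal-feature bookkeeping done above; a sketch (the four-tuple return shape follows current docs, and the dataset id is only for illustration):

import openml

dataset = openml.datasets.get_dataset(31)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute)

# Names of the nominal (categorical) columns.
nominal_columns = [name for name, is_cat in
                   zip(attribute_names, categorical_indicator) if is_cat]
print(nominal_columns)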
github openml / openml-python / tests / test_flows / test_flow.py View on Github
flow.publish()
        # Not collecting flow_id for deletion since this is a test for failed upload

        self.assertEqual(api_call_mock.call_count, 1)
        self.assertEqual(get_flow_mock.call_count, 1)
        self.assertEqual(flow_exists_mock.call_count, 1)

        flow_copy = copy.deepcopy(flow)
        flow_copy.name = flow_copy.name[:-1]
        get_flow_mock.return_value = flow_copy
        flow_exists_mock.return_value = 1

        with self.assertRaises(ValueError) as context_manager:
            flow.publish()
            TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
            TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
                                                                flow.flow_id))

        fixture = (
            "The flow on the server is inconsistent with the local flow. "
            "The server flow ID is 1. Please check manually and remove "
            "the flow if necessary! Error is:\n"
            "'Flow sklearn.ensemble.forest.RandomForestClassifier: "
            "values for attribute 'name' differ: "
            "'sklearn.ensemble.forest.RandomForestClassifier'"
            "\nvs\n'sklearn.ensemble.forest.RandomForestClassifie'.'"
        )

        self.assertEqual(context_manager.exception.args[0], fixture)
        self.assertEqual(get_flow_mock.call_count, 2)
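The consistency error above is raised when the server already holds a flow with the same name but different contents. You can query for an existing flow yourself before publishing; a sketch (the classifier is just a stand-in):

import openml
import sklearn.tree
from openml.extensions.sklearn import SklearnExtension

flow = SklearnExtension().model_to_flow(sklearn.tree.DecisionTreeClassifier())

flow_id = openml.flows.flow_exists(flow.name, flow.external_version)
if flow_id:
    flow = openml.flows.get_flow(flow_id)  # reuse the server-side flow
else:
    flow.publish()                         # upload a new flow (needs an API key)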
github openml / openml-python / tests / test_setups / test_setup_functions.py View on Github
def _existing_setup_exists(self, classif):

        flow = self.extension.model_to_flow(classif)
        flow.name = 'TEST%s%s' % (get_sentinel(), flow.name)
        flow.publish()
        TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], flow.flow_id))

        # although the flow exists, we can be sure there are no
        # setups (yet) as it hasn't been run
        setup_id = openml.setups.setup_exists(flow)
        self.assertFalse(setup_id)
        setup_id = openml.setups.setup_exists(flow)
        self.assertFalse(setup_id)

        # now run the flow on an easy task:
        task = openml.tasks.get_task(115)  # diabetes
        run = openml.runs.run_flow_on_task(flow, task)
        # spoof flow id, otherwise the sentinel is ignored
        run.flow_id = flow.flow_id
        run.publish()
        TestBase._mark_entity_for_removal('run', run.run_id)
github openml / openml-python / tests / test_setups / test_setup_functions.py View on Github
TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], flow.flow_id))

        # although the flow exists, we can be sure there are no
        # setups (yet) as it hasn't been run
        setup_id = openml.setups.setup_exists(flow)
        self.assertFalse(setup_id)
        setup_id = openml.setups.setup_exists(flow)
        self.assertFalse(setup_id)

        # now run the flow on an easy task:
        task = openml.tasks.get_task(115)  # diabetes
        run = openml.runs.run_flow_on_task(flow, task)
        # spoof flow id, otherwise the sentinel is ignored
        run.flow_id = flow.flow_id
        run.publish()
        TestBase._mark_entity_for_removal('run', run.run_id)
        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], run.run_id))
        # download the run, as it contains the right setup id
        run = openml.runs.get_run(run.run_id)

        # execute the function we are interested in
        setup_id = openml.setups.setup_exists(flow)
        self.assertEqual(setup_id, run.setup_id)
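setup_exists answers a yes/no question for one specific hyperparameter configuration. To see how many configurations of a flow the server already knows, listing setups is the complementary call; a sketch (the flow id is hypothetical and the keyword names are taken from current docs, so double-check them on your version):

import openml

flow_id = 5891  # hypothetical flow id
setups = openml.setups.list_setups(flow=flow_id, size=10)
print(len(setups), 'setups already stored for this flow')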
github openml / openml-python / tests / test_flows / test_flow.py View on Github
def test_publish_existing_flow(self, flow_exists_mock):
        clf = sklearn.tree.DecisionTreeClassifier(max_depth=2)
        flow = self.extension.model_to_flow(clf)
        flow_exists_mock.return_value = 1

        with self.assertRaises(openml.exceptions.PyOpenMLError) as context_manager:
            flow.publish(raise_error_if_exists=True)
            TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
            TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
                                                                flow.flow_id))

        self.assertTrue('OpenMLFlow already exists' in context_manager.exception.message)
github openml / openml-python / tests / test_study / test_study_examples.py View on Github
('estimator', sklearn.tree.DecisionTreeClassifier())
            ]
        )  # build a sklearn classifier
        for task_id in benchmark_suite.tasks[:1]:  # iterate over the suite's tasks (only the first, for brevity)
            task = openml.tasks.get_task(task_id)  # download the OpenML task
            X, y = task.get_X_and_y()  # get the data (not used in this example)
            openml.config.apikey = openml.config.apikey  # placeholder: normally you would set your own OpenML API key here
            run = openml.runs.run_model_on_task(
                clf, task, avoid_duplicate_runs=False
            )  # run classifier on splits (requires API key)
            score = run.get_metric_fn(
                sklearn.metrics.accuracy_score
            )  # compute the per-fold accuracy scores
            print('Data set: %s; Accuracy: %0.2f' % (task.get_dataset().name, score.mean()))
            run.publish()  # publish the experiment on OpenML (optional)
            TestBase._mark_entity_for_removal('run', run.run_id)
            TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
                                                                run.run_id))
            print('URL for run: %s/run/%d' % (openml.config.server, run.run_id))
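The excerpt above starts mid-pipeline; the missing piece is fetching the benchmark suite whose tasks it iterates over. A sketch using the suite API (the 'OpenML-CC18' alias is an assumption about which suite you want):

import openml

suite = openml.study.get_suite('OpenML-CC18')  # a curated benchmark suite
print(len(suite.tasks), 'tasks in the suite')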
github openml / openml-python / tests / test_flows / test_flow.py View on Github
self.assertEqual(
            flow.upload_date,
            flow.components['lr'].upload_date,
            msg=(
                flow.name,
                flow.flow_id,
                flow.components['lr'].name, flow.components['lr'].flow_id,
            ),
        )

        clf1 = sklearn.tree.DecisionTreeClassifier(max_depth=2)
        flow1 = self.extension.model_to_flow(clf1)
        flow1, sentinel = self._add_sentinel_to_flow_name(flow1, None)
        flow1.publish()
        TestBase._mark_entity_for_removal('flow', (flow1.flow_id, flow1.name))
        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
                                                            flow1.flow_id))

        # In order to assign different upload times to the flows!
        time.sleep(1)

        clf2 = sklearn.ensemble.VotingClassifier(
            [('dt', sklearn.tree.DecisionTreeClassifier(max_depth=2))])
        flow2 = self.extension.model_to_flow(clf2)
        flow2, _ = self._add_sentinel_to_flow_name(flow2, sentinel)
        flow2.publish()
        TestBase._mark_entity_for_removal('flow', (flow2.flow_id, flow2.name))
        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
                                                            flow2.flow_id))
        # If one component was published before the other, the components in
        # the flow should have different upload dates
        self.assertNotEqual(flow2.upload_date,
github openml / openml-python / tests / test_datasets / test_dataset_functions.py View on Github
language='English',
            licence='MIT',
            default_target_attribute='col_{}'.format(data.shape[1] - 1),
            row_id_attribute=None,
            ignore_attribute=None,
            citation='None',
            attributes=attributes,
            data=data,
            version_label='test',
            original_data_url='http://openml.github.io/openml-python',
            paper_url='http://openml.github.io/openml-python'
        )

        dataset.publish()
        TestBase._mark_entity_for_removal('data', dataset.id)
        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
                                                            dataset.id))

        self.assertEqual(
            _get_online_dataset_arff(dataset.id),
            dataset._dataset,
            "Uploaded arff does not match original one"
        )
        self.assertEqual(
            _get_online_dataset_format(dataset.id),
            'arff',
            "Wrong format for dataset"
        )
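For completeness, the upload in the last example begins with openml.datasets.create_dataset, whose opening arguments are cut off above. A condensed sketch with a pandas DataFrame, where attributes='auto' lets the client infer the column types (keyword names follow recent docs, so treat them as an assumption):

import openml
import pandas as pd

df = pd.DataFrame({'x1': [0.1, 0.2, 0.3],
                   'x2': [1.0, 2.0, 3.0],
                   'col_2': [0, 1, 0]})

dataset = openml.datasets.create_dataset(
    name='minimal-example',
    description='Toy dataset illustrating create_dataset.',
    creator=None, contributor=None, collection_date=None,
    language='English', licence='MIT',
    default_target_attribute='col_2',
    row_id_attribute=None, ignore_attribute=None, citation=None,
    attributes='auto',   # infer ARFF attributes from the DataFrame dtypes
    data=df,
    version_label='test',
    original_data_url=None, paper_url=None,
)
dataset.publish()        # requires an API key
print(dataset.id)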