def test_openml_param_name_to_sklearn(self):
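        # Build a scaler + AdaBoost pipeline, run it on an OpenML task, and check
        # that every stored hyperparameter name maps back to its scikit-learn name.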
scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
boosting = sklearn.ensemble.AdaBoostClassifier(
base_estimator=sklearn.tree.DecisionTreeClassifier())
model = sklearn.pipeline.Pipeline(steps=[
('scaler', scaler), ('boosting', boosting)])
flow = self.extension.model_to_flow(model)
task = openml.tasks.get_task(115)
run = openml.runs.run_flow_on_task(flow, task)
run = run.publish()
TestBase._mark_entity_for_removal('run', run.run_id)
TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], run.run_id))
run = openml.runs.get_run(run.run_id)
setup = openml.setups.get_setup(run.setup_id)
# make sure to test enough parameters
self.assertGreater(len(setup.parameters), 15)
for parameter in setup.parameters.values():
sklearn_name = self.extension._openml_param_name_to_sklearn(parameter, flow)
            # test the inverse. Currently, OpenML stores the hyperparameter
            # fullName as flow.name + flow.version + parameter.name on the
            # server (but this behaviour is not documented and might or might
            # not change in the future)
def test_classification_workflow(self):
task = openml.tasks.get_task(254)
X, y = task.get_X_and_y()
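        # NOTE: OneHotEncoder(categorical_features=...) and sklearn.cross_validation
        # come from older scikit-learn releases; newer versions use ColumnTransformer
        # and sklearn.model_selection instead.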
ohe = OneHotEncoder(categorical_features=[True]*22)
tree = sklearn.tree.DecisionTreeClassifier(random_state=1)
pipeline = sklearn.pipeline.Pipeline((('ohe', ohe), ('tree', tree)))
X_train, X_test, y_train, y_test = \
sklearn.cross_validation.train_test_split(X, y, random_state=3,
train_size=0.5,
test_size=0.5)
pipeline.fit(X_train, y_train)
self.assertEqual(np.mean(y_train == pipeline.predict(X_train)), 1)
# With an incorrect copy operation the OneHotEncoder would rearrange
# the data in such a way that the accuracy would drop to 66%
self.assertEqual(np.mean(y_test == pipeline.predict(X_test)), 1)
def test_impute_with_constant(self):
task_ids = [2]
for task_id in task_ids:
task = openml.tasks.get_task(task_id)
dataset = task.get_dataset()
X, _ = dataset.get_data(target=task.target_name)
nominal_indices = dataset.get_features_by_type('nominal', exclude=[task.target_name])
fill_empty = -1
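            # impute numeric features with the median, nominal features with the
            # most frequent value, and mark empty values with -1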
clf = ConditionalImputer(strategy="median",
strategy_nominal="most_frequent",
categorical_features=None,
verbose=True,
fill_empty=fill_empty)
self._do_test(dataset, X, nominal_indices, clf, fill_empty=fill_empty)
flow.publish()
# Not collecting flow_id for deletion since this is a test for failed upload
self.assertEqual(api_call_mock.call_count, 1)
self.assertEqual(get_flow_mock.call_count, 1)
self.assertEqual(flow_exists_mock.call_count, 1)
flow_copy = copy.deepcopy(flow)
flow_copy.name = flow_copy.name[:-1]
get_flow_mock.return_value = flow_copy
flow_exists_mock.return_value = 1
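        # the server now reports flow id 1 whose name differs from the local copy
        # by one character, so publishing should raise the error checked below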
with self.assertRaises(ValueError) as context_manager:
flow.publish()
TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
flow.flow_id))
fixture = (
"The flow on the server is inconsistent with the local flow. "
"The server flow ID is 1. Please check manually and remove "
"the flow if necessary! Error is:\n"
"'Flow sklearn.ensemble.forest.RandomForestClassifier: "
"values for attribute 'name' differ: "
"'sklearn.ensemble.forest.RandomForestClassifier'"
"\nvs\n'sklearn.ensemble.forest.RandomForestClassifie'.'"
)
self.assertEqual(context_manager.exception.args[0], fixture)
self.assertEqual(get_flow_mock.call_count, 2)
def _existing_setup_exists(self, classif):
flow = self.extension.model_to_flow(classif)
flow.name = 'TEST%s%s' % (get_sentinel(), flow.name)
flow.publish()
TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], flow.flow_id))
# although the flow exists, we can be sure there are no
# setups (yet) as it hasn't been ran
setup_id = openml.setups.setup_exists(flow)
self.assertFalse(setup_id)
setup_id = openml.setups.setup_exists(flow)
self.assertFalse(setup_id)
# now run the flow on an easy task:
task = openml.tasks.get_task(115) # diabetes
run = openml.runs.run_flow_on_task(flow, task)
# spoof flow id, otherwise the sentinel is ignored
run.flow_id = flow.flow_id
run.publish()
TestBase._mark_entity_for_removal('run', run.run_id)
TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], run.run_id))
# download the run, as it contains the right setup id
run = openml.runs.get_run(run.run_id)
# execute the function we are interested in
setup_id = openml.setups.setup_exists(flow)
self.assertEqual(setup_id, run.setup_id)
def test_publish_existing_flow(self, flow_exists_mock):
clf = sklearn.tree.DecisionTreeClassifier(max_depth=2)
flow = self.extension.model_to_flow(clf)
flow_exists_mock.return_value = 1
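        # pretend the flow already exists on the server, so publishing with
        # raise_error_if_exists=True has to fail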
with self.assertRaises(openml.exceptions.PyOpenMLError) as context_manager:
flow.publish(raise_error_if_exists=True)
TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
flow.flow_id))
self.assertTrue('OpenMLFlow already exists' in context_manager.exception.message)
('estimator', sklearn.tree.DecisionTreeClassifier())
]
) # build a sklearn classifier
for task_id in benchmark_suite.tasks[:1]: # iterate over all tasks
task = openml.tasks.get_task(task_id) # download the OpenML task
X, y = task.get_X_and_y() # get the data (not used in this example)
openml.config.apikey = openml.config.apikey # set the OpenML Api Key
run = openml.runs.run_model_on_task(
clf, task, avoid_duplicate_runs=False
) # run classifier on splits (requires API key)
score = run.get_metric_fn(
sklearn.metrics.accuracy_score
            )  # compute the accuracy score
print('Data set: %s; Accuracy: %0.2f' % (task.get_dataset().name, score.mean()))
run.publish() # publish the experiment on OpenML (optional)
TestBase._mark_entity_for_removal('run', run.run_id)
TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
run.run_id))
print('URL for run: %s/run/%d' % (openml.config.server, run.run_id))
self.assertEqual(
flow.upload_date,
flow.components['lr'].upload_date,
msg=(
flow.name,
flow.flow_id,
flow.components['lr'].name, flow.components['lr'].flow_id,
),
)
clf1 = sklearn.tree.DecisionTreeClassifier(max_depth=2)
flow1 = self.extension.model_to_flow(clf1)
flow1, sentinel = self._add_sentinel_to_flow_name(flow1, None)
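        # keep the sentinel so the second flow below gets the same test prefix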
flow1.publish()
        TestBase._mark_entity_for_removal('flow', (flow1.flow_id, flow1.name))
TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
flow1.flow_id))
# In order to assign different upload times to the flows!
time.sleep(1)
clf2 = sklearn.ensemble.VotingClassifier(
[('dt', sklearn.tree.DecisionTreeClassifier(max_depth=2))])
flow2 = self.extension.model_to_flow(clf2)
flow2, _ = self._add_sentinel_to_flow_name(flow2, sentinel)
flow2.publish()
TestBase._mark_entity_for_removal('flow', (flow2.flow_id, flow2.name))
TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
flow2.flow_id))
# If one component was published before the other, the components in
# the flow should have different upload dates
        self.assertNotEqual(flow2.upload_date,
                            flow2.components['dt'].upload_date)
language='English',
licence='MIT',
default_target_attribute='col_{}'.format(data.shape[1] - 1),
row_id_attribute=None,
ignore_attribute=None,
citation='None',
attributes=attributes,
data=data,
version_label='test',
original_data_url='http://openml.github.io/openml-python',
paper_url='http://openml.github.io/openml-python'
)
dataset.publish()
TestBase._mark_entity_for_removal('data', dataset.id)
TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
dataset.id))
self.assertEqual(
_get_online_dataset_arff(dataset.id),
dataset._dataset,
"Uploaded arff does not match original one"
)
self.assertEqual(
_get_online_dataset_format(dataset.id),
'arff',
"Wrong format for dataset"
)