import os
import shutil

import mlflow
import mlflow.spark as sparkm


def test_sparkml_estimator_model_log(tmpdir, spark_model_estimator):
    old_tracking_uri = mlflow.get_tracking_uri()
    cnt = 0
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        for dfs_tmp_dir in [None, os.path.join(str(tmpdir), "test")]:
            print("should_start_run =", should_start_run, "dfs_tmp_dir =", dfs_tmp_dir)
            try:
                tracking_dir = os.path.abspath(str(tmpdir.join("mlruns")))
                mlflow.set_tracking_uri("file://%s" % tracking_dir)
                if should_start_run:
                    mlflow.start_run()
                artifact_path = "model%d" % cnt
                cnt += 1
                sparkm.log_model(
                    artifact_path=artifact_path,
                    spark_model=spark_model_estimator.model,
                    dfs_tmpdir=dfs_tmp_dir)
                model_uri = "runs:/{run_id}/{artifact_path}".format(
                    run_id=mlflow.active_run().info.run_id,
                    artifact_path=artifact_path)
                # test reloaded model
                reloaded_model = sparkm.load_model(model_uri=model_uri, dfs_tmpdir=dfs_tmp_dir)
                preds_df = reloaded_model.transform(spark_model_estimator.spark_df)
                preds = [x.prediction for x in preds_df.select("prediction").collect()]
                assert spark_model_estimator.predictions == preds
            finally:
                mlflow.end_run()
                mlflow.set_tracking_uri(old_tracking_uri)
                shutil.rmtree(dfs_tmp_dir or sparkm.DFS_TMP)
                shutil.rmtree(tracking_dir)

import os

import mlflow
import pandas as pd
from sklearn import datasets, metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# load data
data = datasets.load_boston()

# preprocess data
x = pd.DataFrame(data.data, columns=data.feature_names)
column_order = x.columns
y = pd.DataFrame(data.target, columns=["MEDV"])
x_train, x_test, y_train, y_test = train_test_split(x, y)

# configure mlflow
mlflow.set_tracking_uri(uri='http://35.240.197.5:5000')
print('=== CircleCI env vars')
print(os.environ)
print("os.environ.get('CI', '') == 'true'")
print(os.environ.get('CI', '') == 'true')
if os.environ.get('CI', '') == 'true':
    mlflow.set_experiment('CI')
else:
    mlflow.set_experiment('dev')

with mlflow.start_run() as run:
    # define hyperparameters
    N_ESTIMATORS = 2
    MAX_DEPTH = 2
    # train model
    model = RandomForestRegressor(n_estimators=N_ESTIMATORS, max_depth=MAX_DEPTH)
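    # A hedged continuation sketch (the excerpt stops here): fit the regressor,
    # log the hyperparameters and a held-out error, and store the fitted model.
    # mlflow.sklearn is an extra import not present in the original excerpt.
    import mlflow.sklearn

    mlflow.log_param("n_estimators", N_ESTIMATORS)
    mlflow.log_param("max_depth", MAX_DEPTH)
    model.fit(x_train, y_train.values.ravel())
    test_mse = metrics.mean_squared_error(y_test, model.predict(x_test))
    mlflow.log_metric("test_mse", test_mse)
    mlflow.sklearn.log_model(model, "model")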

# Pass a numpy array by passing DataFrame.values
# NOTE: the excerpt begins mid-statement; the head of this call and the
# num_eval_examples value below are assumed from the parallel
# validation_dataset call.
training_dataset = model.input_fn(
    features=train_x.values,
    labels=train_y,
    shuffle=True,
    num_epochs=args.num_epochs,
    batch_size=args.batch_size)

num_eval_examples = eval_x.shape[0]

# Pass a numpy array by passing DataFrame.values
validation_dataset = model.input_fn(
    features=eval_x.values,
    labels=eval_y,
    shuffle=False,
    num_epochs=args.num_epochs,
    batch_size=num_eval_examples)

start_time = time()

# Set MLflow tracking URI
if args.mlflow_tracking_uri:
    mlflow.set_tracking_uri(args.mlflow_tracking_uri)

# Train model
with mlflow.start_run() as active_run:
    run_id = active_run.info.run_id

    # Callbacks
    class MlflowCallback(tf.keras.callbacks.Callback):
        # This function will be called after training completes.
        def on_train_end(self, logs=None):
            mlflow.log_param('num_layers', len(self.model.layers))
            mlflow.log_param('optimizer_name',
                             type(self.model.optimizer).__name__)

    # MLflow callback
    mlflow_callback = MlflowCallback()

    # Setup Learning Rate decay callback.
    lr_decay_callback = tf.keras.callbacks.LearningRateScheduler(
        lambda epoch: args.learning_rate + 0.02 * (0.5 ** (1 + epoch)),
        verbose=True)


import mlflow


def save_mlflow_run(params, metrics, artifacts):
    """Save an MLflow run (params, metrics, artifacts) to the tracking server."""
    mlflow.set_tracking_uri('http://localhost:5000')
    mlflow.set_experiment('dvc_dask_use_case')
    with mlflow.start_run():
        for stage, stage_params in params.items():
            for key, value in stage_params.items():
                mlflow.log_param(key, value)
        for metric, value in metrics.items():
            mlflow.log_metric(metric, value)
        for path in artifacts:
            mlflow.log_artifact(path)
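
# A short usage sketch for the helper above; the stage name, parameter values,
# metric values, and artifact list are illustrative assumptions rather than
# values from the original project (artifact paths must exist before logging).
example_params = {"train": {"n_estimators": 100, "max_depth": 5}}
example_metrics = {"rmse": 3.21, "r2": 0.87}
example_artifacts = []  # e.g. ["model.pkl"]
save_mlflow_run(example_params, example_metrics, example_artifacts)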


def train_ner(model_name, output_path, train_data, dev_data, test_data, dropout, n_iter, patience):
    mlflow.set_tracking_uri("./mlruns")
    mlflow.set_experiment("Spacy NER")
    mlflow.start_run(run_name="Using all")
    if model_name in ["None", "False", "", "blank"]:
        model_name = None
    trainer = SpacyNerTrainer(model_name, output_path)
    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    logging.info("Reading train data")
    diterator = DataIterator()
    train_sentences = list(tqdm(itertools.islice(diterator.tagged_sentences(train_data), None)))
    logging.info("Got {} sentences with at least one entity".format(len(train_sentences)))
    logging.info("Reading test data")
    test_sentences = list(tqdm(diterator.tagged_sentences(test_data)))
    logging.info("Got {} sentences with at least one entity".format(len(test_sentences)))
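    # A hedged sketch (not in the original excerpt) of how the run opened above
    # could record its configuration and data sizes and then be closed; the
    # param and metric names are illustrative assumptions.
    mlflow.log_param("model_name", model_name)
    mlflow.log_param("dropout", dropout)
    mlflow.log_param("n_iter", n_iter)
    mlflow.log_param("patience", patience)
    mlflow.log_metric("train_sentences", len(train_sentences))
    mlflow.log_metric("test_sentences", len(test_sentences))
    mlflow.end_run()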


import os
import time

import mlflow
import tensorflow as tf


def run(epochs, batch_size):
    # tracking_uri = 'https://community.cloud.pipeline.ai'
    users_home = '/mnt/pipelineai/users'
    experiment_base_path = '%s/experiments' % users_home
    tracking_uri = 'file://%s' % experiment_base_path
    mlflow.set_tracking_uri(tracking_uri)

    experiment_name = '%s-%s' % (os.getenv('PIPELINE_RESOURCE_NAME', 'mnist'), os.getenv('PIPELINE_TAG', int(1000 * time.time())))
    mlflow.set_experiment(experiment_name)

    with mlflow.start_run() as run:
        mlflow.log_param("epochs", str(epochs))
        mlflow.log_param("batch_size", str(batch_size))

        mnist = tf.keras.datasets.mnist
        (x_train, y_train), (x_test, y_test) = mnist.load_data()
        x_train, x_test = x_train / 255.0, x_test / 255.0

        model = tf.keras.models.Sequential([
            tf.keras.layers.Flatten(input_shape=(28, 28)),
        ])  # excerpt truncated here; the original model's remaining layers are not shown
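        # A hedged continuation: the layers, compile settings, and final metric
        # below are the standard tf.keras MNIST tutorial choices, added as
        # assumptions so the sketch runs end to end; they are not the original
        # file's code.
        model.add(tf.keras.layers.Dense(128, activation='relu'))
        model.add(tf.keras.layers.Dropout(0.2))
        model.add(tf.keras.layers.Dense(10, activation='softmax'))
        model.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])
        model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)
        test_loss, test_acc = model.evaluate(x_test, y_test)
        mlflow.log_metric("test_acc", float(test_acc))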
### import packages
#import os
import numpy as np
import pandas as pd
from statistics import mean
from matplotlib import pyplot as plt
from keras.layers import LSTM
from keras.layers import Dense
from keras.models import Sequential
from keras.callbacks import History
history = History()
import mlflow
import mlflow.keras
mlflow.set_tracking_uri('/Users/paulgureghian/mlruns')
from sklearn.preprocessing import MinMaxScaler
### read in the dataset to a dataframe
pd.set_option('display.max_columns', 8)
pd.set_option('display.width', 1000)
df = pd.read_csv('/bitstamp.csv')
print(df.head())
print('')
print(df.shape)
print('')
### encode the date
df['date'] = pd.to_datetime(df['Timestamp'], unit ='s').dt.date
group = df.groupby('date')
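
### a minimal, self-contained sketch (not part of the original script) showing
### how a Keras model would be logged to the tracking URI set above; the toy
### data, model size, and run below are illustrative assumptions only
demo_x = np.random.rand(32, 10, 1)   # 32 samples, 10 timesteps, 1 feature
demo_y = np.random.rand(32, 1)
demo_model = Sequential([LSTM(8, input_shape=(10, 1)), Dense(1)])
demo_model.compile(optimizer='adam', loss='mse')
with mlflow.start_run():
    demo_model.fit(demo_x, demo_y, epochs=1, callbacks=[history], verbose=0)
    mlflow.keras.log_model(demo_model, "model")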

import mlflow
import calculate_pi

from pyspark.sql import SparkSession

#user_id =
user_id = ''
model_name = 'sparkpi'
model_tag = 'v1'

if __name__ == "__main__":
    """
    Usage: pi [partitions]
    """
    tracking_uri = 'https://community.cloud.pipeline.ai'
    mlflow.set_tracking_uri(tracking_uri)

    experiment_name = '%s%s-%s' % (user_id, model_name, model_tag)
    # This will create and set the experiment
    mlflow.set_experiment(experiment_name)

    with mlflow.start_run() as run:
        spark = SparkSession\
            .builder\
            .appName("PythonSparkPi")\
            .getOrCreate()

        partitions = 2
        n = 100000 * partitions

        mlflow.log_param('partitions', str(partitions))
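        # A hedged sketch of how this run might finish: the imported
        # calculate_pi module is not shown, so the standard Monte Carlo
        # estimate stands in for it here, and the result is logged as a metric.
        import random
        from operator import add

        def inside(_):
            x = random.random() * 2 - 1
            y = random.random() * 2 - 1
            return 1 if x ** 2 + y ** 2 <= 1 else 0

        count = spark.sparkContext.parallelize(range(1, n + 1), partitions) \
            .map(inside).reduce(add)
        pi = 4.0 * count / n
        mlflow.log_metric('pi', pi)

        spark.stop()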