# Set up the Tornasole hook. Assumes: import smdebug.tensorflow as smd
# Save tensors as reductions if requested on the command line.
rdnc = (
    smd.ReductionConfig(reductions=["mean"], abs_reductions=["max"], norms=["l1"])
    if FLAGS.reductions
    else None
)
ts_hook = smd.SessionHook(
    out_dir=FLAGS.smdebug_path,
    save_all=FLAGS.save_all,
    include_collections=["weights", "gradients", "losses", "biases"],
    save_config=smd.SaveConfig(save_interval=FLAGS.save_frequency),
    reduction_config=rdnc,
)
# Tensors saved during training are recorded under the TRAIN mode.
ts_hook.set_mode(smd.modes.TRAIN)
# Horovod: adjust number of steps based on number of GPUs.
mnist_classifier.train(
    input_fn=train_input_fn,
    steps=FLAGS.steps // hvd.size(),
    hooks=[logging_hook, bcast_hook, ts_hook],
)
# Evaluate the model and print results
eval_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": eval_data}, y=eval_labels, num_epochs=1, shuffle=False
)
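# The snippet above builds eval_input_fn but stops before evaluating. A
# minimal continuation (sketch): switch the hook to EVAL mode so tensors
# captured during evaluation are stored under that mode, then evaluate.
ts_hook.set_mode(smd.modes.EVAL)
eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn, hooks=[ts_hook])
print(eval_results)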
def test_save_one_worker(out_dir):
    strategy = train_model(
        out_dir,
        include_collections=None,
        save_all=True,
        save_config=SaveConfig(save_steps=[5]),
        steps=["train"],
        include_workers="one",
    )
    tr = create_trial_fast_refresh(out_dir)
    # With include_workers="one", only a single worker writes tensors.
    assert len(tr.workers()) == 1
    assert len(tr.steps())
    assert len(tr.tensor_names(collection="weights"))
    assert len(tr.tensor(tr.tensor_names(collection="weights")[0]).workers(0)) == 1
    assert len(tr.tensor_names(collection="biases"))
    assert len(tr.tensor(tr.tensor_names(collection="biases")[0]).workers(0)) == 1
    assert len(tr.tensor_names(collection="gradients"))
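# Companion case (sketch): with include_workers="all", every worker under the
# distribution strategy writes tensors, so the trial should report one worker
# per replica. Assumes train_model returns the tf.distribute strategy it used.
def test_save_all_workers(out_dir):
    strategy = train_model(
        out_dir,
        include_collections=None,
        save_all=True,
        save_config=SaveConfig(save_steps=[5]),
        steps=["train"],
        include_workers="all",
    )
    tr = create_trial_fast_refresh(out_dir)
    assert len(tr.workers()) == strategy.num_replicas_in_sync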
if hook is None:
    if save_config is None:
        save_config = SaveConfig(save_interval=3)
    hook = KerasHook(
        trial_dir,
        save_config=save_config,
        save_all=save_all,
        include_collections=include_collections,
        reduction_config=reduction_config,
    )
    if not save_all and include_collections is not None:
        for cname in hook.include_collections:
            if cname not in include_collections:
                # Disable saving for collections the caller did not ask for.
                hook.get_collection(cname).save_config = SaveConfig(end_step=0)
if create_relu_collection:
    hook.get_collection("relu").add_keras_layer(relu_layer, inputs=True, outputs=True)
if use_keras_optimizer:
    opt = keras.optimizers.RMSprop()
else:
    opt = tf.train.RMSPropOptimizer(0.1)
# Wrap the optimizer so the hook can capture gradients.
opt = hook.wrap_optimizer(opt)
if use_tf_keras:
    model.compile(
        optimizer=opt,
        loss="sparse_categorical_crossentropy",
        run_eagerly=eager,
    )
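# Assumed continuation (sketch): the KerasHook is also a Keras callback,
# so it must be passed to fit() in addition to wrapping the optimizer.
# x_train / y_train are assumed to come from the surrounding helper.
model.fit(x_train, y_train, epochs=1, callbacks=[hook], verbose=0)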
def test_reductions(out_dir, save_raw_tensor=False):
    pre_test_clean_up()
    # Save every allowed reduction and norm, plus the raw tensor if requested.
    rdnc = smd.ReductionConfig(
        reductions=ALLOWED_REDUCTIONS,
        abs_reductions=ALLOWED_REDUCTIONS,
        norms=ALLOWED_NORMS,
        abs_norms=ALLOWED_NORMS,
        save_raw_tensor=save_raw_tensor,
    )
    hook = smd.SessionHook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_interval=1),
        reduction_config=rdnc,
        include_collections=["weights", "gradients", "losses"],
    )
    helper_test_reductions(out_dir, hook, save_raw_tensor)
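# Reading a saved reduction back (sketch; assumes the smdebug trials API and
# a trial directory written by the hook above):
from smdebug.trials import create_trial

tr = create_trial(out_dir)
tname = tr.tensor_names(collection="weights")[0]
# reduction_value returns the saved scalar instead of the full tensor
print(tr.tensor(tname).reduction_value(step_num=0, reduction_name="mean", abs=False))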
def test_clash_with_tb_callback(out_dir):
    # The smdebug hook should coexist with the Keras TensorBoard callback.
    train_model(
        out_dir,
        save_config=SaveConfig(save_interval=9),
        steps=["train"],
        include_collections=[
            CollectionKeys.WEIGHTS,
            CollectionKeys.BIASES,
            CollectionKeys.LOSSES,
            CollectionKeys.METRICS,
        ],
        add_callbacks=["tensorboard"],
    )
    tr = create_trial_fast_refresh(out_dir)
    assert len(tr.tensor_names()) == 8
    shutil.rmtree(out_dir)
# Collect the requested reduction names (both lists start empty).
reductions = []
abs_reductions = []
if FLAGS.tornasole_relu_reductions:
    for r in FLAGS.tornasole_relu_reductions:
        reductions.append(r)
if FLAGS.tornasole_relu_reductions_abs:
    for r in FLAGS.tornasole_relu_reductions_abs:
        abs_reductions.append(r)
if reductions or abs_reductions:
    rnc = smd.ReductionConfig(reductions=reductions, abs_reductions=abs_reductions)
else:
    rnc = None
# Build the full collection list before constructing the hook, so every
# requested collection is registered with it.
include_collections = ["losses"]
if FLAGS.save_weights is True:
    include_collections.append("weights")
if FLAGS.save_gradients is True:
    include_collections.append("gradients")
if FLAGS.tornasole_save_relu_activations is True:
    include_collections.append("relu_activations")
if FLAGS.tornasole_save_inputs is True:
    include_collections.append("inputs")
if FLAGS.tornasole_include:
    include_collections.append("default")
hook = smd.SessionHook(
    out_dir=FLAGS.smdebug_path,
    save_config=smd.SaveConfig(save_interval=FLAGS.step_interval),
    reduction_config=rnc,
    include_collections=include_collections,
    save_all=FLAGS.tornasole_save_all,
)
if FLAGS.tornasole_include:
    hook.get_collection("default").include(FLAGS.tornasole_include)
return hook
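# Typical use of the hook built above (sketch; `create_hook` is a hypothetical
# name for the surrounding function, and `estimator` / `train_input_fn` /
# `FLAGS.steps` are assumed to exist in the caller):
hook = create_hook()
hook.set_mode(smd.modes.TRAIN)
estimator.train(input_fn=train_input_fn, steps=FLAGS.steps, hooks=[hook])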
parser = argparse.ArgumentParser()
parser.add_argument("--lr", type=float, help="Learning Rate", default=0.001)
parser.add_argument("--steps", type=int, help="Number of steps to run", default=100)
parser.add_argument("--scale", type=float, help="Scaling factor for inputs", default=1.0)
# save_frequency is a step count, so it should be an int rather than a float
parser.add_argument("--save_frequency", type=int, help="How often to save TS data", default=10)
parser.add_argument("--run_name", type=str, help="Run Name", default=str(uuid.uuid4()))
parser.add_argument("--local_reductions", nargs="+", type=str, default=[])
# When running in TF estimator mode, the script needs to accept a --model_dir parameter.
parser.add_argument("--model_dir", type=str, help="model dir", default=str(uuid.uuid4()))
args = parser.parse_args()
t = str(time.time())
hook = SessionHook(
    "s3://tornasolecodebuildtest/container_testing/ts_outputs/tf" + t,
    save_config=SaveConfig(save_interval=10),
)
# Network definition
with tf.name_scope("foobar"):
    x = tf.placeholder(shape=(None, 2), dtype=tf.float32)
    w = tf.Variable(initial_value=[[10.0], [10.0]])
with tf.name_scope("foobaz"):
    w0 = [[1.0], [1.0]]
    y = tf.matmul(x, w0)
loss = tf.reduce_mean((tf.matmul(x, w) - y) ** 2, name="loss")
global_step = tf.Variable(17, name="global_step", trainable=False)
increment_global_step_op = tf.assign(global_step, global_step + 1)
optimizer = tf.train.AdamOptimizer(args.lr)
# Wrap the optimizer with the hook created above so gradients are captured.
optimizer = hook.wrap_optimizer(optimizer)
optimizer_op = optimizer.minimize(loss, global_step=increment_global_step_op)
graph = tf.get_default_graph()
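# Training loop (sketch): MonitoredSession runs the hook automatically while
# feeding random inputs scaled by --scale. Assumes: import numpy as np
sess = tf.train.MonitoredSession(hooks=[hook])
for i in range(args.steps):
    x_ = np.random.random((10, 2)) * args.scale
    _loss, _, _ = sess.run([loss, optimizer_op, increment_global_step_op], {x: x_})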
parser = argparse.ArgumentParser(description="Train resnet50 cifar10")
parser.add_argument("--batch_size", type=int, default=32)
parser.add_argument("--epoch", type=int, default=3)
parser.add_argument("--model_dir", type=str, default="./model_keras_resnet")
parser.add_argument("--out_dir", type=str)
parser.add_argument("--save_interval", type=int, default=500)
opt = parser.parse_args()
model = ResNet50(weights=None, input_shape=(32, 32, 3), classes=10)
##### Enabling SageMaker Debugger ###########
# Create the hook. Assumes: import smdebug.tensorflow as smd
hook = smd.KerasHook(
    out_dir=opt.out_dir,
    include_collections=["weights", "gradients", "losses"],
    save_config=smd.SaveConfig(save_interval=opt.save_interval),
)
optimizer = tf.keras.optimizers.Adam()
##### Enabling SageMaker Debugger ###########
# wrap the optimizer so the hook can identify the gradients
optimizer = hook.wrap_optimizer(optimizer)
model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
# start the training.
train(opt.batch_size, opt.epoch, model, hook)
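# Sketch of the assumed train() helper: loads CIFAR-10 and passes the hook as
# a Keras callback so it captures tensors during fit(). The data handling here
# is illustrative, not the example's exact pipeline.
import tensorflow as tf

def train(batch_size, epoch, model, hook):
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0
    # One-hot labels to match the categorical_crossentropy loss above.
    y_train = tf.keras.utils.to_categorical(y_train, 10)
    y_test = tf.keras.utils.to_categorical(y_test, 10)
    model.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        epochs=epoch,
        validation_data=(x_test, y_test),
        callbacks=[hook],  # the KerasHook doubles as a Keras callback
    )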