How to use the smdebug.tensorflow.SaveConfig function in smdebug

To help you get started, we’ve selected a few smdebug examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github awslabs / sagemaker-debugger / examples / tensorflow / scripts / distributed_training / horovod_mnist_estimator.py View on Github external
)

    # Setup the Tornasole Hook

    # save tensors as reductions if necessary
    rdnc = (
        smd.ReductionConfig(reductions=["mean"], abs_reductions=["max"], norms=["l1"])
        if FLAGS.reductions
        else None
    )

    ts_hook = smd.SessionHook(
        out_dir=FLAGS.smdebug_path,
        save_all=FLAGS.save_all,
        include_collections=["weights", "gradients", "losses", "biases"],
        save_config=smd.SaveConfig(save_interval=FLAGS.save_frequency),
        reduction_config=rdnc,
    )

    ts_hook.set_mode(smd.modes.TRAIN)

    # Horovod: adjust number of steps based on number of GPUs.
    mnist_classifier.train(
        input_fn=train_input_fn,
        steps=FLAGS.steps // hvd.size(),
        hooks=[logging_hook, bcast_hook, ts_hook],
    )

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data}, y=eval_labels, num_epochs=1, shuffle=False
    )
github awslabs / sagemaker-debugger / tests / tensorflow / keras / test_keras_mirrored.py View on Github external
def test_save_one_worker(out_dir):
    """Check that training with ``include_workers="one"`` records one worker.

    Runs ``train_model`` with ``save_all=True`` and
    ``SaveConfig(save_steps=[5])``, opens the trial written to ``out_dir`` and
    asserts that exactly one worker is reported, that at least one step was
    saved, and that the "weights", "biases" and "gradients" collections each
    expose at least one tensor name.

    Args:
        out_dir: Directory passed to ``train_model`` and then read back with
            ``create_trial_fast_refresh``.
    """
    # The value returned by train_model is not needed for these checks.
    train_model(
        out_dir,
        include_collections=None,
        save_all=True,
        save_config=SaveConfig(save_steps=[5]),
        steps=["train"],
        include_workers="one",
    )
    tr = create_trial_fast_refresh(out_dir)
    assert len(tr.workers()) == 1
    assert len(tr.steps())

    for collection in ("weights", "biases"):
        # Look the names up once per collection instead of repeating the call.
        names = tr.tensor_names(collection=collection)
        assert len(names), collection
        # The first tensor of the collection must report a single worker
        # for workers(0).
        assert len(tr.tensor(names[0]).workers(0)) == 1, collection
    assert len(tr.tensor_names(collection="gradients"))
github awslabs / sagemaker-debugger / tests / tensorflow / keras / test_keras.py View on Github external
if hook is None:
        if save_config is None:
            save_config = SaveConfig(save_interval=3)

        hook = KerasHook(
            trial_dir,
            save_config=save_config,
            save_all=save_all,
            include_collections=include_collections,
            reduction_config=reduction_config,
        )

        if not save_all and include_collections is not None:
            for cname in hook.include_collections:
                if cname not in include_collections:
                    hook.get_collection(cname).save_config = SaveConfig(end_step=0)

    if create_relu_collection:
        hook.get_collection("relu").add_keras_layer(relu_layer, inputs=True, outputs=True)

    if use_keras_optimizer:
        opt = keras.optimizers.RMSprop()
    else:
        opt = tf.train.RMSPropOptimizer(0.1)

    opt = hook.wrap_optimizer(opt)

    if use_tf_keras:
        model.compile(
            optimizer=opt,
            loss="sparse_categorical_crossentropy",
            run_eagerly=eager,
github awslabs / sagemaker-debugger / tests / tensorflow / hooks / test_reductions.py View on Github external
def test_reductions(out_dir, save_raw_tensor=False):
    """Run the shared reductions check with every allowed reduction and norm.

    Builds a ``ReductionConfig`` that enables ``ALLOWED_REDUCTIONS`` and
    ``ALLOWED_NORMS`` in both their plain and absolute-value variants, attaches
    it to a ``SessionHook`` created with ``save_interval=1`` for the "weights",
    "gradients" and "losses" collections, and hands the hook to
    ``helper_test_reductions``.

    Args:
        out_dir: Output directory given to the hook and to the helper.
        save_raw_tensor: Forwarded to ``ReductionConfig`` and to the helper.
    """
    pre_test_clean_up()

    reduction_settings = {
        "reductions": ALLOWED_REDUCTIONS,
        "abs_reductions": ALLOWED_REDUCTIONS,
        "norms": ALLOWED_NORMS,
        "abs_norms": ALLOWED_NORMS,
        "save_raw_tensor": save_raw_tensor,
    }
    reduction_config = smd.ReductionConfig(**reduction_settings)
    save_config = smd.SaveConfig(save_interval=1)

    session_hook = smd.SessionHook(
        out_dir=out_dir,
        save_config=save_config,
        reduction_config=reduction_config,
        include_collections=["weights", "gradients", "losses"],
    )
    helper_test_reductions(out_dir, session_hook, save_raw_tensor)
github awslabs / sagemaker-debugger / tests / tensorflow / keras / test_keras.py View on Github external
def test_clash_with_tb_callback(out_dir):
    """Train with a "tensorboard" callback added and check the saved tensors.

    Calls ``train_model`` for the "train" step with ``save_interval=9`` and the
    weights, biases, losses and metrics collections, then asserts that the
    resulting trial exposes exactly 8 tensor names. ``out_dir`` is removed
    afterwards.

    Args:
        out_dir: Directory the training run writes to; deleted at the end.
    """
    save_config = SaveConfig(save_interval=9)
    saved_collections = [
        CollectionKeys.WEIGHTS,
        CollectionKeys.BIASES,
        CollectionKeys.LOSSES,
        CollectionKeys.METRICS,
    ]

    train_model(
        out_dir,
        save_config=save_config,
        steps=["train"],
        include_collections=saved_collections,
        add_callbacks=["tensorboard"],
    )

    trial = create_trial_fast_refresh(out_dir)
    tensor_count = len(trial.tensor_names())
    assert tensor_count == 8

    # Clean up the directory produced by this run.
    shutil.rmtree(out_dir)
github awslabs / sagemaker-debugger / examples / tensorflow / scripts / train_imagenet_resnet_hvd.py View on Github external
if FLAGS.tornasole_relu_reductions:
        for r in FLAGS.tornasole_relu_reductions:
            reductions.append(r)
    if FLAGS.tornasole_relu_reductions_abs:
        for r in FLAGS.tornasole_relu_reductions_abs:
            abs_reductions.append(r)
    if reductions or abs_reductions:
        rnc = smd.ReductionConfig(reductions=reductions, abs_reductions=abs_reductions)
    else:
        rnc = None

    include_collections = ["losses"]

    hook = smd.SessionHook(
        out_dir=FLAGS.smdebug_path,
        save_config=smd.SaveConfig(save_interval=FLAGS.step_interval),
        reduction_config=rnc,
        include_collections=include_collections,
        save_all=FLAGS.tornasole_save_all,
    )
    if FLAGS.save_weights is True:
        include_collections.append("weights")
    if FLAGS.save_gradients is True:
        include_collections.append("gradients")
    if FLAGS.tornasole_save_relu_activations is True:
        include_collections.append("relu_activations")
    if FLAGS.tornasole_save_inputs is True:
        include_collections.append("inputs")
    if FLAGS.tornasole_include:
        hook.get_collection("default").include(FLAGS.tornasole_include)
        include_collections.append("default")
    return hook
github awslabs / sagemaker-debugger / bin / sagemaker-containers / tensorflow / tf-train.py View on Github external
# Command-line options for the training script.
parser = argparse.ArgumentParser()
parser.add_argument("--lr", type=float, help="Learning Rate", default=0.001)
parser.add_argument("--steps", type=int, help="Number of steps to run", default=100)
parser.add_argument("--scale", type=float, help="Scaling factor for inputs", default=1.0)
# NOTE(review): --save_frequency is parsed as a float, and in the lines visible
# here the hook below uses a hard-coded save_interval=10 instead of
# args.save_frequency -- confirm whether that is intended.
parser.add_argument("--save_frequency", type=float, help="How often to save TS data", default=10)
parser.add_argument("--run_name", type=str, help="Run Name", default=str(uuid.uuid4()))
parser.add_argument("--local_reductions", nargs="+", type=str, default=[])
# When running in TF estimator mode, the script needs to accept a --model_dir parameter.
parser.add_argument("--model_dir", type=str, help="model dir", default=str(uuid.uuid4()))
args = parser.parse_args()

# The current timestamp is appended to the S3 output path, presumably so that
# each run writes to its own location.
t = str(time.time())
hook = SessionHook(
    "s3://tornasolecodebuildtest/container_testing/ts_outputs/tf" + t,
    save_config=SaveConfig(save_interval=10),
)

# Network definition
with tf.name_scope("foobar"):
    # x: float32 placeholder with 2 columns; w: trainable 2x1 weight initialised to 10.0.
    x = tf.placeholder(shape=(None, 2), dtype=tf.float32)
    w = tf.Variable(initial_value=[[10.0], [10.0]])
with tf.name_scope("foobaz"):
    # y is computed from x with the fixed weights w0 and serves as the target below.
    w0 = [[1], [1.0]]
    y = tf.matmul(x, w0)
# Mean squared difference between the prediction x @ w and the target y.
loss = tf.reduce_mean((tf.matmul(x, w) - y) ** 2, name="loss")
# Non-trainable step counter, initialised to 17.
global_step = tf.Variable(17, name="global_step", trainable=False)
increment_global_step_op = tf.assign(global_step, global_step + 1)
optimizer = tf.train.AdamOptimizer(args.lr)
# NOTE(review): this wraps the optimizer via get_hook() rather than the `hook`
# object created above; get_hook is not defined in the visible lines -- confirm
# that it returns the same hook.
optimizer = get_hook().wrap_optimizer(optimizer)
# NOTE(review): the assign op, not the global_step variable itself, is passed as
# `global_step` here -- confirm this is intended.
optimizer_op = optimizer.minimize(loss, global_step=increment_global_step_op)
graph = tf.get_default_graph()
github awslabs / sagemaker-debugger / examples / tensorflow / local / tf_keras_resnet.py View on Github external
parser = argparse.ArgumentParser(description="Train resnet50 cifar10")
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--epoch", type=int, default=3)
    parser.add_argument("--model_dir", type=str, default="./model_keras_resnet")
    parser.add_argument("--out_dir", type=str)
    parser.add_argument("--save_interval", type=int, default=500)
    opt = parser.parse_args()

    model = ResNet50(weights=None, input_shape=(32, 32, 3), classes=10)

    ##### Enabling SageMaker Debugger ###########
    # creating hook
    hook = smd.KerasHook(
        out_dir=opt.out_dir,
        include_collections=["weights", "gradients", "losses"],
        save_config=smd.SaveConfig(save_interval=opt.save_interval),
    )

    optimizer = tf.keras.optimizers.Adam()

    ##### Enabling SageMaker Debugger ###########
    # wrap the optimizer so the hook can identify the gradients
    optimizer = hook.wrap_optimizer(optimizer)
    model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

    # start the training.
    train(opt.batch_size, opt.epoch, model, hook)