How to use the smdebug.tensorflow module in smdebug

To help you get started, we've selected a few smdebug.tensorflow examples based on popular ways it is used in public projects.

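All of the snippets below come from the awslabs/sagemaker-debugger repository and assume that smdebug's TensorFlow module has been imported under the smd alias:

import smdebug.tensorflow as smd
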
github awslabs / sagemaker-debugger / tests / zero_code_change / tensorflow_integration_tests.py
            model.compile(
                loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"]
            )
            history = model.fit(
                x_train, y_train, batch_size=16, epochs=5, validation_split=0.2, callbacks=[hook]
            )
            test_scores = model.evaluate(x_test, y_test, verbose=2, callbacks=[hook])
        else:
            model.compile(
                loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"]
            )
            history = model.fit(x_train, y_train, batch_size=16, epochs=5, validation_split=0.2)
            test_scores = model.evaluate(x_test, y_test, verbose=2)

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        assert len(trial.tensor_names(collection="gradients")) > 0
        if not tf_optimizer:
            # as this is only supported for keras optimizers currently
            assert len(trial.tensor_names(collection="optimizer_variables")) > 0
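
The assertions above use smdebug's Trial API to verify what the hook wrote. A minimal sketch of inspecting a finished run the same way (the output path and the chosen tensor are illustrative):

import smdebug.tensorflow as smd

trial = smd.create_trial(path="/tmp/smdebug_run")      # the out_dir the hook wrote to
print(trial.steps())                                   # steps at which tensors were saved
print(trial.tensor_names(collection="gradients"))      # names in the "gradients" collection
name = trial.tensor_names(collection="losses")[0]      # pick one saved tensor
print(trial.tensor(name).value(trial.steps()[-1]))     # its value at the last saved step
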
github awslabs / sagemaker-debugger / tests / tensorflow / hooks / test_mirrored_strategy.py
    # Use multiple GPUs via MirroredStrategy.
    # All available GPUs will be used if `num_gpus` is omitted.
    # if num_devices > 1:
    distribution = tf.contrib.distribute.MirroredStrategy()
    # print("### Doing Multi GPU Training")
    # else:
    #     distribution = None
    # Pass to RunConfig
    config = tf.estimator.RunConfig(
        train_distribute=distribution,
        eval_distribute=distribution if eval_distributed else None,
        model_dir="/tmp/mnist_convnet_model",
    )

    if save_config is None:
        save_config = smd.SaveConfig(save_interval=2)

    if include_collections is None:
        include_collections = [
            CollectionKeys.WEIGHTS,
            CollectionKeys.BIASES,
            CollectionKeys.GRADIENTS,
            CollectionKeys.LOSSES,
        ]

    if not zcc:
        ts_hook = smd.SessionHook(
            out_dir=trial_dir,
            save_all=save_all,
            include_collections=include_collections,
            save_config=save_config,
            reduction_config=reduction_config,
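
The constructor above is cut off; what follows in the test is attaching the hook to the Estimator calls. A sketch of that pattern (the estimator name, input functions, and step count are illustrative) — a SessionHook is a tf.train.SessionRunHook, so it rides along in the hooks argument:

ts_hook.set_mode(smd.modes.TRAIN)
mnist_classifier.train(input_fn=train_input_fn, steps=100, hooks=[ts_hook])

ts_hook.set_mode(smd.modes.EVAL)
mnist_classifier.evaluate(input_fn=eval_input_fn, hooks=[ts_hook])
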
github awslabs / sagemaker-debugger / tests / tensorflow / test_sagemaker.py
"CollectionConfigurations": [
        {
            "CollectionName": "weights",
            "CollectionParameters": null
        },
        {
            "CollectionName": "losses",
            "CollectionParameters": null
        }
    ],
    "DebugHookSpecification": null
}
"""
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        smd.del_hook()
        hook = smd.get_hook(hook_type="session", create_if_not_exists=True)
        print(hook)
        assert "weights" in hook.include_collections, hook
github awslabs / sagemaker-debugger / examples / tensorflow / scripts / distributed_training / horovod_mnist_estimator.py
    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from
    # rank 0 to all other processes. This is necessary to ensure consistent
    # initialization of all workers when training is started with random weights or
    # restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data}, y=train_labels, batch_size=100, num_epochs=None, shuffle=True
    )

    # Set up the smdebug hook (named the Tornasole hook in older versions)

    # save tensors as reductions if necessary
    rdnc = (
        smd.ReductionConfig(reductions=["mean"], abs_reductions=["max"], norms=["l1"])
        if FLAGS.reductions
        else None
    )

    ts_hook = smd.SessionHook(
        out_dir=FLAGS.smdebug_path,
        save_all=FLAGS.save_all,
        include_collections=["weights", "gradients", "losses", "biases"],
        save_config=smd.SaveConfig(save_interval=FLAGS.save_frequency),
        reduction_config=rdnc,
    )

    ts_hook.set_mode(smd.modes.TRAIN)

    # Horovod: adjust number of steps based on number of GPUs.
    mnist_classifier.train(
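
The train call is truncated above; a sketch of how it typically completes, with both the Horovod broadcast hook and the smdebug hook attached (the step count is illustrative):

mnist_classifier.train(
    input_fn=train_input_fn,
    steps=20000 // hvd.size(),      # Horovod: fewer steps per worker as workers are added
    hooks=[bcast_hook, ts_hook],
)
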
github awslabs / sagemaker-debugger / examples / tensorflow / scripts / distributed_training / parameter_server_training / parameter_server_mnist.py
        # Generate predictions (for PREDICT and EVAL mode)
        "classes": tf.argmax(input=logits, axis=1),
        # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
        # `logging_hook`.
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor"),
    }
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate Loss (for both TRAIN and EVAL modes)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
        optimizer = smd.get_hook().wrap_optimizer(optimizer)
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])
    }
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
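
smd.get_hook() inside the model function works because constructing a hook registers it globally. A sketch of the registration that would sit in the training script (the output path, estimator name, and step count are illustrative):

hook = smd.EstimatorHook(
    out_dir="/tmp/ps_mnist_debug",
    include_collections=["weights", "gradients", "losses"],
)
# creating the hook registers it, so cnn_model_fn can fetch it with smd.get_hook()
assert smd.get_hook() is not None
mnist_classifier.train(input_fn=train_input_fn, steps=1000, hooks=[hook])
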
github awslabs / sagemaker-debugger / examples / tensorflow / sagemaker_byoc / tf_keras_resnet.py
def main():
    parser = argparse.ArgumentParser(description="Train resnet50 cifar10")
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--epoch", type=int, default=3)
    parser.add_argument("--model_dir", type=str, default="./model_keras_resnet")
    opt = parser.parse_args()

    model = ResNet50(weights=None, input_shape=(32, 32, 3), classes=10)

    ##### Enabling SageMaker Debugger ###########
    # Create hook from the configuration provided through sagemaker python sdk
    hook = smd.KerasHook.create_from_json_file()
    optimizer = tf.keras.optimizers.Adam()

    ##### Enabling SageMaker Debugger ###########
    # wrap the optimizer so the hook can identify the gradients
    optimizer = hook.wrap_optimizer(optimizer)

    model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
    train(opt.batch_size, opt.epoch, model, hook)
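
The train() helper receives the hook because a KerasHook also acts as a Keras callback. A minimal sketch of such a helper (the CIFAR-10 preprocessing shown here is an assumption about what the full script does):

def train(batch_size, epoch, model, hook):
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0
    y_train = tf.keras.utils.to_categorical(y_train, 10)
    y_test = tf.keras.utils.to_categorical(y_test, 10)
    model.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        epochs=epoch,
        validation_data=(x_test, y_test),
        callbacks=[hook],  # the smdebug KerasHook plugs in like any other Keras callback
    )
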
github awslabs / sagemaker-debugger / examples / tensorflow / local / mnist.py
"--num_eval_steps",
        type=int,
        help="Number of steps to evaluate for. If this"
        "is passed, it doesnt evaluate over the full eval set",
    )
    parser.add_argument("--model_dir", type=str, default="/tmp/mnist_model")
    args = parser.parse_args()

    if args.random_seed:
        tf.set_random_seed(2)
        np.random.seed(2)
        random.seed(12)

    ##### Enabling SageMaker Debugger ###########
    # creating hook
    hook = smd.EstimatorHook(
        out_dir=args.out_dir,
        include_collections=["weights", "gradients"],
        save_config=smd.SaveConfig(save_interval=args.save_interval),
    )

    def cnn_model_fn(features, labels, mode):
        """Model function for CNN."""
        # Input Layer
        input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])

        # Convolutional Layer #1
        conv1 = tf.layers.conv2d(
            inputs=input_layer,
            filters=32,
            kernel_size=[5, 5],
            padding="same",
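
For the "gradients" collection requested above to be populated, the optimizer inside cnn_model_fn is wrapped as well. A sketch of the TRAIN branch, continuing the model function (the learning rate is illustrative):

        # Configure the Training Op (for TRAIN mode)
        if mode == tf.estimator.ModeKeys.TRAIN:
            optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
            optimizer = hook.wrap_optimizer(optimizer)  # lets the hook capture gradient tensors
            train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
            return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
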
github awslabs / sagemaker-debugger / examples / tensorflow / scripts / distributed_training / mirrored_strategy_mnist.py
    # Use multiple GPUs via MirroredStrategy.
    # All available GPUs will be used if `num_gpus` is omitted.
    if num_gpus > 1:
        distribution = tf.contrib.distribute.MirroredStrategy(num_gpus=num_gpus)
        print("### Doing Multi GPU Training")
    else:
        distribution = None
    # Pass to RunConfig
    config = tf.estimator.RunConfig(
        train_distribute=distribution, model_dir="/tmp/mnist_convnet_model"
    )

    # save tensors as reductions if necessary
    rdnc = (
        smd.ReductionConfig(reductions=["mean"], abs_reductions=["max"], norms=["l1"])
        if FLAGS.reductions
        else None
    )

    ts_hook = smd.SessionHook(
        out_dir=FLAGS.smdebug_path,
        save_all=FLAGS.save_all,
        include_collections=["weights", "gradients", "losses", "biases"],
        save_config=smd.SaveConfig(save_interval=FLAGS.save_frequency),
        reduction_config=rdnc,
    )

    ts_hook.set_mode(smd.modes.TRAIN)

    # Create the Estimator
    # pass RunConfig
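
Because a ReductionConfig is set, only the requested reductions (mean, abs max, l1 norm) are stored rather than full tensors. A sketch of reading them back with the Trial API after training (the tensor chosen is illustrative):

from smdebug.trials import create_trial

trial = create_trial(FLAGS.smdebug_path)
step = trial.steps()[-1]
tname = trial.tensor_names(collection="weights")[0]
print(trial.tensor(tname).reduction_value(step, "mean"))
print(trial.tensor(tname).reduction_value(step, "max", abs=True))
print(trial.tensor(tname).reduction_value(step, "l1"))
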
github awslabs / sagemaker-debugger / examples / tensorflow / local / tf_keras_resnet.py
def main():
    parser = argparse.ArgumentParser(description="Train resnet50 cifar10")
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--epoch", type=int, default=3)
    parser.add_argument("--model_dir", type=str, default="./model_keras_resnet")
    parser.add_argument("--out_dir", type=str)
    parser.add_argument("--save_interval", type=int, default=500)
    opt = parser.parse_args()

    model = ResNet50(weights=None, input_shape=(32, 32, 3), classes=10)

    ##### Enabling SageMaker Debugger ###########
    # creating hook
    hook = smd.KerasHook(
        out_dir=opt.out_dir,
        include_collections=["weights", "gradients", "losses"],
        save_config=smd.SaveConfig(save_interval=opt.save_interval),
    )

    optimizer = tf.keras.optimizers.Adam()

    ##### Enabling SageMaker Debugger ###########
    # wrap the optimizer so the hook can identify the gradients
    optimizer = hook.wrap_optimizer(optimizer)
    model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

    # start the training.
    train(opt.batch_size, opt.epoch, model, hook)
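
After train() finishes, the tensors written to --out_dir can be read back. A small sketch that walks the saved losses (assuming at least one tensor was saved to the "losses" collection configured above):

from smdebug.trials import create_trial

trial = create_trial(opt.out_dir)
loss_name = trial.tensor_names(collection="losses")[0]
for step in trial.tensor(loss_name).steps():
    print(step, trial.tensor(loss_name).value(step))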