How to use the smdebug.tensorflow.get_hook function in smdebug

To help you get started, we've selected a few smdebug examples that show popular ways smdebug.tensorflow.get_hook is used in public projects.

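Before diving into the examples, here is a minimal sketch of the typical pattern. It assumes a hook has already been configured for the process, for example by SageMaker Debugger's zero-code-change mode or by constructing a hook in the training script; the optimizer shown is only illustrative.

import tensorflow as tf
import smdebug.tensorflow as smd

# get_hook() returns the hook registered for this process (for example, the one
# SageMaker Debugger creates automatically in zero-code-change mode), or None if
# no hook has been created yet.
hook = smd.get_hook()

# Illustrative optimizer; any TF 1.x-style optimizer can be wrapped this way.
optimizer = tf.compat.v1.train.MomentumOptimizer(learning_rate=0.001, momentum=0.9)
if hook is not None:
    hook.set_mode(smd.modes.TRAIN)              # tag the following steps as TRAIN
    optimizer = hook.wrap_optimizer(optimizer)  # let the hook capture gradients

The examples below show the same calls in context: in tests that assert the hook exists and saved tensors, and in training scripts that wrap optimizers and add tensors to collections.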

From awslabs/sagemaker-debugger, tests/zero_code_change/tensorflow_integration_tests.py: a Keras fit/evaluate run, followed by assertions that smd.get_hook() returned a hook and that tensors and gradients were saved.

loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"]
            )
            history = model.fit(
                x_train, y_train, batch_size=16, epochs=5, validation_split=0.2, callbacks=[hook]
            )
            test_scores = model.evaluate(x_test, y_test, verbose=2, callbacks=[hook])
        else:
            model.compile(
                loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"]
            )
            history = model.fit(x_train, y_train, batch_size=16, epochs=5, validation_split=0.2)
            test_scores = model.evaluate(x_test, y_test, verbose=2)

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        assert len(trial.tensor_names(collection="gradients")) > 0
        if not tf_optimizer:
            # as this is only supported for keras optimizers currently
            assert len(trial.tensor_names(collection="optimizer_variables")) > 0

From awslabs/sagemaker-debugger, tests/zero_code_change/tensorflow_integration_tests.py: TF 1.x session-based training under the SageMaker simulator, with the same smd.get_hook() and trial assertions.

    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        train_op, X, Y = get_train_op_and_placeholders()
        init = tf.compat.v1.global_variables_initializer()
        mnist = get_data()

        sess = tf.train.MonitoredSession()

        with sess:
            sess.run(init)
            for step in range(1, 101):
                batch_x, batch_y = mnist.train.next_batch(32)
                sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        assert len(trial.tensor_names(collection="gradients")) > 0

From awslabs/sagemaker-debugger, tests/zero_code_change/tensorflow_integration_tests.py: Estimator train/evaluate, checking that smd.get_hook() returned a hook and that the expected steps and gradients were saved.

        if mirrored:
            test_basic("/opt/ml/output/tensors", zcc=True)
        else:
            # Setup
            mnist_classifier = get_estimator(nested_optimizer=nested, mirrored=mirrored)
            train_input_fn, eval_input_fn = get_input_fns()

            # Train and evaluate
            train_steps, eval_steps = 10, 10
            mnist_classifier.train(input_fn=train_input_fn, steps=train_steps)
            mnist_classifier.evaluate(input_fn=eval_input_fn, steps=eval_steps)

            # Check that hook created and tensors saved
            trial = smd.create_trial(path=sim.out_dir)
            print(trial)
            assert smd.get_hook() is not None, "Hook was not created."
            assert len(trial.steps()) > 0, "Nothing saved at any step."
            assert len(trial.tensor_names()) > 0, "Tensors were not saved."
            assert trial.steps() == [0, 2, 4, 6, 8, 10, 12, 14, 16, 18], "Wrong step count for trial."
            print(trial.tensor_names(collection="gradients"))
            assert len(trial.tensor_names(collection="gradients")) > 0

From awslabs/sagemaker-debugger, tests/tensorflow/hooks/test_mirrored_strategy.py: the eval/predict phases of an Estimator run with MirroredStrategy; smd.get_hook()._cleanup() is called once the run finishes.

                mnist_classifier.evaluate(
                    input_fn=input_fn_provider.eval_input_fn, steps=num_steps, hooks=[ts_hook]
                )
            else:
                mnist_classifier.evaluate(input_fn=input_fn_provider.eval_input_fn, steps=num_steps)
        elif s == "predict":
            print("Starting predict")
            if not zcc:
                ts_hook.set_mode(smd.modes.PREDICT)
                # Run prediction with the smdebug hook attached
                p = mnist_classifier.predict(
                    input_fn=input_fn_provider.eval_input_fn, hooks=[ts_hook]
                )
            else:
                p = mnist_classifier.predict(input_fn=input_fn_provider.eval_input_fn)
            for i in range(num_steps):
                next(p)
    get_hook()._cleanup()
    return distribution

From awslabs/sagemaker-debugger, examples/tensorflow/scripts/distributed_training/horovod_mnist_estimator.py: wrapping a Horovod-distributed optimizer with smd.get_hook().wrap_optimizer() inside an Estimator model_fn.

    # Return predictions (for PREDICT mode)
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate Loss (for both TRAIN and EVAL modes)
    onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10)
    loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits)

    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        # Horovod: scale learning rate by the number of workers.
        optimizer = tf.train.MomentumOptimizer(learning_rate=0.001 * hvd.size(), momentum=0.9)

        # Horovod: add Horovod Distributed Optimizer.
        optimizer = hvd.DistributedOptimizer(optimizer)

        # Add smdebug Optimizer
        optimizer = smd.get_hook().wrap_optimizer(optimizer)

        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])
    }
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

From awslabs/sagemaker-debugger, examples/tensorflow/scripts/train_imagenet_resnet_hvd.py: adding ReLU activations to a custom collection with smd.get_hook().add_to_collection().

                shortcut = x
            else:
                shortcut = builder.max_pooling2d(x, 1, stride)
        else:
            shortcut = builder.conv2d_linear(x, depth, 1, stride, "SAME")
        if basic:
            x = builder.pad2d(x, 1)
            x = builder.conv2d(x, depth_bottleneck, 3, stride, "VALID")
            x = builder.conv2d_linear(x, depth, 3, 1, "SAME")
        else:
            x = builder.conv2d(x, depth_bottleneck, 1, 1, "SAME")
            x = builder.conv2d(x, depth_bottleneck, 3, stride, "SAME")
            # x = builder.conv2d_linear(x, depth,            1, 1,      'SAME')
            x = builder.conv2d_linear_last_bn(x, depth, 1, 1, "SAME")
        x = tf.nn.relu(x + shortcut)
        smd.get_hook().add_to_collection("relu_activations", x)
        return x
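
Tensors collected this way can later be read back by collection name through the trials API, much as the test snippets above do. A minimal sketch, assuming the hook wrote its output to /opt/ml/output/tensors (the path and collection name here are illustrative):

from smdebug.trials import create_trial

# Open the output directory the hook wrote to (illustrative path).
trial = create_trial("/opt/ml/output/tensors")

# List the tensors saved in the custom "relu_activations" collection and read
# the values recorded at the first saved step.
for name in trial.tensor_names(collection="relu_activations"):
    value = trial.tensor(name).value(trial.steps()[0])
    print(name, value.shape)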

From awslabs/sagemaker-debugger, examples/tensorflow/scripts/train_imagenet_resnet_hvd.py: wrapping the training optimizer with smd.get_hook().wrap_optimizer() before building the train op.

                    cdr_alpha,
                    lc_periods,
                    lc_alpha,
                    lc_beta,
                ),
            )
            learning_rate = tf.identity(learning_rate, "learning_rate")
            tf.summary.scalar("learning_rate", learning_rate)

        opt = tf.train.MomentumOptimizer(learning_rate, momentum, use_nesterov=True)
        opt = hvd.DistributedOptimizer(opt)
        if use_larc:
            opt = LarcOptimizer(opt, learning_rate, leta, clip=True)

        opt = MixedPrecisionOptimizer(opt, scale=loss_scale)
        opt = smd.get_hook().wrap_optimizer(opt)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) or []
        with tf.control_dependencies(update_ops):
            gate_gradients = tf.train.Optimizer.GATE_NONE
            train_op = opt.minimize(
                total_loss, global_step=tf.train.get_global_step(), gate_gradients=gate_gradients
            )
        train_op = tf.group(preload_op, gpucopy_op, train_op)

        return tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op)