How to use smdebug - 10 common examples

To help you get started, we’ve selected ten smdebug examples, based on popular ways it is used in public projects.

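Most of the examples below follow the same two-phase pattern: register a hook while training so that tensors are written to an output directory, then open that directory as a trial and query it. As a minimal sketch of the pattern (the output path and collection names are illustrative, not taken from the examples below):

import smdebug.tensorflow as smd
from smdebug.trials import create_trial

# capture phase: register a hook that saves selected collections every 100 steps
hook = smd.SessionHook(
    out_dir="/tmp/smdebug_demo",  # illustrative output path
    include_collections=["weights", "losses"],
    save_config=smd.SaveConfig(save_interval=100),
)
hook.set_mode(smd.modes.TRAIN)
# ... pass `hook` to your estimator or Keras callbacks, as in the examples below ...

# analysis phase: open the same directory as a trial and inspect what was saved
trial = create_trial("/tmp/smdebug_demo")
print(trial.steps())         # steps at which tensors were saved
print(trial.tensor_names())  # names of the saved tensors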

github awslabs / sagemaker-debugger / tests / core / test_modes.py
from datetime import datetime
import glob
import socket

import numpy as np

from smdebug.core.modes import ModeKeys
from smdebug.core.writer import FileWriter
# write_dummy_collection_file is a helper defined in the repository's test utilities


def test_mode_writing():
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    worker = socket.gethostname()
    for s in range(0, 10):
        fw = FileWriter(trial_dir="/tmp/ts_outputs/" + run_id, step=s, worker=worker)
        if s % 2 == 0:
            fw.write_tensor(
                tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
                tname="arr",
                mode=ModeKeys.TRAIN,
                mode_step=s // 2,
            )
        else:
            fw.write_tensor(
                tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
                tname="arr",
                mode=ModeKeys.EVAL,
                mode_step=s // 2,
            )
        fw.close()
    write_dummy_collection_file("/tmp/ts_outputs/" + run_id)
    files = glob.glob("/tmp/ts_outputs/" + run_id + "/**/*.tfevents", recursive=True)

    global_steps = []
    train_steps = []
    eval_steps = []
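    # the full test then walks `files`, collecting the global, TRAIN, and EVAL
    # step numbers into these lists before asserting on them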
github awslabs / sagemaker-debugger / tests / analysis / trials / test_modes.py
from datetime import datetime
import socket

import numpy as np

from smdebug.core.collection_manager import CollectionManager
from smdebug.core.config_constants import DEFAULT_COLLECTIONS_FILE_NAME
from smdebug.core.modes import ModeKeys as modes  # provides modes.TRAIN / modes.EVAL
from smdebug.core.writer import FileWriter
from smdebug.trials import create_trial


def test_mode_data():
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    trial_dir = "/tmp/ts_outputs/" + run_id

    c = CollectionManager()
    c.add("default")
    c.get("default").tensor_names = ["arr"]
    c.export(trial_dir, DEFAULT_COLLECTIONS_FILE_NAME)
    tr = create_trial(trial_dir)
    worker = socket.gethostname()
    for s in range(0, 10):
        fw = FileWriter(trial_dir=trial_dir, step=s, worker=worker)
        if s % 2 == 0:
            fw.write_tensor(
                tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
                tname="arr",
                mode=modes.TRAIN,
                mode_step=s // 2,
            )
        else:
            fw.write_tensor(
                tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
                tname="arr",
                mode=modes.EVAL,  # odd steps are recorded under EVAL, mirroring the TRAIN branch
                mode_step=s // 2,
            )
        fw.close()
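The full test goes on to assert on the per-mode view of what was written; a minimal sketch of those queries against the trial `tr` created above, using the mode-aware trial API:

# global steps versus the step numbering within each mode
print(tr.steps())                  # [0, 1, ..., 9]
print(tr.steps(mode=modes.TRAIN))  # [0, 1, 2, 3, 4]
print(tr.steps(mode=modes.EVAL))   # [0, 1, 2, 3, 4]
print(tr.tensor("arr").value(2, mode=modes.TRAIN))  # value saved at TRAIN step 2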
github awslabs / sagemaker-debugger / examples / tensorflow / scripts / distributed_training / horovod_mnist_estimator.py
)

    # Set up the smdebug hook ("Tornasole" was smdebug's pre-release name)

    # save tensors as reductions if necessary
    rdnc = (
        smd.ReductionConfig(reductions=["mean"], abs_reductions=["max"], norms=["l1"])
        if FLAGS.reductions
        else None
    )

    ts_hook = smd.SessionHook(
        out_dir=FLAGS.smdebug_path,
        save_all=FLAGS.save_all,
        include_collections=["weights", "gradients", "losses", "biases"],
        save_config=smd.SaveConfig(save_interval=FLAGS.save_frequency),
        reduction_config=rdnc,
    )

    ts_hook.set_mode(smd.modes.TRAIN)

    # Horovod: adjust number of steps based on number of GPUs.
    mnist_classifier.train(
        input_fn=train_input_fn,
        steps=FLAGS.steps // hvd.size(),
        hooks=[logging_hook, bcast_hook, ts_hook],
    )

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data}, y=eval_labels, num_epochs=1, shuffle=False
    )
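In the full script the same hook is then switched to EVAL mode before evaluation, so evaluation steps are recorded separately from training; a sketch of that follow-up, reusing the names above:

ts_hook.set_mode(smd.modes.EVAL)
mnist_classifier.evaluate(input_fn=eval_input_fn, hooks=[ts_hook])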
github awslabs / sagemaker-debugger / tests / core / test_hook_save_scalar.py
    opt = hook.wrap_optimizer(opt)

    model.compile(
        optimizer=opt,
        loss="sparse_categorical_crossentropy",
        run_eagerly=False,
        metrics=["accuracy"],
    )
    hooks = [hook]
    hook.save_scalar("tf_keras_num_steps", steps, sm_metric=True)

    hook.save_scalar("tf_keras_before_train", 1, sm_metric=False)
    hook.set_mode(ModeKeys.TRAIN)
    model.fit(x_train, y_train, epochs=1, steps_per_epoch=steps, callbacks=hooks, verbose=0)

    hook.set_mode(ModeKeys.EVAL)
    model.evaluate(x_test, y_test, steps=10, callbacks=hooks, verbose=0)
    hook.save_scalar("tf_keras_after_train", 1, sm_metric=False)
github awslabs / sagemaker-debugger / tests / core / test_hook_save_scalar.py
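This variant from the same test file runs the scalar workflow with a native tf.train.RMSPropOptimizer wrapped by the hook: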
)

    opt = tf.train.RMSPropOptimizer(lr)
    opt = hook.wrap_optimizer(opt)

    model.compile(
        optimizer=opt,
        loss="sparse_categorical_crossentropy",
        run_eagerly=False,
        metrics=["accuracy"],
    )
    hooks = [hook]
    hook.save_scalar("tf_keras_num_steps", steps, sm_metric=True)

    hook.save_scalar("tf_keras_before_train", 1, sm_metric=False)
    hook.set_mode(ModeKeys.TRAIN)
    model.fit(x_train, y_train, epochs=1, steps_per_epoch=steps, callbacks=hooks, verbose=0)

    hook.set_mode(ModeKeys.EVAL)
    model.evaluate(x_test, y_test, steps=10, callbacks=hooks, verbose=0)
    hook.save_scalar("tf_keras_after_train", 1, sm_metric=False)
github awslabs / sagemaker-debugger / tests / zero_code_change / tensorflow_integration_tests.py
            model.compile(
                loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"]
            )
            history = model.fit(
                x_train, y_train, batch_size=16, epochs=5, validation_split=0.2, callbacks=[hook]
            )
            test_scores = model.evaluate(x_test, y_test, verbose=2, callbacks=[hook])
        else:
            model.compile(
                loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"]
            )
            history = model.fit(x_train, y_train, batch_size=16, epochs=5, validation_split=0.2)
            test_scores = model.evaluate(x_test, y_test, verbose=2)

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        assert len(trial.tensor_names(collection="gradients")) > 0
        if not tf_optimizer:
            # as this is only supported for keras optimizers currently
            assert len(trial.tensor_names(collection="optimizer_variables")) > 0
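Beyond the count assertions, an actual gradient value can be pulled out of such a trial; a short sketch, assuming at least one gradient tensor was saved:

grad_name = trial.tensor_names(collection="gradients")[0]
print(trial.tensor(grad_name).value(0))  # numpy array of that gradient at step 0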
github awslabs / sagemaker-debugger / tests / zero_code_change / tensorflow_integration_tests.py
loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"]
            )
            history = model.fit(
                x_train, y_train, batch_size=16, epochs=5, validation_split=0.2, callbacks=[hook]
            )
            test_scores = model.evaluate(x_test, y_test, verbose=2, callbacks=[hook])
        else:
            model.compile(
                loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"]
            )
            history = model.fit(x_train, y_train, batch_size=16, epochs=5, validation_split=0.2)
            test_scores = model.evaluate(x_test, y_test, verbose=2)

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        assert len(trial.tensor_names(collection="gradients")) > 0
        if not tf_optimizer:
            # as this is only supported for keras optimizers currently
            assert len(trial.tensor_names(collection="optimizer_variables")) > 0
github awslabs / sagemaker-debugger / tests / core / test_index_reader.py
from smdebug.trials import create_trial


def test_fetch_tensor_with_present_event_files():
    """
    event files present: [0, 18, 27, 36, ..., 190]
    index files present: [0, 9, 18, 27, 36, ..., 190, 199]
    end_of_job file: present
    """
    path = "s3://smdebug-testing/resources/event-files-missing"

    trial = create_trial(path)
    # Get value from an event file that is present
    trial.tensor("gradients/pow_grad/sub:0").value(0)
github awslabs / sagemaker-debugger / tests / zero_code_change / pytorch_integration_tests.py
                if script_mode:
                    hook.record_tensor_value(tensor_name="loss", tensor_value=loss)
            loss.backward()
            optimizer.step()

            if i == 499:  # stop after 500 mini-batches
                break

        print("Finished Training")

        hook = smd.get_hook()
        print(f"hook = {hook}")

        from smdebug.trials import create_trial

        trial = create_trial(path=sim.out_dir)
        print(f"trial.steps() = {trial.steps()}")
        print(f"trial.tensor_names() = {trial.tensor_names()}")

        print(f"collection_manager = {hook.collection_manager}")

        losses_tensors = hook.collection_manager.get("losses").tensor_names
        print(f"'losses' collection tensor_names = {losses_tensors}")
        assert len(losses_tensors) > 0

        assert all(
            [
                name in trial.tensor_names()
                for name in hook.collection_manager.get("losses").tensor_names
            ]
        )
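A sketch of reading one of those recorded losses back out of the trial, assuming at least one step was saved:

loss_name = losses_tensors[0]
print(trial.tensor(loss_name).value(trial.steps()[0]))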
github awslabs / sagemaker-debugger / tests / mxnet / test_hook_save_all.py
from datetime import datetime
import shutil

from smdebug.mxnet import Hook as t_hook
from smdebug.mxnet import SaveConfig
from smdebug.trials import create_trial
# run_mnist_gluon_model is a helper defined alongside this test in the repository


def test_save_all(hook=None, out_dir=None):
    hook_created = False
    if hook is None:
        hook_created = True
        save_config = SaveConfig(save_steps=[0, 1, 2, 3])
        run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        out_dir = "/tmp/" + run_id
        print("Registering the hook with out_dir {}".format(out_dir))
        hook = t_hook(out_dir=out_dir, save_config=save_config, save_all=True)
    run_mnist_gluon_model(hook=hook, num_steps_train=7, num_steps_eval=5)
    # assert for steps and tensor_names
    print("Created the trial with out_dir {}".format(out_dir))
    tr = create_trial(out_dir)
    tensor_list = tr.tensor_names()
    assert tr
    assert len(tr.steps()) == 4
    # some tensor names (e.g. layer inputs and outputs) cannot be enumerated from the
    # training session, so the test asserts on the tensor count instead; 46 is the
    # count recorded in the index file, so if nothing fails, all tensors were saved
    assert len(tensor_list) == 46
    if hook_created:
        shutil.rmtree(out_dir)
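As with the earlier trials, individual tensors from a save-all run can then be inspected by name; a one-line sketch using the objects above (run it before the rmtree cleanup):

print(tr.tensor(tensor_list[0]).value(0))  # first saved tensor at step 0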