How to use the smdebug.trials.create_trial function in smdebug

To help you get started, we’ve selected a few smdebug examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github awslabs / sagemaker-debugger / tests / core / test_index_reader.py View on Github external
def test_fetch_tensor_with_present_event_files():
    """
        events files present: [0, 18, 27, 36, ...., 190]
        index files present: [0, 9, 18, 27, 36, ...., 190, 199]

        end_of_job file : present

    """
    path = "s3://smdebug-testing/resources/event-files-missing"

    trial = create_trial(path)
    # Get value from an event file that is present
    trial.tensor("gradients/pow_grad/sub:0").value(0)
github awslabs / sagemaker-debugger / tests / zero_code_change / pytorch_integration_tests.py View on Github external
if script_mode:
                    hook.record_tensor_value(tensor_name="loss", tensor_value=loss)
            loss.backward()
            optimizer.step()

            if i == 499:  # print every 2000 mini-batches
                break

        print("Finished Training")

        hook = smd.get_hook()
        print(f"hook = {hook}")

        from smdebug.trials import create_trial

        trial = create_trial(path=sim.out_dir)
        print(f"trial.steps() = {trial.steps()}")
        print(f"trial.tensor_names() = {trial.tensor_names()}")

        print(f"collection_manager = {hook.collection_manager}")

        losses_tensors = hook.collection_manager.get("losses").tensor_names
        print(f"'losses' collection tensor_names = {losses_tensors}")
        assert len(losses_tensors) > 0

        assert all(
            [
                name in trial.tensor_names()
                for name in hook.collection_manager.get("losses").tensor_names
            ]
github awslabs / sagemaker-debugger / tests / mxnet / test_hook_save_all.py View on Github external
def test_save_all(hook=None, out_dir=None):
    hook_created = False
    if hook is None:
        hook_created = True
        save_config = SaveConfig(save_steps=[0, 1, 2, 3])
        run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        out_dir = "/tmp/" + run_id
        print("Registering the hook with out_dir {}".format(out_dir))
        hook = t_hook(out_dir=out_dir, save_config=save_config, save_all=True)
    run_mnist_gluon_model(hook=hook, num_steps_train=7, num_steps_eval=5)
    # assert for steps and tensor_names
    print("Created the trial with out_dir {}".format(out_dir))
    tr = create_trial(out_dir)
    tensor_list = tr.tensor_names()
    assert tr
    assert len(tr.steps()) == 4
    # some tensor names, like input and output, can't be retrieved from training session, so here we only assert for tensor numbers
    # 46 is gotten from index file
    # if no assertion failure, then the script could save all tensors
    assert len(tensor_list) == 46
    if hook_created:
        shutil.rmtree(out_dir)
github awslabs / sagemaker-debugger / tests / analysis / trials / test_refresh.py View on Github external
def help_test_no_refresh(path):
    trial_name = str(uuid.uuid4())
    num_steps = 8
    num_tensors = 10

    for i in range(num_steps):
        generate_data(
            path=path,
            trial=trial_name,
            num_tensors=num_tensors,
            step=i,
            tname_prefix="foo",
            worker="algo-1",
            shape=(3, 3, 3),
        )
    tr = create_trial(path + trial_name)

    assert "foo_" + str(num_tensors + 1) not in tr.tensor_names()
    assert "foo_1" in tr.tensor_names()
    assert len(tr.steps()) == num_steps
    assert len(tr.tensor("foo_1").steps()) == num_steps

    for i in range(num_steps, num_steps * 2):
        generate_data(
            path=path,
            trial=trial_name,
            num_tensors=num_tensors,
            step=i,
            tname_prefix="foo",
            worker="algo-1",
            shape=(3, 3, 3),
            export_colls=False,
github awslabs / sagemaker-debugger / tests / pytorch / test_collection.py View on Github external
hook_created = False
    if hook is None:
        run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        out_dir = "/tmp/" + run_id
        hook = t_hook(
            out_dir=out_dir,
            save_config=SaveConfig(save_steps=[0, 1, 2, 3]),
            include_collections=["relu_activations"],
        )
        hook_created = True

    model = Net().to(torch.device("cpu"))
    hook.register_module(model)
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    train(model, hook, torch.device("cpu"), optimizer, num_steps=10)
    tr = create_trial(out_dir)
    assert tr
    assert len(tr.tensor_names(collection="relu_activations")) > 0
    assert tr.tensor(tr.tensor_names(collection="relu_activations")[0]).value(0) is not None

    if hook_created:
        shutil.rmtree(out_dir)
github awslabs / sagemaker-debugger / tests / analysis / rules / test_invoker.py View on Github external
def test_invoker_exception():
    path = dump_data()
    tr = create_trial(path)
    r = ExplodingTensor(tr, collection_names="gradients,weights")

    c = 0
    for start_step in range(2):
        try:
            invoke_rule(r, start_step=start_step, end_step=3, raise_eval_cond=True)
        except RuleEvaluationConditionMet as e:
            c += 1
    assert c == 2
    shutil.rmtree(path)
github awslabs / sagemaker-debugger / tests / tensorflow / hooks / test_reductions.py View on Github external
def helper_test_reductions(trial_dir, hook, save_raw_tensor):
    simple_model(hook)
    _, files = get_dirs_files(trial_dir)
    from smdebug.trials import create_trial

    tr = create_trial(trial_dir)
    assert len(tr.tensors()) == 3, tr.tensors()
    for tname in tr.tensors():
        t = tr.tensor(tname)
        if tname in tr.tensors(collection="losses"):
            # no reductions
            assert t.value(0) is not None
        else:
            if save_raw_tensor is True:
                assert t.value(0) is not None
            else:
                try:
                    print(t.value(0))
                    assert False, (tname, e)
                except TensorUnavailableForStep as e:
                    pass
            assert len(t.reduction_values(0)) == 18
github awslabs / sagemaker-debugger / tests / tensorflow / hooks / test_estimator_modes.py View on Github external
def helper_test_mnist_trial(trial_dir):
    tr = create_trial(trial_dir)
    assert len(tr.steps()) == 3
    assert len(tr.steps(mode=smd.modes.TRAIN)) == 2
    assert len(tr.steps(mode=smd.modes.EVAL)) == 1
    assert len(tr.tensor_names()) == 13
    on_s3, bucket, prefix = is_s3(trial_dir)
    if not on_s3:
        shutil.rmtree(trial_dir, ignore_errors=True)
    else:
        delete_s3_prefix(bucket, prefix)
github awslabs / sagemaker-debugger / tests / tensorflow / hooks / test_session.py View on Github external
w0 = [[1], [1.0]]
            y = tf.matmul(x, w0)
        loss = tf.reduce_mean((tf.matmul(x, w) - y) ** 2, name="loss")
        hook.get_collection("losses").add(loss)
        global_step = tf.Variable(17, name="global_step", trainable=False)
        increment_global_step_op = tf.assign(global_step, global_step + 1)

        optimizer = tf.train.AdamOptimizer(0.1)
        optimizer = hook.wrap_optimizer(optimizer)
        optimizer_op = optimizer.minimize(loss, global_step=increment_global_step_op)
        sess = tf.train.MonitoredSession(hooks=[hook])
        for i in range(5):
            x_ = np.random.random((10, 2)) * 0.1
            sess.run([loss, optimizer_op, increment_global_step_op], {x: x_})
        sess.close()
        tr = create_trial(out_dir)
        assert len(tr.tensor_names())
github aws / sagemaker-xgboost-container / test / integration / local / test_debug_hook.py View on Github external
def test_smdebug_script_mode_single_machine(docker_image, opt_ml):

    customer_script = "xgboost_abalone_basic_hook_demo.py"
    hyperparameters = get_abalone_hyperparameters()

    local_mode.train(customer_script, data_dir, docker_image, opt_ml,
                     hyperparameters=hyperparameters, source_dir=source_dir)

    assert not local_mode.file_exists(opt_ml, 'output/failure'), 'Failure happened'

    tensors_dir = os.path.join(opt_ml, 'algo-1', 'output', 'tensors')
    trial = create_trial(tensors_dir)
    assert trial.tensor_names() == ["train-rmse", "validation-rmse"]
    assert trial.steps() == list(range(0, 20))