How to use the smdebug.pytorch.Hook class in smdebug

To help you get started, we’ve selected a few smdebug examples based on popular ways it is used in public projects.

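Before the project snippets, here is a minimal sketch of the pattern they all share: create a Hook pointing at an output directory, optionally with a SaveConfig schedule, then register the model (and loss module) so tensors are recorded during training. The model and paths here are illustrative placeholders, not code from the projects below.

import torch.nn as nn
import smdebug.pytorch as smd

# Placeholder model and loss; any nn.Module works the same way.
net = nn.Linear(4, 2)
criterion = nn.CrossEntropyLoss()

hook = smd.Hook(
    out_dir="/tmp/smdebug_demo",                   # where tensors are written
    save_config=smd.SaveConfig(save_interval=10),  # save every 10th step
)
hook.register_module(net)      # record weights, biases and gradients
hook.register_loss(criterion)  # record the loss value as well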

Example from awslabs/sagemaker-debugger: tests/zero_code_change/pytorch_integration_tests.py

def test_pytorch(script_mode: bool = False, use_loss_module=False):
    smd.del_hook()

    sim_class = ScriptSimulator if script_mode else SagemakerSimulator
    with sim_class() as sim:
        trainloader, testloader = get_dataloaders()
        net = Net()
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

        if script_mode:
            hook = smd.Hook(out_dir=sim.out_dir)
            hook.register_module(net)
            hook.register_loss(criterion)

        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            if use_loss_module:
                loss = criterion(outputs, labels)
            else:
                loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()

Example from awslabs/sagemaker-debugger: tests/pytorch/test_distributed_training.py

def run(rank, size, include_workers="one", num_epochs=10, batch_size=128, num_batches=10):
    """Distributed function to be implemented later."""
    torch.manual_seed(1234)
    device = torch.device("cpu")
    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=1)

    shutil.rmtree(out_dir, ignore_errors=True)

    hook = smd.Hook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_steps=[0, 1, 5]),
        save_all=True,
        include_workers=include_workers,
    )

    hook.register_module(model)

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for _ in range(num_batches):
            optimizer.zero_grad()
            data, target = dataset(batch_size)
            output = model(data)
            loss = F.mse_loss(output, target)
            epoch_loss += loss.item()
            loss.backward()
            optimizer.step()
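
Once a hook like this one has written data, the saved tensors can be read back with smdebug's trial API. A minimal sketch, assuming the same out_dir the hook wrote to:

from smdebug.trials import create_trial

trial = create_trial(out_dir)    # out_dir as passed to the hook above
print(trial.steps())             # steps saved, e.g. [0, 1, 5] per the SaveConfig
print(trial.tensor_names())      # save_all=True captures weights, biases, gradients, ...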

Example from awslabs/sagemaker-debugger: tests/core/test_hook_save_scalar.py

def helper_pytorch_tests(collection, register_loss, save_config):
    coll_name, coll_regex = collection

    run_id = "trial_" + coll_name + "-" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    trial_dir = os.path.join(SMDEBUG_PT_HOOK_TESTS_DIR, run_id)

    hook = PT_Hook(
        out_dir=trial_dir,
        include_collections=[coll_name],
        save_config=save_config,
        export_tensorboard=True,
    )

    simple_pt_model(hook, register_loss=register_loss)
    hook.close()

    saved_scalars = ["scalar/pt_num_steps", "scalar/pt_before_train", "scalar/pt_after_train"]
    verify_files(trial_dir, save_config, saved_scalars)
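
simple_pt_model is not shown in this excerpt; judging from the scalar names it verifies, it presumably logs custom scalars through the hook. A hedged sketch of those calls, using smdebug's save_scalar API:

# Hypothetical calls inside simple_pt_model; saved scalars surface
# under the "scalar/" prefix checked by verify_files above.
hook.save_scalar("pt_before_train", 1, sm_metric=True)
hook.save_scalar("pt_num_steps", 100)
hook.save_scalar("pt_after_train", 1, sm_metric=True)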

Example from awslabs/sagemaker-debugger: tests/core/test_paths.py

def test_tensorboard_dir_script_specify_tensorboard_dir():
    """ In script mode, passing `export_tensorboard` and `tensorboard_dir` works. """
    with ScriptSimulator(tensorboard_dir="/tmp/tensorboard_dir") as sim:
        hook = smd.Hook(
            out_dir=sim.out_dir, export_tensorboard=True, tensorboard_dir=sim.tensorboard_dir
        )
        assert hook.tensorboard_dir == sim.tensorboard_dir

Example from awslabs/sagemaker-debugger: examples/pytorch/scripts/torch_resnet.py

def create_hook(output_dir, module, trial_id="trial-resnet", save_interval=100):
    # Save tensors every `save_interval` steps while training.
    save_config = SaveConfig(save_interval=save_interval)

    # Create a hook that logs the weights, biases and gradients of the model
    # while training, using the save schedule defined above.
    hook = Hook(out_dir=output_dir, save_config=save_config)
    return hook
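
The save schedule above can be expressed either as a fixed interval or as an explicit list of steps; both keyword forms, for reference:

from smdebug.pytorch import SaveConfig

every_100_steps = SaveConfig(save_interval=100)    # every 100th step
explicit_steps = SaveConfig(save_steps=[0, 1, 5])  # only these steps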

Example from awslabs/sagemaker-debugger: examples/pytorch/scripts/simple.py

def create_hook(output_dir, module=None, hook_type="saveall", save_steps=None):
    # Create a hook that logs weights, biases, gradients and inputs/outputs of the model
    if hook_type == "saveall":
        hook = Hook(
            out_dir=output_dir,
            save_config=SaveConfig(save_steps=save_steps),
            save_all=True,
            export_tensorboard=True,
        )
    elif hook_type == "module-input-output":
        # The names of the input and output tensors of a module have the format
        # Inputs:  _input_, and
        # Output:  _output
        # To log the inputs and output of a module, create a collection as follows:
        assert module is not None

        # Create a hook that logs weights, biases, gradients and inputs/outputs of the model
        hook = Hook(
            out_dir=output_dir,
            save_config=SaveConfig(save_steps=save_steps),
            include_collections=["weights", "gradients", "biases", "l_mod"],
        )
        hook.get_collection("l_mod").add_module_tensors(module, inputs=True, outputs=True)
    return hook

Example from awslabs/sagemaker-debugger: examples/pytorch/scripts/pytorch_hook_demos.py

def create_hook(output_dir, module=None, hook_type="saveall"):
    # Create a hook that logs weights, biases, gradients and inputs/outputs of the
    # model every 10 steps while training.
    if hook_type == "saveall":
        hook = Hook(
            out_dir=output_dir,
            save_config=SaveConfig(save_steps=[i * 10 for i in range(20)]),
            save_all=True,
        )
    elif hook_type == "module-input-output":
        # The names of the input and output tensors of a module have the format
        # Inputs:  _input_, and
        # Output:  _output
        # To log the inputs and output of a module, create a collection as follows:
        assert module is not None

        # Create a hook that logs weights, biases, gradients and inputs/outputs of model every 5 steps from steps 0-100 while training.
        hook = Hook(
            out_dir=output_dir,
            save_config=SaveConfig(save_steps=[i * 5 for i in range(20)]),
            include_collections=["weights", "gradients", "biases", "l_mod"],
        )
        hook.get_collection("l_mod").add_module_tensors(module, inputs=True, outputs=True)
    elif hook_type == "weights-bias-gradients":
        save_config = SaveConfig(save_steps=[i * 5 for i in range(20)])
        # Create a hook that logs ONLY weights, biases, and gradients every 5 steps (from steps 0-100) while training the model.
        hook = Hook(out_dir=output_dir, save_config=save_config)
    return hook
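
A short sketch of wiring a hook returned by this helper into training, assuming net and criterion are defined as in the first example; the output path is a placeholder:

# Hypothetical usage of create_hook from the snippet above.
hook = create_hook("/tmp/pt_demo_out", module=net, hook_type="module-input-output")
hook.register_module(net)
hook.register_loss(criterion)
# ...then run the usual zero_grad / forward / backward / step loop;
# tensors are saved at the steps set in the hook's SaveConfig.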