How to use the smdebug.core.modes.ModeKeys.TRAIN enum value in smdebug

To help you get started, we've selected a few smdebug examples based on popular ways it's used in public projects.

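In smdebug, ModeKeys.TRAIN is an enum value that tags saved tensors and steps with the phase of the job, so they can be queried back per mode. A minimal sketch of the common pattern, assuming the PyTorch hook (the out_dir path is illustrative):

import smdebug.pytorch as smd
from smdebug.core.modes import ModeKeys

# Create a hook that writes debug data under out_dir (path is illustrative).
hook = smd.Hook(out_dir="/tmp/smdebug_demo")

# Steps recorded after this call are counted on the TRAIN step counter.
hook.set_mode(ModeKeys.TRAIN)
# ... run training iterations ...

# Switching modes starts a separate EVAL step counter.
hook.set_mode(ModeKeys.EVAL)
# ... run evaluation iterations ...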

From awslabs/sagemaker-debugger: tests/core/test_modes.py
import glob
import socket
from datetime import datetime

import numpy as np

from smdebug.core.modes import ModeKeys
from smdebug.core.writer import FileWriter


def test_mode_writing():
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    worker = socket.gethostname()
    for s in range(0, 10):
        fw = FileWriter(trial_dir="/tmp/ts_outputs/" + run_id, step=s, worker=worker)
        if s % 2 == 0:
            fw.write_tensor(
                tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
                tname="arr",
                mode=ModeKeys.TRAIN,
                mode_step=s // 2,
            )
        else:
            fw.write_tensor(
                tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
                tname="arr",
                mode=ModeKeys.EVAL,
                mode_step=s // 2,
            )
        fw.close()
    write_dummy_collection_file("/tmp/ts_outputs/" + run_id)
    files = glob.glob("/tmp/ts_outputs/" + run_id + "/**/*.tfevents", recursive=True)

    global_steps = []
    train_steps = []
    eval_steps = []
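To check what the loop above wrote, the trial API can read steps back per mode. A short sketch assuming the same run_id directory; the expected lists reflect the five TRAIN and five EVAL steps written above:

from smdebug.core.modes import ModeKeys
from smdebug.trials import create_trial

trial = create_trial("/tmp/ts_outputs/" + run_id)
print(trial.steps())                # global steps: [0, 1, ..., 9]
print(trial.steps(ModeKeys.TRAIN))  # TRAIN mode steps: [0, 1, 2, 3, 4]
print(trial.steps(ModeKeys.EVAL))   # EVAL mode steps: [0, 1, 2, 3, 4]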
From awslabs/sagemaker-debugger: tests/analysis/trials/test_has_passed_step_scenarios.py
    for i in range(0, 31, 10):
        dummy_step_creator(
            trial_dir=path, global_step=i + 40, mode="EVAL", mode_step=i, worker_name="worker_0"
        )

    trial = create_trial(path)
    num_workers = len(trial.workers())
    assert num_workers == 1
    assert trial.loaded_all_steps is True
    all_steps = trial.steps(show_incomplete_steps=True)
    completed_steps = trial.steps()
    assert all_steps == [0, 10, 20, 30, 40, 50, 60, 70]
    assert completed_steps == all_steps
    assert trial.has_passed_step(30) == StepState.AVAILABLE
    assert trial.has_passed_step(23, mode=ModeKeys.TRAIN) == StepState.UNAVAILABLE
    assert trial.has_passed_step(40, mode=ModeKeys.TRAIN) == StepState.UNAVAILABLE
    assert trial.has_passed_step(30, mode=ModeKeys.EVAL) == StepState.AVAILABLE
    assert trial.has_passed_step(23, mode=ModeKeys.EVAL) == StepState.UNAVAILABLE
    assert trial.has_passed_step(80) == StepState.UNAVAILABLE
    assert trial.has_passed_step(80, mode=ModeKeys.TRAIN) == StepState.UNAVAILABLE
    assert trial.has_passed_step(80, mode=ModeKeys.EVAL) == StepState.UNAVAILABLE
    assert trial.last_index_token == os.path.join(
        path, "index/000000000/000000000070_worker_0.json"
    )
    assert trial.last_complete_step == 70
    shutil.rmtree(path, ignore_errors=True)
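Because has_passed_step returns a StepState rather than a bool, it is suitable for polling a trial that is still being written. A hedged sketch; the StepState import path is assumed from smdebug's core module:

import time
from smdebug.core.modes import ModeKeys
from smdebug.core.tensor import StepState  # import path assumed

def wait_for_step(trial, step, mode=ModeKeys.GLOBAL, poll_secs=5):
    # NOT_YET_AVAILABLE means the step may still arrive; UNAVAILABLE means
    # the trial has already moved past it and it will never appear.
    while trial.has_passed_step(step, mode=mode) == StepState.NOT_YET_AVAILABLE:
        time.sleep(poll_secs)
    return trial.has_passed_step(step, mode=mode) == StepState.AVAILABLE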
From awslabs/sagemaker-debugger: tests/core/test_hook_save_scalar.py
def verify_files(out_dir, save_config, saved_scalars=None):
    """
    Analyze the tensors saved and verify that metrics are stored correctly in the
    SM metrics json file
    """

    # Retrieve save_step for verification in the trial and the JSON file
    save_config_train_steps = save_config.get_save_config(ModeKeys.TRAIN).save_steps
    if not save_config_train_steps:
        save_interval = save_config.get_save_config(ModeKeys.TRAIN).save_interval
        save_config_train_steps = [i for i in range(0, 10, save_interval)]
    save_config_eval_steps = save_config.get_save_config(ModeKeys.EVAL).save_steps
    if not save_config_eval_steps:
        save_interval = save_config.get_save_config(ModeKeys.EVAL).save_interval
        save_config_eval_steps = [i for i in range(0, 10, save_interval)]

    save_steps = {"TRAIN": save_config_train_steps, "EVAL": save_config_eval_steps}

    check_trials(out_dir, save_steps, saved_scalars)
    check_metrics_file(save_steps, saved_scalars)
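verify_files reads per-mode settings back out of a SaveConfig. Building one looks roughly like this, a sketch using smdebug's SaveConfig and SaveConfigMode with illustrative intervals and steps:

from smdebug.core.modes import ModeKeys
from smdebug.core.save_config import SaveConfig, SaveConfigMode

save_config = SaveConfig(
    mode_save_configs={
        ModeKeys.TRAIN: SaveConfigMode(save_interval=2),   # every 2nd TRAIN step
        ModeKeys.EVAL: SaveConfigMode(save_steps=[0, 5]),  # explicit EVAL steps
    }
)
assert save_config.get_save_config(ModeKeys.TRAIN).save_interval == 2
assert save_config.get_save_config(ModeKeys.EVAL).save_steps == [0, 5]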
From awslabs/sagemaker-debugger: tests/tensorflow/hooks/test_collection_defaults.py
    hook = SessionHook.create_from_json_file()

    # Check save_intervals for each mode
    assert hook.save_config.get_save_config(ModeKeys.TRAIN).save_interval == 2
    assert hook.save_config.get_save_config(ModeKeys.EVAL).save_interval == 3
    assert hook.save_config.get_save_config(ModeKeys.PREDICT).save_interval == 1
    assert hook.save_config.get_save_config(ModeKeys.GLOBAL).save_interval == 1
    # Check include_collections
    assert "weights" in hook.include_collections and "losses" in hook.include_collections

    assert len(hook.include_collections) == 4
    # Check collection configurations for losses
    assert (
        hook.collection_manager.collections["losses"]
        .save_config.get_save_config(ModeKeys.TRAIN)
        .save_interval
        == 2
    )
    assert (
        hook.collection_manager.collections["losses"]
        .save_config.get_save_config(ModeKeys.EVAL)
        .save_interval
        == 4
    )
    assert (
        hook.collection_manager.collections["losses"]
        .save_config.get_save_config(ModeKeys.PREDICT)
        .save_interval
        == 1
    )
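create_from_json_file() picks up the config path from an environment variable, so per-mode save intervals like the ones asserted above come from a JSON file on disk. A hedged sketch; the variable name follows smdebug's CONFIG_FILE_PATH_ENV_STR constant and the path is illustrative:

import os

# Point smdebug at a JSON hook configuration, then build the hook from it.
os.environ["SMDEBUG_CONFIG_FILE_PATH"] = "/path/to/hook_config.json"
hook = SessionHook.create_from_json_file()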
From awslabs/sagemaker-debugger: tests/tensorflow/hooks/test_mirrored_strategy.py
            CollectionKeys.GRADIENTS,
            CollectionKeys.LOSSES,
        ],
        eval_distributed=False,
        zcc=zcc,
    )
    if skip_trial_check():
        return

    tr = create_trial_fast_refresh(out_dir)
    # wts, grads, losses
    assert (
        len(tr.tensor_names()) == 8 + 8 + (1 * strategy.num_replicas_in_sync) + 1
    )  # 1 main loss, and 1 from each worker
    assert len(tr.steps()) == 7
    assert len(tr.steps(ModeKeys.TRAIN)) == 3
    assert len(tr.steps(ModeKeys.EVAL)) == 2
    assert len(tr.steps(ModeKeys.PREDICT)) == 2

    assert "dense_1/kernel:0" in tr.tensor_names(collection="weights")
    for tname in tr.tensor_names(collection="weights"):
        for s in tr.tensor(tname).steps(ModeKeys.TRAIN):
            assert len(tr.tensor(tname).workers(s, ModeKeys.TRAIN)) == strategy.num_replicas_in_sync
            for worker in tr.tensor(tname).workers(s, ModeKeys.TRAIN):
                assert tr.tensor(tname).value(s, worker=worker, mode=ModeKeys.TRAIN) is not None
        for s in tr.tensor(tname).steps(ModeKeys.EVAL):
            assert len(tr.tensor(tname).workers(s, ModeKeys.EVAL)) == 1  # as eval_dist = False
            assert tr.tensor(tname).value(s, mode=ModeKeys.EVAL) is not None

    tensornames = tr.tensor_names(regex=r"Identity_\d+:0")
    for s in tr.tensor(tensornames[0]).steps(ModeKeys.TRAIN):
        for w in tr.tensor(tensornames[0]).workers(s, ModeKeys.TRAIN):
            assert tr.tensor(tensornames[0]).value(s, worker=w, mode=ModeKeys.TRAIN) is not None
From awslabs/sagemaker-debugger: tests/core/test_hook_save_scalar.py
            x = self.fc2(x)
            return F.log_softmax(x, dim=1)

    model = Net().to(torch.device("cpu"))
    criterion = nn.NLLLoss()
    hook.register_module(model)
    if register_loss:
        hook.register_loss(criterion)

    hook.save_scalar("pt_num_steps", steps, sm_metric=True)

    model.train()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    hook.save_scalar("pt_before_train", 1, sm_metric=False)
    hook.set_mode(ModeKeys.TRAIN)
    for i in range(steps):
        batch_size = 32
        data, target = torch.rand(batch_size, 1, 28, 28), torch.rand(batch_size).long()
        data, target = data.to(torch.device("cpu")), target.to(torch.device("cpu"))
        optimizer.zero_grad()
        output = model(Variable(data, requires_grad=True))
        if register_loss:
            loss = criterion(output, target)
        else:
            loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
    hook.save_scalar("pt_after_train", 1, sm_metric=False)

    model.eval()
    hook.set_mode(ModeKeys.EVAL)
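After switching to ModeKeys.EVAL, subsequent steps are counted on a separate EVAL counter. A sketch of how the evaluation loop typically continues; shapes match the training loop above and the scalar name is illustrative:

with torch.no_grad():
    for i in range(steps):
        data = torch.rand(32, 1, 28, 28).to(torch.device("cpu"))
        output = model(data)
hook.save_scalar("pt_after_eval", 1, sm_metric=False)  # illustrative name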
From awslabs/sagemaker-debugger: smdebug/core/modes.py
# Standard Library
from enum import Enum


# Note that Keras has a similar concept of ModeKeys
class ModeKeys(Enum):
    TRAIN = 1  # training/fitting mode
    EVAL = 2  # testing/evaluation mode
    PREDICT = 3  # prediction/inference mode
    GLOBAL = 4  # default mode for steps not tied to a specific mode


ALLOWED_MODES = [ModeKeys.TRAIN, ModeKeys.EVAL, ModeKeys.PREDICT, ModeKeys.GLOBAL]
ALLOWED_MODE_NAMES = [x.name for x in ALLOWED_MODES]
MODE_STEP_PLUGIN_NAME = "mode_step"
MODE_PLUGIN_NAME = "mode"
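Since ModeKeys is a standard Python Enum, members can be recovered from the names and values that smdebug serializes into event files and index entries:

from smdebug.core.modes import ModeKeys, ALLOWED_MODE_NAMES

assert ModeKeys["TRAIN"] is ModeKeys.TRAIN  # lookup by serialized name
assert ModeKeys(1) is ModeKeys.TRAIN        # lookup by serialized value
assert "TRAIN" in ALLOWED_MODE_NAMES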
From awslabs/sagemaker-debugger: smdebug/tensorflow/base_hook.py
"""self.device_map is a mapping between a tf device string to a serialized (filename-friendly) device string
                Example -> /job:worker/replica:0/task:1/device:GPU:0 : _job-worker_replica-0_task-1_device-GPU-0"""
        self.device_map = {}
        self.writer_map = {}
        # This will be None if the var wasn't set, i.e. not param server
        self.tf_config_json = load_tf_config_json(os.getenv("TF_CONFIG"))
        self._hook_supported = None
        self._exported_collections = False
        self._distribution_strategy = {
            ModeKeys.TRAIN: None,
            ModeKeys.EVAL: None,
            ModeKeys.PREDICT: None,
            ModeKeys.GLOBAL: None,
        }
        self._prepared_tensors = {
            ModeKeys.TRAIN: False,
            ModeKeys.EVAL: False,
            ModeKeys.PREDICT: False,
            ModeKeys.GLOBAL: False,
        }
        self._exported_model = {
            ModeKeys.TRAIN: False,
            ModeKeys.EVAL: False,
            ModeKeys.PREDICT: False,
            ModeKeys.GLOBAL: False,
        }
        set_hook(self)
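The per-mode bookkeeping above is plain dicts keyed by ModeKeys; an equivalent, more compact initialization using the module's ALLOWED_MODES list:

from smdebug.core.modes import ALLOWED_MODES

# One flag per mode, covering TRAIN, EVAL, PREDICT and GLOBAL.
prepared_tensors = {mode: False for mode in ALLOWED_MODES}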