def test_mode_writing():
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    worker = socket.gethostname()
    # Write one event file per step, alternating TRAIN (even steps) and EVAL (odd steps)
    for s in range(0, 10):
        fw = FileWriter(trial_dir="/tmp/ts_outputs/" + run_id, step=s, worker=worker)
        if s % 2 == 0:
            fw.write_tensor(
                tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
                tname="arr",
                mode=ModeKeys.TRAIN,
                mode_step=s // 2,
            )
        else:
            fw.write_tensor(
                tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
                tname="arr",
                mode=ModeKeys.EVAL,
                mode_step=s // 2,
            )
        fw.close()
    write_dummy_collection_file("/tmp/ts_outputs/" + run_id)
    files = glob.glob("/tmp/ts_outputs/" + run_id + "/**/*.tfevents", recursive=True)

    global_steps = []
    train_steps = []
    eval_steps = []
for i in range(0, 31, 10):
    dummy_step_creator(
        trial_dir=path, global_step=i + 40, mode="EVAL", mode_step=i, worker_name="worker_0"
    )
trial = create_trial(path)
num_workers = len(trial.workers())
assert num_workers == 1
assert trial.loaded_all_steps is True
all_steps = trial.steps(show_incomplete_steps=True)
completed_steps = trial.steps()
assert all_steps == [0, 10, 20, 30, 40, 50, 60, 70]
assert completed_steps == all_steps
assert trial.has_passed_step(30) == StepState.AVAILABLE
assert trial.has_passed_step(23, mode=ModeKeys.TRAIN) == StepState.UNAVAILABLE
assert trial.has_passed_step(40, mode=ModeKeys.TRAIN) == StepState.UNAVAILABLE
assert trial.has_passed_step(30, mode=ModeKeys.EVAL) == StepState.AVAILABLE
assert trial.has_passed_step(23, mode=ModeKeys.EVAL) == StepState.UNAVAILABLE
assert trial.has_passed_step(80) == StepState.UNAVAILABLE
assert trial.has_passed_step(80, mode=ModeKeys.TRAIN) == StepState.UNAVAILABLE
assert trial.has_passed_step(80, mode=ModeKeys.EVAL) == StepState.UNAVAILABLE
assert trial.last_index_token == os.path.join(
    path, "index/000000000/000000000070_worker_0.json"
)
assert trial.last_complete_step == 70
shutil.rmtree(path, ignore_errors=True)
def verify_files(out_dir, save_config, saved_scalars=None):
    """
    Analyze the saved tensors and verify that the metrics are stored correctly in
    the SageMaker metrics JSON file.
    """
    # Retrieve the save steps for verification in the trial and the JSON file
    save_config_train_steps = save_config.get_save_config(ModeKeys.TRAIN).save_steps
    if not save_config_train_steps:
        save_interval = save_config.get_save_config(ModeKeys.TRAIN).save_interval
        save_config_train_steps = [i for i in range(0, 10, save_interval)]
    save_config_eval_steps = save_config.get_save_config(ModeKeys.EVAL).save_steps
    if not save_config_eval_steps:
        save_interval = save_config.get_save_config(ModeKeys.EVAL).save_interval
        save_config_eval_steps = [i for i in range(0, 10, save_interval)]

    save_steps = {"TRAIN": save_config_train_steps, "EVAL": save_config_eval_steps}
    check_trials(out_dir, save_steps, saved_scalars)
    check_metrics_file(save_steps, saved_scalars)
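

# Hedged usage sketch for verify_files (not from the original tests): builds a
# per-mode SaveConfig and points verify_files at a hook output directory. The
# import paths refer to smdebug's core modules; the out_dir and scalar name are
# made-up placeholders.
def _example_verify_files_usage(out_dir="/tmp/hook_out"):
    from smdebug.core.modes import ModeKeys
    from smdebug.core.save_config import SaveConfig, SaveConfigMode

    save_config = SaveConfig(
        mode_save_configs={
            ModeKeys.TRAIN: SaveConfigMode(save_interval=2),
            ModeKeys.EVAL: SaveConfigMode(save_interval=3),
        }
    )
    verify_files(out_dir, save_config, saved_scalars=["loss"])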
hook = SessionHook.create_from_json_file()
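# create_from_json_file() builds the hook from a JSON config file (on SageMaker this
# file is provided by the platform). A rough, hedged sketch of the per-mode
# save_interval settings such a file might express; the exact key names
# ("train.save_interval" etc.) are assumptions, not confirmed from this source:
#   "HookParameters": {"train.save_interval": 2, "eval.save_interval": 3,
#                      "predict.save_interval": 1, "save_interval": 1}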
# Check save_intervals for each mode
assert hook.save_config.get_save_config(ModeKeys.TRAIN).save_interval == 2
assert hook.save_config.get_save_config(ModeKeys.EVAL).save_interval == 3
assert hook.save_config.get_save_config(ModeKeys.PREDICT).save_interval == 1
assert hook.save_config.get_save_config(ModeKeys.GLOBAL).save_interval == 1
# Check include_collections
assert "weights" in hook.include_collections and "losses" in hook.include_collections
assert len(hook.include_collections) == 4
# Check collection configurations for losses
assert (
    hook.collection_manager.collections["losses"]
    .save_config.get_save_config(ModeKeys.TRAIN)
    .save_interval
    == 2
)
assert (
    hook.collection_manager.collections["losses"]
    .save_config.get_save_config(ModeKeys.EVAL)
    .save_interval
    == 4
)
assert (
    hook.collection_manager.collections["losses"]
    .save_config.get_save_config(ModeKeys.PREDICT)
    .save_interval
    == 1
)
        CollectionKeys.GRADIENTS,
        CollectionKeys.LOSSES,
    ],
    eval_distributed=False,
    zcc=zcc,
)
if skip_trial_check():
    return

tr = create_trial_fast_refresh(out_dir)
# wts, grads, losses
assert (
    len(tr.tensor_names()) == 8 + 8 + (1 * strategy.num_replicas_in_sync) + 1
)  # 1 main loss, and 1 from each worker
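# e.g. with 2 replicas in sync: 8 weights + 8 gradients + 2 per-replica losses
# + 1 main loss = 19 tensor names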
assert len(tr.steps()) == 7
assert len(tr.steps(ModeKeys.TRAIN)) == 3
assert len(tr.steps(ModeKeys.EVAL)) == 2
assert len(tr.steps(ModeKeys.PREDICT)) == 2

assert "dense_1/kernel:0" in tr.tensor_names(collection="weights")
for tname in tr.tensor_names(collection="weights"):
    for s in tr.tensor(tname).steps(ModeKeys.TRAIN):
        assert len(tr.tensor(tname).workers(s, ModeKeys.TRAIN)) == strategy.num_replicas_in_sync
        for worker in tr.tensor(tname).workers(s, ModeKeys.TRAIN):
            assert tr.tensor(tname).value(s, worker=worker, mode=ModeKeys.TRAIN) is not None
    for s in tr.tensor(tname).steps(ModeKeys.EVAL):
        assert len(tr.tensor(tname).workers(s, ModeKeys.EVAL)) == 1  # since eval_distributed=False
        assert tr.tensor(tname).value(s, mode=ModeKeys.EVAL) is not None

tensornames = tr.tensor_names(regex=r"Identity_\d+:0")
for s in tr.tensor(tensornames[0]).steps(ModeKeys.TRAIN):
    for w in tr.tensor(tensornames[0]).workers(s, ModeKeys.TRAIN):
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)


model = Net().to(torch.device("cpu"))
criterion = nn.NLLLoss()
hook.register_module(model)
if register_loss:
    hook.register_loss(criterion)

hook.save_scalar("pt_num_steps", steps, sm_metric=True)
model.train()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
hook.save_scalar("pt_before_train", 1, sm_metric=False)
hook.set_mode(ModeKeys.TRAIN)
for i in range(steps):
    # Dummy data and target tensors for one training step
    batch_size = 32
    data, target = torch.rand(batch_size, 1, 28, 28), torch.rand(batch_size).long()
    data, target = data.to(torch.device("cpu")), target.to(torch.device("cpu"))
    optimizer.zero_grad()
    output = model(Variable(data, requires_grad=True))
    if register_loss:
        loss = criterion(output, target)
    else:
        loss = F.nll_loss(output, target)
    loss.backward()
    optimizer.step()
hook.save_scalar("pt_after_train", 1, sm_metric=False)

model.eval()
hook.set_mode(ModeKeys.EVAL)
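# A minimal eval pass to mirror the training loop above (sketch only; the batch
# shape and the scalar name are assumptions, not taken from the original test):
with torch.no_grad():
    for i in range(steps):
        data = torch.rand(32, 1, 28, 28).to(torch.device("cpu"))
        output = model(data)
hook.save_scalar("pt_after_eval", 1, sm_metric=False)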
# Standard Library
from enum import Enum
# Note that Keras has a similar concept of ModeKeys
class ModeKeys(Enum):
    TRAIN = 1  # training/fitting mode
    EVAL = 2  # testing/evaluation mode
    PREDICT = 3  # prediction/inference mode
    GLOBAL = 4  # default/global mode used when no specific mode is set
ALLOWED_MODES = [ModeKeys.TRAIN, ModeKeys.EVAL, ModeKeys.PREDICT, ModeKeys.GLOBAL]
ALLOWED_MODE_NAMES = [x.name for x in ALLOWED_MODES]
MODE_STEP_PLUGIN_NAME = "mode_step"
MODE_PLUGIN_NAME = "mode"
"""self.device_map is a mapping between a tf device string to a serialized (filename-friendly) device string
Example -> /job:worker/replica:0/task:1/device:GPU:0 : _job-worker_replica-0_task-1_device-GPU-0"""
self.device_map = {}
self.writer_map = {}
# This will be None if the var wasn't set, i.e. not param server
self.tf_config_json = load_tf_config_json(os.getenv("TF_CONFIG"))
self._hook_supported = None
self._exported_collections = False
self._distribution_strategy = {
    ModeKeys.TRAIN: None,
    ModeKeys.EVAL: None,
    ModeKeys.PREDICT: None,
    ModeKeys.GLOBAL: None,
}
self._prepared_tensors = {
    ModeKeys.TRAIN: False,
    ModeKeys.EVAL: False,
    ModeKeys.PREDICT: False,
    ModeKeys.GLOBAL: False,
}
self._exported_model = {
    ModeKeys.TRAIN: False,
    ModeKeys.EVAL: False,
    ModeKeys.PREDICT: False,
    ModeKeys.GLOBAL: False,
}
set_hook(self)
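

# Hedged sketch (not from the original source) of the device-string serialization
# described in the device_map docstring above; the helper name is illustrative.
def serialize_tf_device(device: str) -> str:
    """Turn '/job:worker/replica:0/task:1/device:GPU:0' into
    '_job-worker_replica-0_task-1_device-GPU-0' so it is safe to use in file names."""
    return device.replace("/", "_").replace(":", "-")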