Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_fetch_tensor_with_present_event_files():
    """Read a tensor value at a step whose event file exists.

    Fixture layout under the S3 prefix used below:
        event files present: [0, 18, 27, 36, ...., 190]
        index files present: [0, 9, 18, 27, 36, ...., 190, 199]
        end_of_job file:     present

    The test makes no explicit assertion; it passes as long as the lookup
    does not raise.
    """
    trial = create_trial("s3://smdebug-testing/resources/event-files-missing")
    # Step 0 is listed among the present event files.
    gradient = trial.tensor("gradients/pow_grad/sub:0")
    gradient.value(0)
# NOTE(review): fragment -- the enclosing function header and the start of the
# training loop are not visible here, and the original indentation is lost.
# `script_mode`, `loss`, `optimizer`, `i`, `smd` and `sim` come from code
# outside this excerpt.
# The loss is passed to hook.record_tensor_value under the name "loss"
# (presumably only when script_mode is truthy -- indentation lost, TODO confirm).
if script_mode:
hook.record_tensor_value(tensor_name="loss", tensor_value=loss)
loss.backward()
optimizer.step()
# Breaks out of the loop when i == 499. The trailing comment mentions
# "2000 mini-batches", which does not match this condition -- presumably
# stale, TODO confirm.
if i == 499: # print every 2000 mini-batches
break
print("Finished Training")
# Fetch the hook via smd.get_hook() and load the trial from sim.out_dir,
# printing the steps, tensor names and collection manager for debugging.
hook = smd.get_hook()
print(f"hook = {hook}")
from smdebug.trials import create_trial
trial = create_trial(path=sim.out_dir)
print(f"trial.steps() = {trial.steps()}")
print(f"trial.tensor_names() = {trial.tensor_names()}")
print(f"collection_manager = {hook.collection_manager}")
losses_tensors = hook.collection_manager.get("losses").tensor_names
print(f"'losses' collection tensor_names = {losses_tensors}")
# The "losses" collection must be non-empty, and every tensor name in it must
# also appear among the trial's tensor names.
# NOTE(review): the closing parenthesis of the `assert all(` call below is
# cut off in this excerpt.
assert len(losses_tensors) > 0
assert all(
[
name in trial.tensor_names()
for name in hook.collection_manager.get("losses").tensor_names
]
def test_save_all(hook=None, out_dir=None):
    """Check that a save-all hook records every tensor for the saved steps.

    Args:
        hook: Hook to run the model with. When ``None`` a hook is built here
            with ``save_all=True`` and ``save_steps=[0, 1, 2, 3]``.
        out_dir: Directory the trial is read from. Overwritten with a fresh
            timestamped ``/tmp/trial_*`` path when the hook is built here.

    The output directory is removed at the end only if this function created
    the hook itself.
    """
    hook_created = hook is None
    if hook_created:
        steps_config = SaveConfig(save_steps=[0, 1, 2, 3])
        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S%f")
        out_dir = "/tmp/" + ("trial_" + timestamp)
        print("Registering the hook with out_dir {}".format(out_dir))
        hook = t_hook(out_dir=out_dir, save_config=steps_config, save_all=True)

    run_mnist_gluon_model(hook=hook, num_steps_train=7, num_steps_eval=5)

    # Verify the number of saved steps and the number of saved tensors.
    print("Created the trial with out_dir {}".format(out_dir))
    trial = create_trial(out_dir)
    saved_names = trial.tensor_names()
    assert trial
    assert len(trial.steps()) == 4
    # Some tensor names (e.g. input and output) cannot be retrieved from the
    # training session, so only the tensor count is checked. The expected
    # count of 46 was taken from the index file; matching it means the
    # script saved all tensors.
    assert len(saved_names) == 46

    if hook_created:
        shutil.rmtree(out_dir)
# Writes data for a randomly named trial under `path`, loads it with
# create_trial and checks the visible tensor names and step counts, then
# starts writing a second batch of steps.
# NOTE(review): this excerpt is truncated -- the second generate_data call is
# cut off before its closing parenthesis and the rest of the function is not
# visible. The original indentation is also lost.
def help_test_no_refresh(path):
trial_name = str(uuid.uuid4())
num_steps = 8
num_tensors = 10
# First batch: one generate_data call per step 0..num_steps-1.
for i in range(num_steps):
generate_data(
path=path,
trial=trial_name,
num_tensors=num_tensors,
step=i,
tname_prefix="foo",
worker="algo-1",
shape=(3, 3, 3),
)
# The trial location is built by plain string concatenation, so `path` is
# presumably expected to end with a separator -- TODO confirm.
tr = create_trial(path + trial_name)
# "foo_11" must be absent, "foo_1" must be present, and both the trial and
# the tensor "foo_1" must report exactly num_steps steps.
assert "foo_" + str(num_tensors + 1) not in tr.tensor_names()
assert "foo_1" in tr.tensor_names()
assert len(tr.steps()) == num_steps
assert len(tr.tensor("foo_1").steps()) == num_steps
# Second batch: steps num_steps..2*num_steps-1, this time with
# export_colls=False.
for i in range(num_steps, num_steps * 2):
generate_data(
path=path,
trial=trial_name,
num_tensors=num_tensors,
step=i,
tname_prefix="foo",
worker="algo-1",
shape=(3, 3, 3),
export_colls=False,
# NOTE(review): fragment -- the enclosing `def` line is not visible and the
# original indentation is lost. `hook` and `out_dir` are presumably
# parameters of the missing function -- TODO confirm.
hook_created = False
# When no hook is supplied, build one that writes to a timestamped
# /tmp/trial_* directory, saves steps 0-3 and includes only the
# "relu_activations" collection.
if hook is None:
run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
out_dir = "/tmp/" + run_id
hook = t_hook(
out_dir=out_dir,
save_config=SaveConfig(save_steps=[0, 1, 2, 3]),
include_collections=["relu_activations"],
)
hook_created = True
# Build the model on CPU, register it with the hook, and call train() with
# num_steps=10 using SGD (lr=0.001, momentum=0.9).
model = Net().to(torch.device("cpu"))
hook.register_module(model)
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
train(model, hook, torch.device("cpu"), optimizer, num_steps=10)
# The trial must expose at least one tensor in "relu_activations", and the
# first such tensor must have a value at step 0.
tr = create_trial(out_dir)
assert tr
assert len(tr.tensor_names(collection="relu_activations")) > 0
assert tr.tensor(tr.tensor_names(collection="relu_activations")[0]).value(0) is not None
# Remove the output directory only when the hook was created here.
if hook_created:
shutil.rmtree(out_dir)
def test_invoker_exception():
    """Check that invoke_rule raises RuleEvaluationConditionMet per start step.

    An ExplodingTensor rule over the "gradients,weights" collections is
    invoked with ``raise_eval_cond=True`` for start steps 0 and 1 (end step
    3); both invocations must raise. The dumped data is removed afterwards.
    """
    data_path = dump_data()
    trial = create_trial(data_path)
    rule = ExplodingTensor(trial, collection_names="gradients,weights")

    raised = 0
    for first_step in (0, 1):
        try:
            invoke_rule(rule, start_step=first_step, end_step=3, raise_eval_cond=True)
        except RuleEvaluationConditionMet:
            raised += 1

    assert raised == 2
    shutil.rmtree(data_path)
def helper_test_reductions(trial_dir, hook, save_raw_tensor):
    """Run the simple model and verify which tensors carry raw values/reductions.

    Args:
        trial_dir: Directory the hook wrote to; loaded with ``create_trial``.
        hook: Hook passed to ``simple_model``.
        save_raw_tensor: When exactly ``True``, non-loss tensors must also
            have a raw value at step 0; otherwise fetching the raw value must
            raise ``TensorUnavailableForStep``.

    Raises:
        AssertionError: If any expectation on tensor count, raw values or
            reduction count is not met.
    """
    simple_model(hook)
    # The listing result is not used below; the call is retained as-is.
    _, files = get_dirs_files(trial_dir)
    from smdebug.trials import create_trial

    tr = create_trial(trial_dir)
    assert len(tr.tensors()) == 3, tr.tensors()
    for tname in tr.tensors():
        t = tr.tensor(tname)
        if tname in tr.tensors(collection="losses"):
            # no reductions
            assert t.value(0) is not None
        else:
            if save_raw_tensor is True:
                assert t.value(0) is not None
            else:
                # The raw value must be unavailable. The failure assertion
                # lives in the `else` clause so it never references the
                # exception variable before it is bound (the previous form
                # raised UnboundLocalError instead of a clear failure).
                try:
                    print(t.value(0))
                except TensorUnavailableForStep:
                    pass
                else:
                    assert False, tname
            assert len(t.reduction_values(0)) == 18
def helper_test_mnist_trial(trial_dir):
    """Validate the step and tensor counts of an MNIST trial, then delete it.

    Expects 3 steps overall (2 TRAIN, 1 EVAL) and 13 tensor names. Afterwards
    the trial data is removed: via ``delete_s3_prefix`` when ``is_s3`` reports
    an S3 location, otherwise via ``shutil.rmtree`` with errors ignored.
    """
    trial = create_trial(trial_dir)

    total_steps = trial.steps()
    train_steps = trial.steps(mode=smd.modes.TRAIN)
    eval_steps = trial.steps(mode=smd.modes.EVAL)
    assert len(total_steps) == 3
    assert len(train_steps) == 2
    assert len(eval_steps) == 1
    assert len(trial.tensor_names()) == 13

    # Clean up wherever the trial lives.
    on_s3, bucket, prefix = is_s3(trial_dir)
    if on_s3:
        delete_s3_prefix(bucket, prefix)
    else:
        shutil.rmtree(trial_dir, ignore_errors=True)
# NOTE(review): fragment -- the enclosing function header is not visible and
# the original indentation is lost. `x`, `w`, `hook` and `out_dir` are defined
# outside this excerpt.
# Target y is x @ w0; the loss is the mean squared difference between x @ w
# and y, and is added to the hook's "losses" collection.
w0 = [[1], [1.0]]
y = tf.matmul(x, w0)
loss = tf.reduce_mean((tf.matmul(x, w) - y) ** 2, name="loss")
hook.get_collection("losses").add(loss)
# The global step variable starts at 17 and has its own increment op.
global_step = tf.Variable(17, name="global_step", trainable=False)
increment_global_step_op = tf.assign(global_step, global_step + 1)
optimizer = tf.train.AdamOptimizer(0.1)
optimizer = hook.wrap_optimizer(optimizer)
# NOTE(review): the assign op, not the variable, is passed as `global_step`
# here -- looks deliberate for this test, TODO confirm.
optimizer_op = optimizer.minimize(loss, global_step=increment_global_step_op)
sess = tf.train.MonitoredSession(hooks=[hook])
# Five session runs, each feeding a fresh random (10, 2) batch scaled by 0.1.
for i in range(5):
x_ = np.random.random((10, 2)) * 0.1
sess.run([loss, optimizer_op, increment_global_step_op], {x: x_})
sess.close()
# Passes when the trial reports at least one tensor name (non-zero length).
tr = create_trial(out_dir)
assert len(tr.tensor_names())
def test_smdebug_script_mode_single_machine(docker_image, opt_ml):
    """Train the abalone hook demo in local mode and inspect the saved tensors.

    Args:
        docker_image: Image passed through to ``local_mode.train``.
        opt_ml: Root directory passed to ``local_mode.train``; the failure
            marker and the tensors directory are looked up beneath it.

    The run must not leave an ``output/failure`` file, and the trial under
    ``algo-1/output/tensors`` must contain exactly the train/validation RMSE
    tensors for steps 0 through 19.
    """
    script_name = "xgboost_abalone_basic_hook_demo.py"
    params = get_abalone_hyperparameters()

    local_mode.train(
        script_name,
        data_dir,
        docker_image,
        opt_ml,
        hyperparameters=params,
        source_dir=source_dir,
    )

    assert not local_mode.file_exists(opt_ml, 'output/failure'), 'Failure happened'

    saved = create_trial(os.path.join(opt_ml, 'algo-1', 'output', 'tensors'))
    assert saved.tensor_names() == ["train-rmse", "validation-rmse"]
    assert saved.steps() == list(range(20))