# As of TensorFlow v1.9, gradients are not supported on
# integer tensors
dtypes = [tf.float32, tf.float64]
dims = [1, 2, 3]
for dtype, dim in itertools.product(dtypes, dims):
    with tf.device("/cpu:0"):
        if _executing_eagerly():
            tensor = self.tfe.Variable(self.random_uniform(
                [5] * dim, -100, 100, dtype=dtype))
            with tf.GradientTape() as tape:
                summed = hvd.allreduce(tensor, average=False)
        else:
            tensor = self.random_uniform(
                [5] * dim, -100, 100, dtype=dtype)
            summed = hvd.allreduce(tensor, average=False)

        grad_ys = tf.ones([5] * dim)
        if _executing_eagerly():
            grad_out = tape.gradient(summed, tensor, grad_ys)
        else:
            grad = tf.gradients(summed, tensor, grad_ys)[0]
            grad_out = self.evaluate(grad)

    # Horovod registers a gradient for allreduce, so a summed allreduce
    # back-propagates hvd.size() (bound to `size` here) to each worker.
    expected = np.ones([5] * dim) * size
    err = np.linalg.norm(expected - grad_out)
    self.assertLess(err, 0.00000001,
                    "gradient %s differs from expected %s, "
                    "error: %s" % (grad_out, expected, str(err)))
        batch_mean_square = gen_nccl_ops.nccl_all_reduce(
            input=batch_mean_square,
            reduction='sum',
            num_devices=num_dev,
            shared_name=shared_name + '_NCCL_mean_square') * (1.0 / num_dev)
elif sync_statistics == 'horovod':
    # Require https://github.com/uber/horovod/pull/331
    import horovod.tensorflow as hvd
    if hvd.size() == 1:
        logger.warn("BatchNorm(sync_statistics='horovod') is used with only one process!")
    else:
        import horovod
        hvd_version = tuple(map(int, horovod.__version__.split('.')[:3]))
        assert hvd_version >= (0, 13, 6), "sync_statistics=horovod needs horovod>=0.13.6 !"

        batch_mean = hvd.allreduce(batch_mean, average=True)
        batch_mean_square = hvd.allreduce(batch_mean_square, average=True)

batch_var = batch_mean_square - tf.square(batch_mean)
batch_mean_vec = batch_mean
batch_var_vec = batch_var

beta, gamma, moving_mean, moving_var = get_bn_variables(
    num_chan, scale, center, beta_initializer, gamma_initializer)

if new_shape is not None:
    batch_mean = tf.reshape(batch_mean, new_shape)
    batch_var = tf.reshape(batch_var, new_shape)
    # Using fused_batch_norm(is_training=False) is actually slightly faster,
    # but hopefully this call will be JITed in the future.
    xn = tf.nn.batch_normalization(
        inputs, batch_mean, batch_var,
        tf.reshape(beta, new_shape),
        tf.reshape(gamma, new_shape), epsilon)
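# A minimal sketch of the same idea outside tensorpack: each worker computes its
# local first and second moments, and hvd.allreduce(average=True) turns them into
# global-batch statistics before normalization. The function and variable names
# below are illustrative, not tensorpack's.
import horovod.tensorflow as hvd
import tensorflow as tf

def cross_replica_moments(x, axes=(0, 1, 2)):
    # Per-channel moments of the local NHWC batch.
    local_mean = tf.reduce_mean(x, axis=axes)
    local_mean_sq = tf.reduce_mean(tf.square(x), axis=axes)
    # Average the moments across all Horovod workers.
    mean = hvd.allreduce(local_mean, average=True)
    mean_sq = hvd.allreduce(local_mean_sq, average=True)
    var = mean_sq - tf.square(mean)
    return mean, var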
    # ... tail of a list comprehension over `grads` (its head is not part of this excerpt)
    else grad
    for grad in grads
]

# TODO: Does placing this clip before or after allreduce affect accuracy?
# Placing before has a regularization effect, no single example can contribute as much.
# Placing before also gives a 20% speedup when training BERT-large, probably because the
# gradient operations can be fused by XLA.
(grads, grad_norm) = tf.clip_by_global_norm(grads, clip_norm=max_grad_norm)
weight_norm = tf.math.sqrt(
    tf.math.reduce_sum([tf.norm(var, ord=2) ** 2 for var in model.trainable_variables])
)

grads = [
    hvd.allreduce(grad, compression=hvd.Compression.fp16) if grad is not None else None
    for grad in grads
]
optimizer.apply_gradients(
    [
        (tf.cast(grad, var.dtype), var)
        for (grad, var) in zip(grads, model.trainable_variables)
        if grad is not None
    ]
)

# Clear the gradient accumulator
gradient_accumulator.reset()

# Average the losses/metrics across workers for logging
loss = hvd.allreduce(loss)
mlm_loss = hvd.allreduce(mlm_loss)
mlm_acc = hvd.allreduce(mlm_acc)
sop_loss = hvd.allreduce(sop_loss)
sop_acc = hvd.allreduce(sop_acc)

return loss, mlm_loss, mlm_acc, sop_loss, sop_acc, grad_norm, weight_norm
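# A condensed, self-contained sketch of the training-step pattern above, assuming a
# generic Keras `model`, `optimizer`, and `loss_fn` (all illustrative). The fp16
# compression, global-norm clipping, and the `grad is not None` guard mirror the
# snippet; gradient accumulation and the MLM/SOP metrics are omitted.
import horovod.tensorflow as hvd
import tensorflow as tf

hvd.init()

@tf.function
def train_step(model, optimizer, loss_fn, x, y, max_grad_norm=1.0):
    with tf.GradientTape() as tape:
        loss = loss_fn(y, model(x, training=True))
    grads = tape.gradient(loss, model.trainable_variables)
    grads, grad_norm = tf.clip_by_global_norm(grads, clip_norm=max_grad_norm)
    grads = [
        hvd.allreduce(g, compression=hvd.Compression.fp16) if g is not None else None
        for g in grads
    ]
    optimizer.apply_gradients(
        [(tf.cast(g, v.dtype), v)
         for g, v in zip(grads, model.trainable_variables)
         if g is not None]
    )
    # Average the loss across workers for logging.
    return hvd.allreduce(loss), grad_norm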
for user_batch, item_batch, dup_batch \
        in zip(data_generator.eval_users, data_generator.eval_items, data_generator.dup_mask):
    sess.run(
        eval_op,
        feed_dict={
            users: user_batch,
            items: item_batch,
            is_dup: dup_batch,
            dropout: 0.0
        }
    )
eval_duration = time.time() - eval_start

# Report results
hit_rate_sum = sess.run(hvd.allreduce(hr_sum, average=False))
hit_rate_cnt = sess.run(hvd.allreduce(hr_cnt, average=False))
ndcg_sum = sess.run(hvd.allreduce(ndcg_sum, average=False))
ndcg_cnt = sess.run(hvd.allreduce(ndcg_cnt, average=False))

hit_rate = hit_rate_sum / hit_rate_cnt
ndcg = ndcg_sum / ndcg_cnt

if hvd.rank() == 0:
    LOGGER.log("Eval Time: {:.4f}, HR: {:.4f}, NDCG: {:.4f}"
               .format(eval_duration, hit_rate, ndcg))
    eval_throughput = pos_test_users.shape[0] * (args.valid_negative + 1) / eval_duration
    LOGGER.log('Average Eval Throughput: {:.4f}'.format(eval_throughput))
return

# Performance Metrics
train_times = list()
eval_times = list()
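# Why the evaluation above reduces sums and counts separately instead of averaging
# per-worker hit rates: workers may evaluate different numbers of examples, and
# summing numerators and denominators before dividing gives the exact global metric.
# A TF2-style sketch with illustrative values:
local_hits = tf.constant(412.0)     # hits counted on this worker
local_count = tf.constant(1000.0)   # examples evaluated on this worker
global_hit_rate = (hvd.allreduce(local_hits, average=False) /
                   hvd.allreduce(local_count, average=False))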
def _make_variable(self, metric, value):
    with tf.name_scope('MetricAverageCallback'):
        var = tf.Variable(value, name=metric)
        self.backend.get_session().run(var.initializer)
        allreduce_op = hvd.allreduce(var, device_dense=self.device)
        return var, allreduce_op
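# For context, this helper belongs to Horovod's MetricAverageCallback; from the
# Keras frontend it is typically used so that metrics logged at epoch end are
# averaged across workers. A minimal, illustrative setup (assuming the
# horovod.tensorflow.keras frontend):
import horovod.tensorflow.keras as hvd_keras

callbacks = [
    hvd_keras.callbacks.BroadcastGlobalVariablesCallback(0),
    hvd_keras.callbacks.MetricAverageCallback(),
]
# model.fit(train_data, callbacks=callbacks, verbose=1 if hvd_keras.rank() == 0 else 0)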
# Rewire the graph so consumers of the original gradient use the allreduced
# gradient instead, including any control-dependency edges.
if isinstance(grad_tensor, tf.Tensor):
    # Dense gradient: allreduce the tensor directly.
    grad = grad_tensor
    grad_consumers = [c for c in grad.consumers()]
    agg_grad = hvd.allreduce(grad, average=True)
    update_consumers(grad_consumers, grad, agg_grad)
    update_control_consumers(op_to_control_consumer_ops[grad.op],
                             grad.op, agg_grad.op)
else:
    # Sparse gradient (tf.IndexedSlices): allreduce values and indices together.
    grad = grad_tensor.values
    indices = grad_tensor.indices
    dense_shape = grad_tensor.dense_shape
    grad_consumers = [c for c in grad.consumers()]
    indices_consumers = [c for c in indices.consumers()]
    agg_grad = \
        hvd.allreduce(tf.IndexedSlices(grad, indices, dense_shape),
                      average=config.average_sparse)
    update_consumers(grad_consumers, grad, agg_grad.values)
    update_consumers(indices_consumers, indices, agg_grad.indices)
    update_control_consumers(op_to_control_consumer_ops[grad.op],
                             grad.op, agg_grad.values.op)
    update_control_consumers(
        op_to_control_consumer_ops[indices.op], indices.op,
        agg_grad.indices.op)
gradients_info._grad = agg_grad
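# As the else-branch above relies on, hvd.allreduce also accepts tf.IndexedSlices
# (sparse gradients, e.g. from embedding lookups); Horovod handles these by
# gathering values and indices across workers rather than summing dense tensors.
# A standalone, illustrative sketch:
sparse_grad = tf.IndexedSlices(values=tf.ones([2, 4]),
                               indices=tf.constant([0, 3], dtype=tf.int64),
                               dense_shape=tf.constant([10, 4], dtype=tf.int64))
agg = hvd.allreduce(sparse_grad, average=True)  # result is also an IndexedSlices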
def all_reduce_fn(tensor):
    return hvd.allreduce(tensor, compression=hvd.Compression.fp16)
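# One common way such a helper is applied (illustrative; assumes `grads` is a flat
# or nested structure of tensors with no None entries): map it over the gradients
# before the optimizer step so every tensor is averaged with fp16 on-the-wire
# compression.
reduced_grads = tf.nest.map_structure(all_reduce_fn, grads)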