How to use the horovod.tensorflow.allreduce function in horovod

To help you get started, we've selected a few horovod.tensorflow.allreduce examples, drawn from popular ways the function is used in public projects.
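
Before the project examples, here is a minimal sketch of the basic pattern. This is our own illustration, not taken from the projects below; it assumes TensorFlow 2 eager execution and a launch such as "horovodrun -np 4 python demo.py". Note that the examples on this page use the older average= keyword, which newer Horovod releases deprecate in favor of op=hvd.Average.

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()  # must run before any other Horovod call

# Each worker contributes its own rank; after the allreduce every worker
# holds the mean of 0..hvd.size()-1.
local_value = tf.constant(float(hvd.rank()))
averaged = hvd.allreduce(local_value, average=True)

print("rank %d: averaged = %s" % (hvd.rank(), averaged.numpy()))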

github horovod / horovod / test / test_tensorflow.py
        # As of TensorFlow v1.9, gradients are not supported on
        # integer tensors
        dtypes = [tf.float32, tf.float64]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            with tf.device("/cpu:0"):
                if _executing_eagerly():
                    tensor = self.tfe.Variable(self.random_uniform(
                        [5] * dim, -100, 100, dtype=dtype))
                    with tf.GradientTape() as tape:
                        summed = hvd.allreduce(tensor, average=False)
                else:
                    tensor = self.random_uniform(
                        [5] * dim, -100, 100, dtype=dtype)
                    summed = hvd.allreduce(tensor, average=False)

                grad_ys = tf.ones([5] * dim)
                if _executing_eagerly():
                    grad_out = tape.gradient(summed, tensor, grad_ys)
                else:
                    grad = tf.gradients(summed, tensor, grad_ys)[0]
                    grad_out = self.evaluate(grad)

            expected = np.ones([5] * dim) * size  # size == hvd.size(), bound earlier in the test
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(err, 0.00000001,
                            "gradient %s differs from expected %s, "
                            "error: %s" % (grad_out, expected, str(err)))
github tensorpack / tensorpack / tensorpack / models / batch_norm.py
                batch_mean_square = gen_nccl_ops.nccl_all_reduce(
                    input=batch_mean_square,
                    reduction='sum',
                    num_devices=num_dev,
                    shared_name=shared_name + '_NCCL_mean_square') * (1.0 / num_dev)
        elif sync_statistics == 'horovod':
            # Require https://github.com/uber/horovod/pull/331
            import horovod.tensorflow as hvd
            if hvd.size() == 1:
                logger.warn("BatchNorm(sync_statistics='horovod') is used with only one process!")
            else:
                import horovod
                hvd_version = tuple(map(int, horovod.__version__.split('.')[:3]))
                assert hvd_version >= (0, 13, 6), "sync_statistics=horovod needs horovod>=0.13.6 !"

                batch_mean = hvd.allreduce(batch_mean, average=True)
                batch_mean_square = hvd.allreduce(batch_mean_square, average=True)
        batch_var = batch_mean_square - tf.square(batch_mean)
        batch_mean_vec = batch_mean
        batch_var_vec = batch_var

        beta, gamma, moving_mean, moving_var = get_bn_variables(
            num_chan, scale, center, beta_initializer, gamma_initializer)
        if new_shape is not None:
            batch_mean = tf.reshape(batch_mean, new_shape)
            batch_var = tf.reshape(batch_var, new_shape)
            # Using fused_batch_norm(is_training=False) is actually slightly faster,
            # but hopefully this call will be JITed in the future.
            xn = tf.nn.batch_normalization(
                inputs, batch_mean, batch_var,
                tf.reshape(beta, new_shape),
                tf.reshape(gamma, new_shape), epsilon)
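
This is the core of cross-replica batch normalization: each worker computes local first and second moments, averages them across all workers with allreduce, and then derives the variance as E[x^2] - E[x]^2. A stripped-down sketch of just that step (a hypothetical helper of our own, assuming NHWC input and a completed hvd.init()):

import tensorflow as tf
import horovod.tensorflow as hvd

def sync_batch_moments(x):
    """Batch mean/variance synchronized across all Horovod workers."""
    # Local moments over the batch and spatial axes (NHWC layout assumed).
    batch_mean = tf.reduce_mean(x, axis=[0, 1, 2])
    batch_mean_square = tf.reduce_mean(tf.square(x), axis=[0, 1, 2])

    # Average both moments across workers, then recover the variance,
    # exactly as the tensorpack code above does.
    batch_mean = hvd.allreduce(batch_mean, average=True)
    batch_mean_square = hvd.allreduce(batch_mean_square, average=True)
    batch_var = batch_mean_square - tf.square(batch_mean)
    return batch_mean, batch_var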
github aws-samples / deep-learning-models / models / nlp / albert / run_pretraining.py
        else grad
        for grad in grads
    ]

    # TODO: Does placing this clip before or after allreduce affect accuracy?
    # Placing before has a regularization effect, no single example can contribute as much.
    # Placing before also gives a 20% speedup when training BERT-large, probably because the
    # gradient operations can be fused by XLA.
    (grads, grad_norm) = tf.clip_by_global_norm(grads, clip_norm=max_grad_norm)

    weight_norm = tf.math.sqrt(
        tf.math.reduce_sum([tf.norm(var, ord=2) ** 2 for var in model.trainable_variables])
    )

    grads = [
        hvd.allreduce(grad, compression=hvd.Compression.fp16) if grad is not None else None
        for grad in grads
    ]

    optimizer.apply_gradients(
        [
            (tf.cast(grad, var.dtype), var)
            for (grad, var) in zip(grads, model.trainable_variables)
            if grad is not None
        ]
    )

    # Clear the gradient accumulator
    gradient_accumulator.reset()

    loss = hvd.allreduce(loss)
    mlm_loss = hvd.allreduce(mlm_loss)
    mlm_acc = hvd.allreduce(mlm_acc)
    sop_loss = hvd.allreduce(sop_loss)
    sop_acc = hvd.allreduce(sop_acc)

    return loss, mlm_loss, mlm_acc, sop_loss, sop_acc, grad_norm, weight_norm
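
Boiled down, the distributed part of this training step is: allreduce each gradient with fp16 compression to cut network traffic roughly in half, apply the averaged gradients, then allreduce the scalar metrics for logging. A minimal sketch of the gradient part (our own hypothetical helper; assumes a TF2 training loop where grads came out of a tf.GradientTape and hvd.init() has run):

import tensorflow as tf
import horovod.tensorflow as hvd

def allreduce_and_apply(grads, variables, optimizer):
    # Average each gradient across workers, compressing to fp16 on the wire;
    # None entries (variables unused in this step) are passed through.
    grads = [
        hvd.allreduce(g, compression=hvd.Compression.fp16) if g is not None else None
        for g in grads
    ]
    optimizer.apply_gradients(
        [(g, v) for g, v in zip(grads, variables) if g is not None]
    )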
github NVIDIA / DeepLearningExamples / TensorFlow / Recommendation / NCF / ncf.py
        for user_batch, item_batch, dup_batch \
            in zip(data_generator.eval_users, data_generator.eval_items, data_generator.dup_mask):
            sess.run(
                eval_op,
                feed_dict={
                    users: user_batch,
                    items: item_batch,
                    is_dup: dup_batch,
                    dropout: 0.0
                }
            )
        eval_duration = time.time() - eval_start

        # Report results
        hit_rate_sum = sess.run(hvd.allreduce(hr_sum, average=False))
        hit_rate_cnt = sess.run(hvd.allreduce(hr_cnt, average=False))
        ndcg_sum = sess.run(hvd.allreduce(ndcg_sum, average=False))
        ndcg_cnt = sess.run(hvd.allreduce(ndcg_cnt, average=False))

        hit_rate = hit_rate_sum / hit_rate_cnt
        ndcg = ndcg_sum / ndcg_cnt

        if hvd.rank() == 0:
            LOGGER.log("Eval Time: {:.4f}, HR: {:.4f}, NDCG: {:.4f}"
                       .format(eval_duration, hit_rate, ndcg))

            eval_throughput = pos_test_users.shape[0] * (args.valid_negative + 1) / eval_duration
            LOGGER.log('Average Eval Throughput: {:.4f}'.format(eval_throughput))
        return

    # Performance Metrics
    train_times = list()
    eval_times = list()
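
The detail worth copying here is that the metrics are aggregated as separate sums and counts with average=False, and divided only afterwards; averaging per-worker ratios would be wrong whenever workers evaluate different numbers of examples. The same pattern as a small helper (our own sketch, TF1-style session as in the example above):

import horovod.tensorflow as hvd

def global_ratio(sess, metric_sum, metric_cnt):
    """Turn per-worker sum/count tensors into one exact global ratio."""
    # Sum both pieces across all workers, then divide once at the end.
    total_sum = sess.run(hvd.allreduce(metric_sum, average=False))
    total_cnt = sess.run(hvd.allreduce(metric_cnt, average=False))
    return total_sum / total_cnt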
github horovod / horovod / horovod / _keras / callbacks.py
    def _make_variable(self, metric, value):
        with tf.name_scope('MetricAverageCallback'):
            var = tf.Variable(value, name=metric)
            self.backend.get_session().run(var.initializer)
            allreduce_op = hvd.allreduce(var, device_dense=self.device)
            return var, allreduce_op
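
Note that the callback builds the allreduce op once and hands it back for repeated session runs; calling hvd.allreduce inside a loop in graph mode would add a new op to the graph on every call. A bare-bones version of the same build-once pattern (our own sketch, TF1-style graph mode via the compat module):

import tensorflow.compat.v1 as tf
import horovod.tensorflow as hvd

tf.disable_eager_execution()
hvd.init()

metric_var = tf.Variable(0.0, name='val_loss')
# Built once; average=True is the default, device_dense pins the op to CPU.
avg_op = hvd.allreduce(metric_var, device_dense='/cpu:0')

with tf.Session() as sess:
    sess.run(metric_var.initializer)
    sess.run(metric_var.assign(float(hvd.rank())))  # stand-in local metric
    print(sess.run(avg_op))  # identical averaged value on every worker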
github snuspl / parallax / parallax / parallax / core / python / mpi / graph_transform.py
    if isinstance(grad_tensor, tf.Tensor):
        grad = grad_tensor
        grad_consumers = [c for c in grad.consumers()]
        agg_grad = hvd.allreduce(grad,
                                 average=True)
        update_consumers(grad_consumers, grad, agg_grad)
        update_control_consumers(op_to_control_consumer_ops[grad.op],
                                 grad.op, agg_grad.op)
    else:
        grad = grad_tensor.values
        indices = grad_tensor.indices
        dense_shape = grad_tensor.dense_shape
        grad_consumers = [c for c in grad.consumers()]
        indices_consumers = [c for c in indices.consumers()]
        agg_grad = \
            hvd.allreduce(tf.IndexedSlices(grad, indices, dense_shape),
                          average=config.average_sparse)
        update_consumers(grad_consumers, grad, agg_grad.values)
        update_consumers(indices_consumers, indices, agg_grad.indices)
        update_control_consumers(op_to_control_consumer_ops[grad.op],
                                 grad.op, agg_grad.values.op)
        update_control_consumers(
            op_to_control_consumer_ops[indices.op], indices.op,
            agg_grad.indices.op)
    gradients_info._grad = agg_grad
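
The else branch matters: hvd.allreduce also accepts a tf.IndexedSlices, and for sparse inputs Horovod performs an allgather of the values and indices from every worker instead of a dense sum. The result is again an IndexedSlices, which is why the code above can read agg_grad.values and agg_grad.indices. A small eager-mode sketch (our own, with hypothetical values):

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# A sparse gradient: each worker touches rows 0 and 2 of a 4 x 3 table.
sparse_grad = tf.IndexedSlices(
    values=tf.ones([2, 3]) * float(hvd.rank() + 1),
    indices=tf.constant([0, 2], dtype=tf.int64),
    dense_shape=tf.constant([4, 3], dtype=tf.int64))

# Horovod allgathers the slices, so the result has one entry per
# worker-row pair; with average=True the values are divided by hvd.size().
agg = hvd.allreduce(sparse_grad, average=True)
print(agg.values.shape, agg.indices.shape)  # (2 * size, 3) and (2 * size,)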
github THUNLP-MT / THUMT / thumt / bin / dist_trainer.py
        def all_reduce_fn(tensor):
            return hvd.allreduce(tensor, compression=hvd.Compression.fp16)
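
When the whole gradient pipeline should use compression, a per-tensor helper like the one above can be replaced by wrapping the optimizer, which allreduces all gradients with the given compressor before they are applied. A short sketch of that alternative (our own; hvd.DistributedOptimizer is the documented Horovod API):

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# Scale the learning rate by the number of workers, as Horovod recommends.
opt = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3 * hvd.size())
# Gradients are allreduced in fp16 before apply, matching all_reduce_fn above.
opt = hvd.DistributedOptimizer(opt, compression=hvd.Compression.fp16)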