How to use the horovod.tensorflow.size function in horovod

To help you get started, we’ve selected a few examples of horovod.tensorflow.size, based on popular ways it is used in public projects.
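
All of the examples below follow the same basic pattern: initialize Horovod, then use hvd.size() (the total number of Horovod worker processes, typically one per GPU) to scale step counts, batch sizes, and learning rates, or to shard data across workers. The snippet below is a minimal sketch of the calls involved, independent of any particular project:

import horovod.tensorflow as hvd

# Horovod must be initialized before size(), rank(), or local_rank() are queried.
hvd.init()

print('total workers :', hvd.size())        # number of Horovod processes across all nodes
print('global rank   :', hvd.rank())        # this worker's index in [0, size())
print('local rank    :', hvd.local_rank())  # this worker's index on its own node, used for GPU pinning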


github Edy-Barraza / Transformer_Distillation / network_distillation_distributed.py
def main(_):

  hvd.init()
  # Horovod: rank 0 writes to the top-level output dir; other ranks write to per-rank subdirectories.
  FLAGS.output_dir = FLAGS.output_dir if hvd.rank() == 0 else os.path.join(FLAGS.output_dir, str(hvd.rank()))
  # Horovod: split the step budget across workers so the global amount of work is unchanged.
  FLAGS.num_train_steps = FLAGS.num_train_steps // hvd.size()
  FLAGS.num_warmup_steps = FLAGS.num_warmup_steps // hvd.size()
  tf.logging.set_verbosity(tf.logging.INFO)

  if not FLAGS.do_train and not FLAGS.do_eval:
    raise ValueError("At least one of `do_train` or `do_eval` must be True.")

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  tf.gfile.MakeDirs(FLAGS.output_dir)

  input_files = []
  for input_pattern in FLAGS.input_file.split(","):
    input_files.extend(tf.gfile.Glob(input_pattern))

  tf.logging.info("*** Input Files ***")
  for input_file in input_files:
    tf.logging.info("  %s" % input_file)
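
The example above divides the single-worker step budget by hvd.size() so that the total amount of work stays the same as workers are added. The usual companion to this is scaling the learning rate by the number of workers and wrapping the optimizer in Horovod's DistributedOptimizer. A TF1-style sketch of that pattern (the base learning rate is a placeholder, not a value from the example above):

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# Scale the base learning rate with the number of workers so the effective
# update size matches the larger global batch.
base_learning_rate = 1e-4
optimizer = tf.train.AdamOptimizer(base_learning_rate * hvd.size())

# Average gradients across workers with allreduce before applying them.
optimizer = hvd.DistributedOptimizer(optimizer)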
github HewlettPackard / dlcookbook-dlbs / python / nvtfcnn_benchmarks / cnn / nvutils / runner.py
        nstep = num_training_samples * num_iter // global_batch_size
        decay_steps = nstep
    else:
        nstep = num_iter
        num_epochs = max(nstep * global_batch_size // num_training_samples, 1)
        decay_steps = 90 * num_training_samples // global_batch_size

    nstep_per_epoch = num_training_samples // global_batch_size

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    #config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.force_gpu_compatible = True # Force pinned memory
    config.intra_op_parallelism_threads = 1 # Avoid pool of Eigen threads
    config.inter_op_parallelism_threads = max(2, 40//hvd.size()-2)

    classifier = tf.estimator.Estimator(
        model_fn=_cnn_model_function,
        model_dir=log_dir,
        params={
            'model':         infer_func,
            'format':        image_format,
            'dtype' : tf.float16 if precision == 'fp16' else tf.float32,
            'momentum' : momentum,
            'learning_rate_init' : learning_rate_init,
            'learning_rate_power' : learning_rate_power,
            'decay_steps' : decay_steps,
            'weight_decay' : weight_decay,
            'loss_scale' : loss_scale,
            'larc_eta' : larc_eta,
            'larc_mode' : larc_mode,
github polyaxon / polyaxon / examples / in_cluster / horovod / tensorflow / mnist.py
    # Horovod: broadcast initial variable states from rank 0 to all other processes to ensure
    # consistent initialization of all workers when training is started with random weights or
    # restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data},
        y=train_labels,
        batch_size=100,
        num_epochs=None,
        shuffle=True)

    # Horovod: adjust number of steps based on number of GPUs.
    mnist_classifier.train(
        input_fn=train_input_fn,
        steps=3000 // hvd.size(),
        hooks=[logging_hook, bcast_hook])

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data},
        y=eval_labels,
        num_epochs=1,
        shuffle=False)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)

    # Polyaxon
    if hvd.rank() == 0:
        experiment.log_metrics(**eval_results)
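
In this example, steps=3000 // hvd.size() treats 3000 as the single-worker step budget and shrinks each worker's share as more join, while BroadcastGlobalVariablesHook(0) keeps all workers consistent at startup and hvd.rank() == 0 restricts metric logging to one process. A related convention is to write checkpoints only from rank 0 so workers do not overwrite each other's files; a minimal sketch (the directory name and step budget below are placeholders):

import horovod.tensorflow as hvd

hvd.init()

# Only rank 0 gets a checkpoint directory; other ranks pass None to the estimator.
model_dir = './checkpoints' if hvd.rank() == 0 else None

# Keep the global amount of work constant: each worker runs 1/size() of the
# steps a single-worker run would use.
steps_single_worker = 3000
steps_per_worker = steps_single_worker // hvd.size()

# Broadcast initial variable states from rank 0 so all workers start identically.
hooks = [hvd.BroadcastGlobalVariablesHook(0)]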
github NVIDIA / DALI / docs / examples / tensorflow / demo / nvutils / runner.py
    eval_idx_filenames = None
    if data_idx_dir is not None:
        filename_pattern = os.path.join(data_idx_dir, '%s-*')
        eval_idx_filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation'))
    else:
        raise ValueError("data_idx_dir must be specified")


    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    #config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.force_gpu_compatible = True # Force pinned memory
    config.intra_op_parallelism_threads = 1 # Avoid pool of Eigen threads
    config.inter_op_parallelism_threads = 40 // hvd.size() - 2

    classifier_eval = tf.estimator.Estimator(
        model_fn=_cnn_model_function,
        model_dir=log_dir,
        params={
            'model':         infer_func,
            'format':        image_format,
            'dtype' : tf.float16 if precision == 'fp16' else tf.float32,
            'momentum' : momentum,
            'learning_rate_init' : learning_rate_init,
            'learning_rate_power' : learning_rate_power,
            'decay_steps' : None,
            'weight_decay' : weight_decay,
            'loss_scale' : loss_scale,
            'larc_eta' : larc_eta,
            'larc_mode' : larc_mode,
github horovod / horovod / test / test_tensorflow.py
    def test_horovod_allgather_grad_cpu(self):
        """Test the correctness of the allgather gradient on CPU."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # As of TensorFlow v1.9, gradients are not supported on
        # integer tensors
        dtypes = [tf.float32, tf.float64]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
            tensor_sizes = tensor_sizes[:size]

            if _executing_eagerly():
                with tf.GradientTape() as tape:
                    tensor = self.tfe.Variable(
                        tf.ones([tensor_sizes[rank]] + [17] * (dim - 1)) * rank)
                    if dtype == tf.bool:
                        tensor = tensor % 2
                    tensor = tf.cast(tensor, dtype=dtype)
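
This test trims a fixed list of per-rank tensor sizes down to hvd.size() entries, so every worker contributes a tensor whose first dimension depends on its rank before the allgather gradient is checked. Outside the test suite, the same idea looks roughly like the sketch below (the shapes are illustrative only):

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# Each rank contributes a tensor whose first dimension depends on its rank;
# allgather concatenates the contributions along dimension 0 on every worker.
local_rows = hvd.rank() + 1
tensor = tf.ones([local_rows, 4]) * hvd.rank()
gathered = hvd.allgather(tensor)

# 'gathered' has one block of rows per rank and is identical on all workers.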
github NVIDIA / DeepLearningExamples / TensorFlow / LanguageModeling / BERT / run_classifier.py
  label_list = processor.get_labels()

  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  master_process = True
  training_hooks = []
  global_batch_size = FLAGS.train_batch_size
  hvd_rank = 0

  config = tf.ConfigProto()
  if FLAGS.horovod:

      tf.logging.info("Multi-GPU training with TF Horovod")
      tf.logging.info("hvd.size() = %d hvd.rank() = %d", hvd.size(), hvd.rank())
      global_batch_size = FLAGS.train_batch_size * hvd.size()
      master_process = (hvd.rank() == 0)
      hvd_rank = hvd.rank()
      config.gpu_options.allow_growth = True
      config.gpu_options.visible_device_list = str(hvd.local_rank())
      if hvd.size() > 1:
          training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
  if FLAGS.use_xla:
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

  run_config = tf.estimator.RunConfig(
      model_dir=FLAGS.output_dir if master_process else None,
      session_config=config,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None,
      keep_checkpoint_max=1)
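
Here the effective (global) batch size is the per-GPU batch size multiplied by hvd.size(), and checkpoints are written only by the master process (rank 0). A typical follow-on is to derive the number of training steps from that global batch size; the values below are hypothetical, not taken from run_classifier.py:

import horovod.tensorflow as hvd

hvd.init()

# Hypothetical dataset size and schedule for illustration.
num_train_examples = 100000
num_train_epochs = 3.0
train_batch_size = 32

# Effective batch size across all workers.
global_batch_size = train_batch_size * hvd.size()

# Each optimizer step consumes one global batch, so the step count shrinks
# as workers are added.
num_train_steps = int(num_train_examples / global_batch_size * num_train_epochs)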
github yenchenlin / pix2pix-flow / two_encode.py
def get_data(hps, sess):
    if hps.image_size == -1:
        hps.image_size = {'edges2shoes': 32, 'mnist': 32, 'cifar10': 32, 'imagenet-oord': 64,
                          'imagenet': 256, 'celeba': 256, 'lsun_realnvp': 64, 'lsun': 256}[hps.problem]
    if hps.n_test == -1:
        hps.n_test = {'edges2shoes': 200, 'mnist': 10000, 'cifar10': 10000, 'imagenet-oord': 50000, 'imagenet': 50000,
                      'celeba': 3000, 'lsun_realnvp': 300*hvd.size(), 'lsun': 300*hvd.size()}[hps.problem]
    hps.n_y = {'edges2shoes': 10, 'mnist': 10, 'cifar10': 10, 'imagenet-oord': 1000,
               'imagenet': 1000, 'celeba': 1, 'lsun_realnvp': 1, 'lsun': 1}[hps.problem]
    if hps.data_dir == "":
        hps.data_dir = {'edges2shoes': None, 'mnist': None, 'cifar10': None, 'imagenet-oord': '/mnt/host/imagenet-oord-tfr', 'imagenet': '/mnt/host/imagenet-tfr',
                        'celeba': '/mnt/host/celeba-reshard-tfr', 'lsun_realnvp': '/mnt/host/lsun_realnvp', 'lsun': '/mnt/host/lsun'}[hps.problem]

    if hps.problem == 'lsun_realnvp':
        hps.rnd_crop = True
    else:
        hps.rnd_crop = False

    if hps.category:
        hps.data_dir += ('/%s' % hps.category)

    # Use anchor_size to rescale batch size based on image_size
    s = hps.anchor_size
github microsoft / DistributedDeepLearning / HorovodTF / src / imagenet_estimator_tf_horovod.py
def _log_summary(data_length, duration):
    logger = _get_logger()
    images_per_second = data_length / duration
    logger.info('Data length:      {}'.format(data_length))
    logger.info('Total duration:   {:.3f}'.format(duration))
    logger.info('Total images/sec: {:.3f}'.format(images_per_second))
    logger.info('Batch size:       (Per GPU {}: Total {})'.format(_BATCHSIZE,
                                                                  hvd.size() * _BATCHSIZE if _DISTRIBUTED else _BATCHSIZE))
    logger.info('Distributed:      {}'.format('True' if _DISTRIBUTED else 'False'))
    logger.info('Num GPUs:         {:.3f}'.format(hvd.size() if _DISTRIBUTED else 1))
    logger.info('Dataset:          {}'.format('Synthetic' if _FAKE else 'Imagenet'))
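
The helper above multiplies the reported batch size and GPU count by hvd.size() when the run is distributed. More generally, per-worker measurements can be turned into cluster-wide figures the same way; the sketch below uses made-up per-worker numbers rather than anything computed by this script:

import horovod.tensorflow as hvd

hvd.init()

images_per_sec_per_worker = 1500.0   # hypothetical per-worker throughput
batch_size_per_gpu = 64              # hypothetical per-GPU batch size

# With data parallelism every worker processes its own batches, so aggregate
# throughput and effective batch size both scale with hvd.size().
total_images_per_sec = images_per_sec_per_worker * hvd.size()
effective_batch_size = batch_size_per_gpu * hvd.size()
print('workers=%d  total images/sec=%.1f  effective batch=%d'
      % (hvd.size(), total_images_per_sec, effective_batch_size))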
github awslabs / sagemaker-debugger / examples / tensorflow / scripts / train_imagenet_resnet_hvd.py
increased_aug=False,
):
    if synthetic and training:
        input_shape = [height, width, 3]
        input_element = nest.map_structure(
            lambda s: tf.constant(0.5, tf.float32, s), tf.TensorShape(input_shape)
        )
        label_element = nest.map_structure(
            lambda s: tf.constant(1, tf.int32, s), tf.TensorShape([1])
        )
        element = (input_element, label_element)
        ds = tf.data.Dataset.from_tensors(element).repeat()
    else:
        shuffle_buffer_size = 10000
        num_readers = 1
        if hvd.size() > len(filenames):
            assert (hvd.size() % len(filenames)) == 0
            filenames = filenames * (hvd.size() // len(filenames))  # integer multiplier; '/' would yield a float and break list repetition

        ds = tf.data.Dataset.from_tensor_slices(filenames)
        if shard:
            # split the dataset into parts for each GPU
            ds = ds.shard(hvd.size(), hvd.rank())

        if not training:
            # make sure all ranks have the same amount
            ds = ds.take(take_count)

        if training:
            ds = ds.shuffle(1000, seed=7 * (1 + hvd.rank()))

        ds = ds.interleave(tf.data.TFRecordDataset, cycle_length=num_readers, block_length=1)
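
The check above only covers the case where hvd.size() is an exact multiple of the file count; repeating the file list keeps ds.shard() from handing any worker an empty shard. A minimal standalone version of the sharded pipeline, assuming a hypothetical list of TFRecord paths, looks like this:

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# Hypothetical TFRecord paths; replace with the real training files.
filenames = ['train-00000.tfrecord', 'train-00001.tfrecord']

# If there are more workers than files, repeat the list so every worker
# still receives at least one file after sharding.
if hvd.size() > len(filenames):
    assert hvd.size() % len(filenames) == 0
    filenames = filenames * (hvd.size() // len(filenames))

ds = tf.data.Dataset.from_tensor_slices(filenames)
# Give each worker a disjoint 1/size() slice of the files.
ds = ds.shard(hvd.size(), hvd.rank())
ds = ds.interleave(tf.data.TFRecordDataset, cycle_length=1)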