How to use the horovod.tensorflow.BroadcastGlobalVariablesHook function in horovod

To help you get started, we’ve selected a few Horovod examples, based on popular ways this hook is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github jianzfb / antgo / antgo / trainer / tftrainer.py View on Github external
restore_fns = _get_init_fn(self, model, self.dump_dir, self.ctx)
          if restore_fns is not None:
            for restore_fn in restore_fns:
              restore_fn(self.sess)

          # Restore from custom auxiliary init funcs
          for func in self._aux_init_funcs:
            func(self.sess)

        # Restore from auxiliary checkpoints
        for auxilary_scope, auxilary_checkpoint in self.auxilary_checkpoints.items():
          self.restore_scopy_from(model, auxilary_scope, auxilary_checkpoint)

        # Horovod: broadcast global variables from rank 0
        if self.is_distribute_training:
          bgv = hvd.BroadcastGlobalVariablesHook(0)
          bgv.begin()
          bgv.after_create_session(self.sess, self.coord)
github NVIDIA / DeepLearningExamples / TensorFlow / Classification / RN50v1.5 / runtime / runner.py View on Github external
else:

                training_logging_hook = hooks.TrainingLoggingHook(
                    log_file_path=os.path.join(self.run_hparams.log_dir, "training.json"),
                    global_batch_size=global_batch_size,
                    num_steps=num_steps,
                    num_samples=num_samples,
                    num_epochs=num_epochs,
                    log_every=log_every_n_steps
                )

                training_hooks.append(training_logging_hook)

        if hvd_utils.is_using_hvd():
            bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
            training_hooks.append(bcast_hook)

        training_hooks.append(hooks.PrefillStagingAreasHook())

      
        estimator_params = {
            'batch_size': batch_size,
            'steps_per_epoch': steps_per_epoch,
            'num_gpus': num_gpus,
            'momentum': momentum,
            'learning_rate_init': learning_rate_init,
            'weight_decay': weight_decay,
            'loss_scale': loss_scale,
            'apply_loss_scaling': use_static_loss_scaling
        }
github feifeibear / Distributed-ResNet-Tensorflow / resnet_cifar_main_horovod.py View on Github external
elif train_step < 60000:
        self._lrn_rate = 0.01
      elif train_step < 80000:
        self._lrn_rate = 0.001
      else:
        self._lrn_rate = 0.0001

  if FLAGS.job_name == None: 
    #serial version
    
    checkpoint_dir = FLAGS.log_root if hvd.rank() == 0 else None

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=checkpoint_dir,
        save_checkpoint_secs=60,
        hooks=[hvd.BroadcastGlobalVariablesHook(0), logging_hook, _LearningRateSetterHook()],
        chief_only_hooks=[summary_hook],
        # Since we provide a SummarySaverHook, we need to disable default
        # SummarySaverHook. To do that we set save_summaries_steps to 0.
        save_summaries_steps=0,
        config=create_config_proto()) as mon_sess:
      while not mon_sess.should_stop():
        mon_sess.run(model.train_op)

  else:
    is_chief = (FLAGS.task_index == 0)
    with tf.train.MonitoredTrainingSession(
        master=server.target,
        is_chief=is_chief,
        checkpoint_dir=FLAGS.log_root,
        save_checkpoint_secs=60,
        hooks=[logging_hook, _LearningRateSetterHook()],
github renmengye / revnet-public / run_imagenet_train_horovod.py View on Github external
with log.verbose_level(2):
        m = get_model("resnet", config, **kwargs)

  global_step = tf.get_variable(
      "global_step", [],
      initializer=tf.constant_initializer(0),
      trainable=False,
      dtype=tf.int64)
  lr = tf.train.piecewise_constant(
      global_step, config.learn_rate_decay_steps,
      [config.learn_rate] + list(config.learn_rate_list))
  m._lr = lr
  m._global_step = global_step
  opt = tf.train.MomentumOptimizer(lr, 0.9)
  opt = hvd.DistributedOptimizer(opt)
  hooks = [hvd.BroadcastGlobalVariablesHook(0)]
  m._train_op = opt.minimize(m.cost, global_step=global_step, name="train_step")
  tf.summary.scalar("train ce", m.cross_ent)
  return m, hooks
github criteo / tf-yarn / examples / collective_all_reduce_example.py View on Github external
def eval_input_fn():
        """Return the "test" split of the dataset, shuffled and then batched."""
        test_split = winequality.get_dataset(WINE_EQUALITY_FILE, split="test")
        shuffled = test_split.shuffle(1000)
        return shuffled.batch(128)

    estimator = tf.compat.v1.estimator.LinearClassifier(
        feature_columns=winequality.get_feature_columns(),
        model_dir=f"{HDFS_DIR}",
        n_classes=winequality.get_n_classes(),
        optimizer=lambda: hvd.DistributedOptimizer(tf.compat.v1.train.AdamOptimizer()))

    return Experiment(
        estimator,
        tf.estimator.TrainSpec(
            train_input_fn,
            max_steps=10,
            hooks=[hvd.BroadcastGlobalVariablesHook(0)]
        ),
        tf.estimator.EvalSpec(
            eval_input_fn,
            steps=10,
            start_delay_secs=0,
            throttle_secs=30
        )
github microsoft / DistributedDeepLearning / {{cookiecutter.project_name}} / TensorFlow_imagenet / src / resnet_main.py View on Github external
def _get_hooks(batch_size, is_distributed=defaults.DISTRIBUTED):
    """Assemble the list of hooks for a training run.

    Args:
        batch_size: Batch size handed to ``ExamplesPerSecondHook``; it is
            multiplied by ``hvd.size()`` first when ``is_distributed`` is true.
        is_distributed: Whether to build the Horovod variant of the hook list.

    Returns:
        ``[broadcast hook, examples-per-second hook]`` when distributed,
        otherwise a one-element list holding only the examples-per-second hook.
    """
    log = logging.getLogger(__name__)

    if not is_distributed:
        return [ExamplesPerSecondHook(batch_size)]

    # Scale the batch size by the number of Horovod processes for the
    # throughput hook.
    throughput_hook = ExamplesPerSecondHook(batch_size * hvd.size())
    # Variables are broadcast with root rank 0.
    broadcast_hook = hvd.BroadcastGlobalVariablesHook(0)
    cluster_message = "Rank: {} Cluster Size {}".format(hvd.rank(), hvd.size())
    log.info(cluster_message)
    return [broadcast_hook, throughput_hook]
github microsoft / DistributedDeepLearning / ImagenetEstimatorHorovod.py View on Github external
rfig = tf.estimator.RunConfig(save_checkpoints_steps=10000, session_config=config)

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    model_dir = os.getenv('AZ_BATCHAI_OUTPUT_MODEL') if hvd.rank() == 0 else None

    params = {"learning_rate": LR}
    # rfig = tf.estimator.RunConfig(save_checkpoints_steps=1000)
    logger.info('Creating estimator with params: {}'.format(params))
    model = tf.estimator.Estimator(model_fn=model_fn,
                                   params=params,
                                   model_dir=model_dir,
                                   config=rfig)

    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
    logger.info('{} {}'.format(hvd.local_rank(), hvd.size()))
    for epoch in range(EPOCHS):
        logger.info('Running epoch {}...'.format(epoch))
        model.train(input_fn=train_input_fn, steps=10000, hooks=[bcast_hook])  # data_length//batch_size
        logger.info('Validation...')
        model.evaluate(input_fn=validation_input_fn, steps=10)  # validation_length//batch_size
github joeyearsley / efficient_densenet_tensorflow / train.py View on Github external
custom_params = {
        'data_format': FLAGS.data_format,
        'batch_size': FLAGS.batch_size,
        'fp16': FLAGS.fp16,
        'efficient': FLAGS.efficient
    }

    features, labels = input_fn(True, FLAGS.data_dir, FLAGS.batch_size, None)
    with tf.variable_scope('model', custom_getter=float32_variable_storage_getter):
        train_op, loss, global_step = cifar10_model_fn(features, labels, custom_params)

    # BroadcastGlobalVariablesHook broadcasts initial variable states from rank 0
    # to all other processes. This is necessary to ensure consistent initialization
    # of all workers when training is started with random weights or restored
    # from a checkpoint.
    hooks = [hvd.BroadcastGlobalVariablesHook(0),
             tf.train.StopAtStepHook(last_step=10000),
             tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                        every_n_iter=10),
             ]

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Save checkpoints only on worker 0 to prevent other workers from corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
github NVIDIA / DeepLearningExamples / TensorFlow / LanguageModeling / BERT / run_ner.py View on Github external
is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    master_process = True
    training_hooks = []
    global_batch_size = FLAGS.train_batch_size
    hvd_rank = 0

    config = tf.ConfigProto()
    if FLAGS.horovod:
      global_batch_size = FLAGS.train_batch_size * hvd.size()
      master_process = (hvd.rank() == 0)
      hvd_rank = hvd.rank()
      config.gpu_options.visible_device_list = str(hvd.local_rank())
      if hvd.size() > 1:
        training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))

    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    run_config = tf.estimator.RunConfig(
      model_dir=FLAGS.output_dir if master_process else None,
      session_config=config,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None,
      keep_checkpoint_max=1)

    if master_process:
      tf.logging.info("***** Configuaration *****")
      for key in FLAGS.__flags.keys():
          tf.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
      tf.logging.info("**************************")

    train_examples = None