How to use the horovod.tensorflow module in horovod

To help you get started, we’ve selected a few horovod examples, based on popular ways it is used in public projects.

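Before diving into the project-specific snippets, here is a minimal sketch of the typical horovod.tensorflow workflow that all of them build on (TF1-style; the hyperparameter values and the checkpoint path are illustrative): initialize Horovod, pin one GPU per process, scale the learning rate by the number of workers, wrap the optimizer in hvd.DistributedOptimizer, and broadcast the initial variables from rank 0.

import tensorflow as tf
import horovod.tensorflow as hvd

# Initialize Horovod; must be called before any other hvd.* function.
hvd.init()

# Pin each process to a single GPU, keyed by its local rank on the host.
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())

# Scale the base learning rate by the number of workers (a common Horovod convention).
opt = tf.train.MomentumOptimizer(learning_rate=0.01 * hvd.size(), momentum=0.9)

# Wrap the optimizer so gradients are averaged across workers via allreduce.
opt = hvd.DistributedOptimizer(opt)

# Broadcast the initial variables from rank 0 so every worker starts identically,
# and let only rank 0 write checkpoints to avoid corrupting them.
hooks = [hvd.BroadcastGlobalVariablesHook(0)]
checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None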

github NVIDIA / DALI / docs / examples / tensorflow / demo / nvutils / runner.py
    filename_pattern = os.path.join(data_dir, '%s-*')
    eval_filenames  = sorted(tf.gfile.Glob(filename_pattern % 'validation'))
    num_eval_samples = _get_num_records(eval_filenames)

    eval_idx_filenames = None
    if data_idx_dir is not None:
        filename_pattern = os.path.join(data_idx_dir, '%s-*')
        eval_idx_filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation'))
    else:
        raise ValueError("data_idx_dir must be specified")


    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    #config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.force_gpu_compatible = True # Force pinned memory
    config.intra_op_parallelism_threads = 1 # Avoid pool of Eigen threads
    config.inter_op_parallelism_threads = 40 // hvd.size() - 2

    classifier_eval = tf.estimator.Estimator(
        model_fn=_cnn_model_function,
        model_dir=log_dir,
        params={
            'model':         infer_func,
            'format':        image_format,
            'dtype' : tf.float16 if precision == 'fp16' else tf.float32,
            'momentum' : momentum,
            'learning_rate_init' : learning_rate_init,
            'learning_rate_power' : learning_rate_power,
            'decay_steps' : None,
            'weight_decay' : weight_decay,
github polyaxon / polyaxon / examples / in_cluster / horovod / tensorflow / mnist.py
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (train_data, train_labels), (eval_data, eval_labels) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # Polyaxon
    if hvd.rank() == 0:
        experiment.log_data_ref(data=train_data, data_name='x_train')
        experiment.log_data_ref(data=train_labels, data_name='y_train')
        experiment.log_data_ref(data=eval_data, data_name='x_test')
        experiment.log_data_ref(data=eval_labels, data_name='y_test')

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    train_data = np.reshape(train_data, (-1, 784)) / 255.0
    eval_data = np.reshape(eval_data, (-1, 784)) / 255.0

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
github Tejalsjsu / DeepGradientCompression / examples / BERT / run_classifier.py
      if hvd.rank() < remainder:
        start_index = hvd.rank() * (num_examples_per_rank+1)
        end_index = start_index + num_examples_per_rank + 1
      else:
        start_index = hvd.rank() * num_examples_per_rank + remainder
        end_index = start_index + (num_examples_per_rank)

  model_fn = model_fn_builder(
      bert_config=bert_config,
      num_labels=len(label_list),
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(),
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_one_hot_embeddings=False,
      hvd=None if not FLAGS.horovod else hvd)

  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=run_config)

  if FLAGS.do_train:

    file_based_convert_examples_to_features(
        train_examples[start_index:end_index], label_list, FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank])

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num examples = %d", len(train_examples))
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    train_input_fn = file_based_input_fn_builder(
        input_file=tmp_filenames,
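The index arithmetic in the snippet above splits the training examples evenly across ranks before the per-rank feature files are written. When the input is already a tf.data pipeline, the same effect is commonly achieved by sharding the dataset by rank (a one-line sketch, assuming a dataset variable built elsewhere):

# Each worker reads only its own 1/hvd.size() slice of the data.
dataset = dataset.shard(hvd.size(), hvd.rank())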
github NVIDIA / DeepLearningExamples / TensorFlow / Classification / RN50v1.5 / model / resnet_v1_5.py
                            num_batches_per_epoch=params["steps_per_epoch"],
                            num_gpus=params["num_gpus"]
                        )

                    tf.identity(learning_rate, name='learning_rate_ref')
                    tf.summary.scalar('learning_rate', learning_rate)

                    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=params["momentum"])

                    if params["apply_loss_scaling"]:

                        optimizer = FixedLossScalerOptimizer(optimizer, scale=params["loss_scale"])

                    if hvd_utils.is_using_hvd():
                        
                        optimizer = hvd.DistributedOptimizer(optimizer)

                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                    if mode != tf.estimator.ModeKeys.TRAIN:
                        update_ops += [acc_top1_update_op, acc_top5_update_op]
                    
                    deterministic = True
                    gate_gradients = (tf.train.Optimizer.GATE_OP if deterministic else tf.train.Optimizer.GATE_NONE)

                    backprop_op = optimizer.minimize(total_loss, gate_gradients=gate_gradients, global_step=global_step)

                    
                    if self.model_hparams.use_dali:
                    
                        train_ops = tf.group(backprop_op, update_ops, name='train_ops')
                    
                    else:
github armandmcqueen / tensorpack-mask-rcnn / MaskRCNN_no_batch / eval.py
    def _eval(self):
        logdir = self._output_dir
        if cfg.TRAINER == 'replicated':
            all_results = multithread_predict_dataflow(self.dataflows, self.predictors)
        else:
            filenames = [os.path.join(
                logdir, 'outputs{}-part{}.json'.format(self.global_step, rank)
            ) for rank in range(hvd.local_size())]

            if self._horovod_run_eval:
                local_results = predict_dataflow(self.dataflow, self.predictor)
                fname = filenames[hvd.local_rank()]
                with open(fname, 'w') as f:
                    json.dump(local_results, f)
            self.barrier.eval()
            if hvd.rank() > 0:
                return
            all_results = []
            for fname in filenames:
                with open(fname, 'r') as f:
                    obj = json.load(f)
                all_results.extend(obj)
                os.unlink(fname)

        output_file = os.path.join(
            logdir, '{}-outputs{}.json'.format(self._eval_dataset, self.global_step))

        scores = DetectionDataset().eval_or_save_inference_results(
            all_results, self._eval_dataset, output_file)
        for k, v in scores.items():
            self.trainer.monitors.put_scalar(k, v)
github aws-samples / deep-learning-models / legacy / models / resnet / tensorflow2 / train_tf2_resnet.py
    cmdline = add_cli_args()
    FLAGS, unknown_args = cmdline.parse_known_args()
    ds = create_data(FLAGS.data_dir, FLAGS.synthetic, FLAGS.batch_size)
    model = tf.keras.applications.ResNet50(weights=None, classes=1000)
    opt = tf.keras.optimizers.SGD(learning_rate=FLAGS.learning_rate * hvd.size(), momentum=0.1)
    loss_func = tf.keras.losses.SparseCategoricalCrossentropy()

    loop_time = time()
    if hvd.local_rank() == 0:
        print("Step \t Throughput \t Loss")
    for batch, (images, labels) in enumerate(ds):
        loss = train_step(model, opt, loss_func, images, labels, batch==0)
        if hvd.local_rank() == 0:
            duration = time() - loop_time
            loop_time = time()
            throughput = (hvd.size()*FLAGS.batch_size)/duration
            print("{} \t images/sec: {} \t {}".format(batch, throughput, loss))
        if batch==FLAGS.num_batches:
            break
    if hvd.rank() == 0:
        print("\nFinished in {}".format(time()-start))
github awslabs / sagemaker-debugger / tornasole / tensorflow / hook.py
"""
        This function returns the name of the worker based on
        the distribution strategy.

        We do not use this function for MirroredStrategy.
        Device names are used as worker names for this MirroredStrategy.
        The names of the workers are managed by device_map in the case of this strategy.

        It is safe to return the TORNASOLE_CONFIG_DEFAULT_WORKER_NAME in this case.
        :return: str
        """
        try:
            import horovod.tensorflow as hvd

            if hvd.size():
                return f"worker_{hvd.rank()}"
        except (ModuleNotFoundError, ValueError, ImportError):
            pass
        tf_config = os.getenv("TF_CONFIG")
        if tf_config and is_parameter_server_strategy(tf_config):
            return get_worker_id_from_tf_config(tf_config)
        return CONFIG_DEFAULT_WORKER_NAME
github THUNLP-MT / THUMT / thumt / bin / dist_trainer.py
def session_config():
    optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L1,
                                            do_function_inlining=True)
    graph_options = tf.GraphOptions(optimizer_options=optimizer_options)
    config = tf.ConfigProto(allow_soft_placement=True,
                            graph_options=graph_options)
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    return config
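A config like this is typically handed to the training session together with Horovod's broadcast hook; a minimal sketch of how it might be consumed (train_op stands in for a training operation built elsewhere, and hvd.init() is assumed to have been called already):

hooks = [hvd.BroadcastGlobalVariablesHook(0)]
with tf.train.MonitoredTrainingSession(config=session_config(),
                                       hooks=hooks) as sess:
    while not sess.should_stop():
        # train_op is the training operation built elsewhere in the script.
        sess.run(train_op)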
github aws-samples / deep-learning-models / legacy / models / resnet / tensorflow / train_imagenet_resnet_hvd.py
            'lc_beta': FLAGS.lc_beta,
            'loss_scale': FLAGS.loss_scale,
            'adv_bn_init': FLAGS.adv_bn_init,
            'conv_init': tf.variance_scaling_initializer() if FLAGS.adv_conv_init else None
        },
        config=tf.estimator.RunConfig(
            # tf_random_seed=31 * (1 + hvd.rank()),
            session_config=config,
            save_summary_steps=FLAGS.save_summary_steps if do_checkpoint else None,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps if do_checkpoint else None,
            keep_checkpoint_max=None))

    if not FLAGS.eval:
        num_preproc_threads = FLAGS.num_parallel_calls
        rank0log(logger, "Using preprocessing threads per GPU: ", num_preproc_threads)
        training_hooks = [hvd.BroadcastGlobalVariablesHook(0),
                          PrefillStagingAreasHook()]
        if hvd.rank() == 0:
            training_hooks.append(
                LogSessionRunHook(global_batch_size,
                                  num_training_samples,
                                  FLAGS.display_every, logger))
        try:
            start_time = time.time()
            classifier.train(
                input_fn=lambda: make_dataset(
                    train_filenames,
                    training_samples_per_rank,
                    FLAGS.batch_size, height, width, 
                    FLAGS.brightness, FLAGS.contrast, FLAGS.saturation, FLAGS.hue, 
                    training=True, num_threads=num_preproc_threads, 
                    shard=True, synthetic=FLAGS.synthetic, increased_aug=FLAGS.increased_aug),