How to use the horovod.tensorflow module in horovod

To help you get started, we’ve selected a few horovod examples, based on popular ways it is used in public projects.

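Before diving into the project-specific snippets, here is a minimal sketch of the typical horovod.tensorflow workflow that all of them build on (TF1-style; the hyperparameter values and the checkpoint path are illustrative): initialize Horovod, pin one GPU per process, scale the learning rate by the number of workers, wrap the optimizer in hvd.DistributedOptimizer, and broadcast the initial variables from rank 0.

import tensorflow as tf
import horovod.tensorflow as hvd

# Initialize Horovod; must be called before any other hvd.* function.
hvd.init()

# Pin each process to a single GPU, keyed by its local rank on the host.
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())

# Scale the base learning rate by the number of workers (a common Horovod convention).
opt = tf.train.MomentumOptimizer(learning_rate=0.01 * hvd.size(), momentum=0.9)

# Wrap the optimizer so gradients are averaged across workers via allreduce.
opt = hvd.DistributedOptimizer(opt)

# Broadcast the initial variables from rank 0 so every worker starts identically,
# and let only rank 0 write checkpoints to avoid corrupting them.
hooks = [hvd.BroadcastGlobalVariablesHook(0)]
checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None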

github NVIDIA / DALI / docs / examples / tensorflow / demo / nvutils / runner.py
    filename_pattern = os.path.join(data_dir, '%s-*')
    eval_filenames  = sorted(tf.gfile.Glob(filename_pattern % 'validation'))
    num_eval_samples = _get_num_records(eval_filenames)

    eval_idx_filenames = None
    if data_idx_dir is not None:
        filename_pattern = os.path.join(data_idx_dir, '%s-*')
        eval_idx_filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation'))
    else:
        raise ValueError("data_idx_dir must be specified")


    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    #config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.force_gpu_compatible = True # Force pinned memory
    config.intra_op_parallelism_threads = 1 # Avoid pool of Eigen threads
    config.inter_op_parallelism_threads = 40 // hvd.size() - 2

    classifier_eval = tf.estimator.Estimator(
        model_fn=_cnn_model_function,
        model_dir=log_dir,
        params={
            'model':         infer_func,
            'format':        image_format,
            'dtype' : tf.float16 if precision == 'fp16' else tf.float32,
            'momentum' : momentum,
            'learning_rate_init' : learning_rate_init,
            'learning_rate_power' : learning_rate_power,
            'decay_steps' : None,
            'weight_decay' : weight_decay,
github polyaxon / polyaxon / examples / in_cluster / horovod / tensorflow / mnist.py
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (train_data, train_labels), (eval_data, eval_labels) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # Polyaxon
    if hvd.rank() == 0:
        experiment.log_data_ref(data=train_data, data_name='x_train')
        experiment.log_data_ref(data=train_labels, data_name='y_train')
        experiment.log_data_ref(data=eval_data, data_name='x_test')
        experiment.log_data_ref(data=eval_labels, data_name='y_test')

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    train_data = np.reshape(train_data, (-1, 784)) / 255.0
    eval_data = np.reshape(eval_data, (-1, 784)) / 255.0

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
github Tejalsjsu / DeepGradientCompression / examples / BERT / run_classifier.py
      if hvd.rank() < remainder:
        start_index = hvd.rank() * (num_examples_per_rank+1)
        end_index = start_index + num_examples_per_rank + 1
      else:
        start_index = hvd.rank() * num_examples_per_rank + remainder
        end_index = start_index + (num_examples_per_rank)

  model_fn = model_fn_builder(
      bert_config=bert_config,
      num_labels=len(label_list),
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(),
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_one_hot_embeddings=False,
      hvd=None if not FLAGS.horovod else hvd)

  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=run_config)

  if FLAGS.do_train:

    file_based_convert_examples_to_features(
        train_examples[start_index:end_index], label_list, FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank])

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num examples = %d", len(train_examples))
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    train_input_fn = file_based_input_fn_builder(
        input_file=tmp_filenames,
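The index arithmetic in the snippet above splits the training examples evenly across ranks before the per-rank feature files are written. When the input is already a tf.data pipeline, the same effect is commonly achieved by sharding the dataset by rank (a one-line sketch, assuming a dataset variable built elsewhere):

# Each worker reads only its own 1/hvd.size() slice of the data.
dataset = dataset.shard(hvd.size(), hvd.rank())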
github NVIDIA / DeepLearningExamples / TensorFlow / Classification / RN50v1.5 / model / resnet_v1_5.py
                            num_batches_per_epoch=params["steps_per_epoch"],
                            num_gpus=params["num_gpus"]
                        )

                    tf.identity(learning_rate, name='learning_rate_ref')
                    tf.summary.scalar('learning_rate', learning_rate)

                    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=params["momentum"])

                    if params["apply_loss_scaling"]:

                        optimizer = FixedLossScalerOptimizer(optimizer, scale=params["loss_scale"])

                    if hvd_utils.is_using_hvd():
                        
                        optimizer = hvd.DistributedOptimizer(optimizer)

                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                    if mode != tf.estimator.ModeKeys.TRAIN:
                        update_ops += [acc_top1_update_op, acc_top5_update_op]
                    
                    deterministic = True
                    gate_gradients = (tf.train.Optimizer.GATE_OP if deterministic else tf.train.Optimizer.GATE_NONE)

                    backprop_op = optimizer.minimize(total_loss, gate_gradients=gate_gradients, global_step=global_step)

                    
                    if self.model_hparams.use_dali:
                    
                        train_ops = tf.group(backprop_op, update_ops, name='train_ops')
                    
                    else:
github armandmcqueen / tensorpack-mask-rcnn / MaskRCNN_no_batch / eval.py
    def _eval(self):
        logdir = self._output_dir
        if cfg.TRAINER == 'replicated':
            all_results = multithread_predict_dataflow(self.dataflows, self.predictors)
        else:
            filenames = [os.path.join(
                logdir, 'outputs{}-part{}.json'.format(self.global_step, rank)
            ) for rank in range(hvd.local_size())]

            if self._horovod_run_eval:
                local_results = predict_dataflow(self.dataflow, self.predictor)
                fname = filenames[hvd.local_rank()]
                with open(fname, 'w') as f:
                    json.dump(local_results, f)
            self.barrier.eval()
            if hvd.rank() > 0:
                return
            all_results = []
            for fname in filenames:
                with open(fname, 'r') as f:
                    obj = json.load(f)
                all_results.extend(obj)
                os.unlink(fname)

        output_file = os.path.join(
            logdir, '{}-outputs{}.json'.format(self._eval_dataset, self.global_step))

        scores = DetectionDataset().eval_or_save_inference_results(
            all_results, self._eval_dataset, output_file)
        for k, v in scores.items():
            self.trainer.monitors.put_scalar(k, v)
github aws-samples / deep-learning-models / legacy / models / resnet / tensorflow2 / train_tf2_resnet.py
    cmdline = add_cli_args()
    FLAGS, unknown_args = cmdline.parse_known_args()
    ds = create_data(FLAGS.data_dir, FLAGS.synthetic, FLAGS.batch_size)
    model = tf.keras.applications.ResNet50(weights=None, classes=1000)
    opt = tf.keras.optimizers.SGD(learning_rate=FLAGS.learning_rate * hvd.size(), momentum=0.1)
    loss_func = tf.keras.losses.SparseCategoricalCrossentropy()

    loop_time = time()
    if hvd.local_rank() == 0:
        print("Step \t Throughput \t Loss")
    for batch, (images, labels) in enumerate(ds):
        loss = train_step(model, opt, loss_func, images, labels, batch==0)
        if hvd.local_rank() == 0:
            duration = time() - loop_time
            loop_time = time()
            throughput = (hvd.size()*FLAGS.batch_size)/duration
            print("{} \t images/sec: {} \t {}".format(batch, throughput, loss))
        if batch==FLAGS.num_batches:
            break
    if hvd.rank() == 0:
        print("\nFinished in {}".format(time()-start))
github awslabs / sagemaker-debugger / tornasole / tensorflow / hook.py
"""
        This function returns the name of the worker based on
        the distribution strategy.

        We do not use this function for MirroredStrategy.
        Device names are used as worker names for this MirroredStrategy.
        The names of the workers are managed by device_map in the case of this strategy.

        It is safe to return the TORNASOLE_CONFIG_DEFAULT_WORKER_NAME in this case.
        :return: str
        """
        try:
            import horovod.tensorflow as hvd

            if hvd.size():
                return f"worker_{hvd.rank()}"
        except (ModuleNotFoundError, ValueError, ImportError):
            pass
        tf_config = os.getenv("TF_CONFIG")
        if tf_config and is_parameter_server_strategy(tf_config):
            return get_worker_id_from_tf_config(tf_config)
        return CONFIG_DEFAULT_WORKER_NAME
github THUNLP-MT / THUMT / thumt / bin / dist_trainer.py
def session_config():
    optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L1,
                                            do_function_inlining=True)
    graph_options = tf.GraphOptions(optimizer_options=optimizer_options)
    config = tf.ConfigProto(allow_soft_placement=True,
                            graph_options=graph_options)
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    return config
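A config like this is typically handed to the training session together with Horovod's broadcast hook; a minimal sketch of how it might be consumed (train_op stands in for a training operation built elsewhere, and hvd.init() is assumed to have been called already):

hooks = [hvd.BroadcastGlobalVariablesHook(0)]
with tf.train.MonitoredTrainingSession(config=session_config(),
                                       hooks=hooks) as sess:
    while not sess.should_stop():
        # train_op is the training operation built elsewhere in the script.
        sess.run(train_op)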
github aws-samples / deep-learning-models / legacy / models / resnet / tensorflow / train_imagenet_resnet_hvd.py
            'lc_beta': FLAGS.lc_beta,
            'loss_scale': FLAGS.loss_scale,
            'adv_bn_init': FLAGS.adv_bn_init,
            'conv_init': tf.variance_scaling_initializer() if FLAGS.adv_conv_init else None
        },
        config=tf.estimator.RunConfig(
            # tf_random_seed=31 * (1 + hvd.rank()),
            session_config=config,
            save_summary_steps=FLAGS.save_summary_steps if do_checkpoint else None,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps if do_checkpoint else None,
            keep_checkpoint_max=None))

    if not FLAGS.eval:
        num_preproc_threads = FLAGS.num_parallel_calls
        rank0log(logger, "Using preprocessing threads per GPU: ", num_preproc_threads)
        training_hooks = [hvd.BroadcastGlobalVariablesHook(0),
                          PrefillStagingAreasHook()]
        if hvd.rank() == 0:
            training_hooks.append(
                LogSessionRunHook(global_batch_size,
                                  num_training_samples,
                                  FLAGS.display_every, logger))
        try:
            start_time = time.time()
            classifier.train(
                input_fn=lambda: make_dataset(
                    train_filenames,
                    training_samples_per_rank,
                    FLAGS.batch_size, height, width, 
                    FLAGS.brightness, FLAGS.contrast, FLAGS.saturation, FLAGS.hue, 
                    training=True, num_threads=num_preproc_threads, 
                    shard=True, synthetic=FLAGS.synthetic, increased_aug=FLAGS.increased_aug),