How to use horovod - 10 common examples

To help you get started, we’ve selected ten horovod examples based on popular ways the library is used in public projects.
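Most of the excerpts below follow the same basic recipe: call hvd.init() once per process, pin each process to one GPU via its local rank, scale the learning rate and step counts by hvd.size(), wrap the optimizer in hvd.DistributedOptimizer, broadcast initial variables from rank 0, and let only rank 0 write checkpoints. The following is a minimal sketch of that recipe for TensorFlow 1.x; the toy variable w, the checkpoint directory, and the step budget are placeholders chosen only for illustration.

import tensorflow as tf
import horovod.tensorflow as hvd

# Typically launched with: horovodrun -np 4 python train_sketch.py
hvd.init()

# Pin this process to a single GPU (one GPU per rank).
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())

# Toy model so the sketch actually runs: fit a scalar towards 3.0.
w = tf.get_variable("w", initializer=0.0)
loss = tf.square(w - 3.0)

# Scale the learning rate by the number of workers and average gradients across them.
opt = tf.train.GradientDescentOptimizer(0.01 * hvd.size())
opt = hvd.DistributedOptimizer(opt)
train_op = opt.minimize(loss, global_step=tf.train.get_or_create_global_step())

hooks = [
    hvd.BroadcastGlobalVariablesHook(0),                    # start all workers from rank 0's weights
    tf.train.StopAtStepHook(last_step=1000 // hvd.size()),  # divide the step budget across workers
]

# Only rank 0 writes checkpoints so workers do not clobber each other.
checkpoint_dir = "./checkpoints" if hvd.rank() == 0 else None
with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                       hooks=hooks, config=config) as sess:
    while not sess.should_stop():
        sess.run(train_op)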


Example 1: Edy-Barraza/Transformer_Distillation, network_distillation_distributed.py (on GitHub)
def main(_):

  hvd.init()
  FLAGS.output_dir = FLAGS.output_dir if hvd.rank() == 0 else os.path.join(FLAGS.output_dir, str(hvd.rank()))
  FLAGS.num_train_steps = FLAGS.num_train_steps // hvd.size()
  FLAGS.num_warmup_steps = FLAGS.num_warmup_steps // hvd.size()
  tf.logging.set_verbosity(tf.logging.INFO)

  if not FLAGS.do_train and not FLAGS.do_eval:
    raise ValueError("At least one of `do_train` or `do_eval` must be True.")

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  tf.gfile.MakeDirs(FLAGS.output_dir)

  input_files = []
  for input_pattern in FLAGS.input_file.split(","):
    input_files.extend(tf.gfile.Glob(input_pattern))

  tf.logging.info("*** Input Files ***")
  for input_file in input_files:
    tf.logging.info("  %s" % input_file)
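This excerpt divides the global step budget evenly across workers and gives every non-zero rank its own output directory so checkpoint and log writes do not collide. A rough sketch of the resulting arithmetic, with made-up flag values (the real values come from FLAGS):

# Hypothetical numbers, only to illustrate the division by hvd.size() above.
num_train_steps, num_warmup_steps, world_size = 100000, 10000, 4

steps_per_worker = num_train_steps // world_size      # 25000 training steps on each of 4 workers
warmup_per_worker = num_warmup_steps // world_size    # 2500 warmup steps per worker

A script like this is normally started with horovodrun, for example horovodrun -np 4 python network_distillation_distributed.py, which launches one copy of main per GPU.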
Example 2: microsoft/nlp-recipes, utils_nlp/models/transformers/question_answering_distributed.py (on GitHub)
                If `cache_dir` and `load_model_from_dir` are the same and
                `overwrite_model` is `False`, the fitted model is saved
                to "cache_dir/fine_tuned". Defaults to False.
            overwrite_model (bool, optional): Whether to overwrite an existing model.
                If `cache_dir` and `load_model_from_dir` are the same and
                `overwrite_model` is `False`, the fitted model is saved to
                "cache_dir/fine_tuned". Defaults to False.

        """
        # tb_writer = SummaryWriter()
        # device = get_device("cpu" if num_gpus == 0 or not torch.cuda.is_available() else "gpu")
        # self.model = move_to_device(self.model, device, num_gpus)

        # hvd.init()

        rank = hvd.rank()
        local_rank = hvd.local_rank()
        world_size = hvd.size()

        torch.cuda.set_device(local_rank)
        device = torch.device("cuda", local_rank)
        is_master = rank == 0

        self.cache_dir = self.cache_dir + "/distributed_" + str(local_rank)

        self.model = self.model.to(device)

        # t_total = len(train_dataloader) * num_epochs

        # t_total = len(train_dataloader) // gradient_accumulation_steps * num_epochs

        max_steps = 48000
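The commented-out hvd.init() shows that initialization happens elsewhere in this class; the excerpt then pins the process to its local GPU and moves the model there. The usual companion steps in Horovod's PyTorch API are to wrap the optimizer and to broadcast the initial model and optimizer state from rank 0. A minimal sketch, with a toy model and learning rate chosen only for illustration:

import torch
import horovod.torch as hvd

hvd.init()
torch.cuda.set_device(hvd.local_rank())

model = torch.nn.Linear(10, 2).cuda()                   # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=0.01 * hvd.size())

# Average gradients across workers and start everyone from rank 0's state.
optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)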
Example 3: microsoft/nlp-recipes, utils_nlp/azureml/azureml_bert_util.py (on GitHub)
def __init__(self, accumulation_step=1):
        hvd.init()
        self.local_rank = hvd.local_rank()
        self.world_size = hvd.size()
        self.rank = hvd.rank()
        self.n_gpu = torch.cuda.device_count()
        self.node_count = self.world_size // self.n_gpu
        self.accumulation_step = accumulation_step
        self.count_down = accumulation_step - 1
        self._multi_node = self.node_count > 1 
        if not self._multi_node:
            # use PyTorch build-in NCCL backend for single node training
            torch.distributed.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:6000',
                                world_size=self.n_gpu,  rank=self.local_rank)
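This helper derives the node count by dividing world size by the number of GPUs on the machine, which assumes one Horovod process per visible GPU. Horovod also reports the per-node process count directly via hvd.local_size(), which avoids that assumption; a small sketch:

import horovod.torch as hvd

hvd.init()
node_count = hvd.size() // hvd.local_size()   # total ranks divided by ranks on this node
multi_node = node_count > 1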
Example 4: microsoft/seismic-deeplearning, experiments/interpretation/dutchf3_patch/horovod/train.py (on GitHub)
        Options loaded from default.py will be overridden by options loaded from the cfg file
        Options passed in through the options argument will override options loaded from the cfg file

    Args:
        *options (str, int, optional): Options used to override what is loaded from the config.
                                       To see what options are available consult default.py
        cfg (str, optional): Location of config file to load. Defaults to None.
    """

    update_config(config, options=options, config_file=cfg)
    hvd.init()
    silence_other_ranks = True
    logging.config.fileConfig(config.LOG_CONFIG)
    logger = logging.getLogger(__name__)
    torch.manual_seed(config.SEED)
    torch.cuda.set_device(hvd.local_rank())
    torch.cuda.manual_seed(config.SEED)
    rank, world_size = hvd.rank(), hvd.size()

    scheduler_step = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS
    torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.manual_seed(config.SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.SEED)
    np.random.seed(seed=config.SEED)
    # Setup Augmentations
    basic_aug = Compose(
        [
            Normalize(
                mean=(config.TRAIN.MEAN,),
                std=(config.TRAIN.STD,),
                max_pixel_value=1,
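The excerpt seeds every RNG and pins the process to its local GPU before building the augmentation pipeline; the silence_other_ranks flag hints that logging is also restricted so many workers do not print many copies of every message. A minimal sketch of that idea (the log levels are an illustrative choice, not what the project's LOG_CONFIG does):

import logging
import horovod.torch as hvd

hvd.init()
# Rank 0 logs normally; the other ranks only surface warnings and errors.
level = logging.INFO if hvd.rank() == 0 else logging.WARNING
logging.basicConfig(level=level)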
Example 5: NVIDIA/DALI, docs/examples/tensorflow/demo/nvutils/runner.py (on GitHub)
    filename_pattern = os.path.join(data_dir, '%s-*')
    eval_filenames  = sorted(tf.gfile.Glob(filename_pattern % 'validation'))
    num_eval_samples = _get_num_records(eval_filenames)

    eval_idx_filenames = None
    if data_idx_dir is not None:
        filename_pattern = os.path.join(data_idx_dir, '%s-*')
        eval_idx_filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation'))
    else:
        raise ValueError("data_idx_dir must be specified")


    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    #config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.force_gpu_compatible = True # Force pinned memory
    config.intra_op_parallelism_threads = 1 # Avoid pool of Eigen threads
    config.inter_op_parallelism_threads = max(2, 40 // hvd.size() - 2)  # keep at least 2 threads when many ranks share a host

    classifier_eval = tf.estimator.Estimator(
        model_fn=_cnn_model_function,
        model_dir=log_dir,
        params={
            'model':         infer_func,
            'format':        image_format,
            'dtype' : tf.float16 if precision == 'fp16' else tf.float32,
            'momentum' : momentum,
            'learning_rate_init' : learning_rate_init,
            'learning_rate_power' : learning_rate_power,
            'decay_steps' : None,
            'weight_decay' : weight_decay,
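The ConfigProto built above carries the GPU pinning; for an Estimator it is usually passed in through a RunConfig so every session the Estimator opens inherits it. A minimal, training-only sketch with a placeholder model_fn (the real one here is _cnn_model_function):

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())

run_config = tf.estimator.RunConfig(session_config=config)

def model_fn(features, labels, mode):
    # Placeholder model: one trainable scalar fitted to the input (training-only sketch).
    w = tf.get_variable("w", initializer=0.0)
    loss = tf.reduce_mean(tf.square(features["x"] - w))
    opt = hvd.DistributedOptimizer(tf.train.GradientDescentOptimizer(0.01))
    train_op = opt.minimize(loss, global_step=tf.train.get_or_create_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)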
Example 6: tensorlayer/openpose-plus, train.py (on GitHub)
            if step == n_step:
                break

            tic = time.time()
            if step != 0 and (step % lr_decay_every_step == 0):
                new_lr_decay = lr_decay_factor**(step // lr_decay_every_step)
                sess.run(tf.assign(lr_v, scaled_lr * new_lr_decay))

            [_, _loss, _stage_losses, _l2, conf_result, paf_result] = \
                sess.run([train_op, total_loss, stage_losses, l2_loss, last_conf, last_paf])

            # tstring = time.strftime('%d-%m %H:%M:%S', time.localtime(time.time()))
            lr = sess.run(lr_v)
            print(
                'Worker{}: Total Loss at iteration {} / {} is: {} Learning rate {:10e} l2_loss {:10e} Took: {}s'.format(
                    hvd.rank(), step, n_step, _loss, lr, _l2,
                    time.time() - tic))
            for ix, ll in enumerate(_stage_losses):
                print('Worker{}:'.format(hvd.rank()), 'Network#', ix, 'For Branch', ix % 2 + 1, 'Loss:', ll)

            # save intermediate results and model
            if hvd.rank() == 0:  # Horovod
                if (step != 0) and (step % save_interval == 0):
                    # save some results
                    [img_out, confs_ground, pafs_ground, conf_result, paf_result,
                     mask_out] = sess.run([x_, confs_, pafs_, last_conf, last_paf, mask])
                    draw_results(img_out, confs_ground, conf_result, pafs_ground, paf_result, mask_out,
                                 'train_%d_' % step)

                    # save model
                    # tl.files.save_npz(
                    #    net.all_params, os.path.join(model_path, 'pose' + str(step) + '.npz'), sess=sess)
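Checkpoints and intermediate images are written only on rank 0, which is the standard way to keep every worker from saving the same files. In plain-session training like this, initial weights are also usually synchronized once after variable initialization so all workers start from rank 0's state; a minimal sketch (the variable w is just for illustration):

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
w = tf.get_variable("w", initializer=tf.random_normal([]))  # differs per worker until broadcast
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())

with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(hvd.broadcast_global_variables(0))  # every rank now holds rank 0's value of w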
Example 7: tensorlayer/openpose-plus, train.py (on GitHub)
            tic = time.time()
            if step != 0 and (step % lr_decay_every_step == 0):
                new_lr_decay = lr_decay_factor**(step // lr_decay_every_step)
                sess.run(tf.assign(lr_v, scaled_lr * new_lr_decay))

            [_, _loss, _stage_losses, _l2, conf_result, paf_result] = \
                sess.run([train_op, total_loss, stage_losses, l2_loss, last_conf, last_paf])

            # tstring = time.strftime('%d-%m %H:%M:%S', time.localtime(time.time()))
            lr = sess.run(lr_v)
            print(
                'Worker{}: Total Loss at iteration {} / {} is: {} Learning rate {:10e} l2_loss {:10e} Took: {}s'.format(
                    hvd.rank(), step, n_step, _loss, lr, _l2,
                    time.time() - tic))
            for ix, ll in enumerate(_stage_losses):
                print('Worker{}:'.format(hvd.rank()), 'Network#', ix, 'For Branch', ix % 2 + 1, 'Loss:', ll)

            # save intermediate results and model
            if hvd.rank() == 0:  # Horovod
                if (step != 0) and (step % save_interval == 0):
                    # save some results
                    [img_out, confs_ground, pafs_ground, conf_result, paf_result,
                     mask_out] = sess.run([x_, confs_, pafs_, last_conf, last_paf, mask])
                    draw_results(img_out, confs_ground, conf_result, pafs_ground, paf_result, mask_out,
                                 'train_%d_' % step)

                    # save model
                    # tl.files.save_npz(
                    #    net.all_params, os.path.join(model_path, 'pose' + str(step) + '.npz'), sess=sess)
                    # tl.files.save_npz(net.all_params, os.path.join(model_path, 'pose.npz'), sess=sess)
                    tl.files.save_npz_dict(
                        net.all_params, os.path.join(model_path, 'pose' + str(step) + '.npz'), sess=sess)
Example 8: HewlettPackard/dlcookbook-dlbs, python/nvtfcnn_benchmarks/cnn/nvutils/runner.py (on GitHub)
        nstep = num_training_samples * num_iter // global_batch_size
        decay_steps = nstep
    else:
        nstep = num_iter
        num_epochs = max(nstep * global_batch_size // num_training_samples, 1)
        decay_steps = 90 * num_training_samples // global_batch_size

    nstep_per_epoch = num_training_samples // global_batch_size

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    #config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.force_gpu_compatible = True # Force pinned memory
    config.intra_op_parallelism_threads = 1 # Avoid pool of Eigen threads
    config.inter_op_parallelism_threads = max(2, 40//hvd.size()-2)

    classifier = tf.estimator.Estimator(
        model_fn=_cnn_model_function,
        model_dir=log_dir,
        params={
            'model':         infer_func,
            'format':        image_format,
            'dtype' : tf.float16 if precision == 'fp16' else tf.float32,
            'momentum' : momentum,
            'learning_rate_init' : learning_rate_init,
            'learning_rate_power' : learning_rate_power,
            'decay_steps' : decay_steps,
            'weight_decay' : weight_decay,
            'loss_scale' : loss_scale,
            'larc_eta' : larc_eta,
            'larc_mode' : larc_mode,
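The step and epoch arithmetic is all expressed in terms of global_batch_size, which is typically the per-GPU batch size multiplied by hvd.size(). A small sketch of the derived quantities with hypothetical numbers:

# Hypothetical values, only to illustrate the arithmetic above.
batch_size_per_gpu, world_size = 256, 8
num_training_samples, num_iter = 1281167, 90      # roughly ImageNet, trained for 90 epochs

global_batch_size = batch_size_per_gpu * world_size
nstep = num_training_samples * num_iter // global_batch_size   # total optimizer steps
nstep_per_epoch = num_training_samples // global_batch_size    # steps in one pass over the data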
Example 9: polyaxon/polyaxon, examples/in_cluster/horovod/tensorflow/mnist.py (on GitHub)
    # initialization of all workers when training is started with random weights or
    # restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data},
        y=train_labels,
        batch_size=100,
        num_epochs=None,
        shuffle=True)

    # Horovod: adjust number of steps based on number of GPUs.
    mnist_classifier.train(
        input_fn=train_input_fn,
        steps=3000 // hvd.size(),
        hooks=[logging_hook, bcast_hook])

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data},
        y=eval_labels,
        num_epochs=1,
        shuffle=False)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)

    # Polyaxon
    if hvd.rank() == 0:
        experiment.log_metrics(**eval_results)
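Here every worker evaluates the full test set and only rank 0 logs the metrics to Polyaxon. When each rank evaluates a different shard instead, the usual pattern is to average the per-worker metric with an allreduce before logging; a minimal sketch with a made-up accuracy value:

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
local_accuracy = tf.constant(0.90 + 0.01 * hvd.rank())   # pretend per-shard result
global_accuracy = hvd.allreduce(local_accuracy)          # averaged across ranks by default

with tf.Session() as sess:
    avg = sess.run(global_accuracy)   # every rank must participate in the allreduce
    if hvd.rank() == 0:
        print("global accuracy:", avg)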
Example 10: NVIDIA/DALI, docs/examples/tensorflow/demo/nvutils/runner.py (on GitHub)
    eval_idx_filenames = None
    if data_idx_dir is not None:
        filename_pattern = os.path.join(data_idx_dir, '%s-*')
        eval_idx_filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation'))
    else:
        raise ValueError("data_idx_dir must be specified")


    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    #config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.force_gpu_compatible = True # Force pinned memory
    config.intra_op_parallelism_threads = 1 # Avoid pool of Eigen threads
    config.inter_op_parallelism_threads = max(2, 40 // hvd.size() - 2)  # keep at least 2 threads when many ranks share a host

    classifier_eval = tf.estimator.Estimator(
        model_fn=_cnn_model_function,
        model_dir=log_dir,
        params={
            'model':         infer_func,
            'format':        image_format,
            'dtype' : tf.float16 if precision == 'fp16' else tf.float32,
            'momentum' : momentum,
            'learning_rate_init' : learning_rate_init,
            'learning_rate_power' : learning_rate_power,
            'decay_steps' : None,
            'weight_decay' : weight_decay,
            'loss_scale' : loss_scale,
            'larc_eta' : larc_eta,
            'larc_mode' : larc_mode,