How to use the horovod.tensorflow.rank function in horovod

To help you get started, we’ve selected a few horovod examples, based on popular ways horovod.tensorflow.rank is used in public projects.
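
Before the project snippets below, here is a minimal, self-contained sketch of the calls they all build on (assuming a standard launch such as `horovodrun -np 4 python train.py`): `hvd.init()` once at startup, then `hvd.rank()` to identify the current process and gate rank-specific work.

import horovod.tensorflow as hvd

# Initialize Horovod; must run before any other hvd.* call.
hvd.init()

# Global rank: 0 .. hvd.size() - 1, unique per process across all machines.
print('global rank:', hvd.rank(), 'of', hvd.size())

# Local rank: index of the process on this machine, commonly used to pin a GPU.
print('local rank:', hvd.local_rank())

# A recurring pattern in the examples below: only rank 0 logs or saves checkpoints.
if hvd.rank() == 0:
    print('rank 0 handles logging and checkpointing')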

github tensorlayer / openpose-plus / train.py View on Github external
            if step == n_step:
                break

            tic = time.time()
            if step != 0 and (step % lr_decay_every_step == 0):
                new_lr_decay = lr_decay_factor**(step // lr_decay_every_step)
                sess.run(tf.assign(lr_v, scaled_lr * new_lr_decay))

            [_, _loss, _stage_losses, _l2, conf_result, paf_result] = \
                sess.run([train_op, total_loss, stage_losses, l2_loss, last_conf, last_paf])

            # tstring = time.strftime('%d-%m %H:%M:%S', time.localtime(time.time()))
            lr = sess.run(lr_v)
            print(
                'Worker{}: Total Loss at iteration {} / {} is: {} Learning rate {:10e} l2_loss {:10e} Took: {}s'.format(
                    hvd.rank(), step, n_step, _loss, lr, _l2,
                    time.time() - tic))
            for ix, ll in enumerate(_stage_losses):
                print('Worker{}:'.format(hvd.rank()), 'Network#', ix, 'For Branch', ix % 2 + 1, 'Loss:', ll)

            # save intermediate results and model
            if hvd.rank() == 0:  # Horovod
                if (step != 0) and (step % save_interval == 0):
                    # save some results
                    [img_out, confs_ground, pafs_ground, conf_result, paf_result,
                     mask_out] = sess.run([x_, confs_, pafs_, last_conf, last_paf, mask])
                    draw_results(img_out, confs_ground, conf_result, pafs_ground, paf_result, mask_out,
                                 'train_%d_' % step)

                    # save model
                    # tl.files.save_npz(
                    #    net.all_params, os.path.join(model_path, 'pose' + str(step) + '.npz'), sess=sess)
                    # tl.files.save_npz(net.all_params, os.path.join(model_path, 'pose.npz'), sess=sess)
                    tl.files.save_npz_dict(
                        net.all_params, os.path.join(model_path, 'pose' + str(step) + '.npz'), sess=sess)
github itmessager / Figma_RCNN / detection / tensorpacks / joint_train.py View on Github external
        if cfg.TRAINER == 'replicated':
            with ThreadPoolExecutor(max_workers=self.num_predictor) as executor, \
                    tqdm.tqdm(total=sum([df.size() for df in self.dataflows])) as pbar:
                futures = []
                for dataflow, pred in zip(self.dataflows, self.predictors):
                    futures.append(executor.submit(eval_coco, dataflow, pred, pbar))
                all_results = list(itertools.chain(*[fut.result() for fut in futures]))
        else:
            if self._horovod_run_eval:
                local_results = eval_coco(self.dataflow, self.predictor)
                output_partial = os.path.join(
                    logdir, 'outputs{}-part{}.json'.format(self.global_step, hvd.local_rank()))
                with open(output_partial, 'w') as f:
                    json.dump(local_results, f)
            self.barrier.eval()
            if hvd.rank() > 0:
                return
            all_results = []
            for k in range(hvd.local_size()):
                output_partial = os.path.join(
                    logdir, 'outputs{}-part{}.json'.format(self.global_step, k))
                with open(output_partial, 'r') as f:
                    obj = json.load(f)
                all_results.extend(obj)
                os.unlink(output_partial)

        output_file = os.path.join(
            logdir, 'outputs{}.json'.format(self.global_step))
        with open(output_file, 'w') as f:
            json.dump(all_results, f)
        try:
            scores = print_evaluation_scores(output_file)
github horovod / horovod / examples / tensorflow_word2vec.py View on Github external
def maybe_download(filename, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + url + '. Can you get to it with a browser?')
    return filename

filename = maybe_download('text8-%d.zip' % hvd.rank(), 31344016)


# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words."""
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data

vocabulary = read_data(filename)
print('Data size', len(vocabulary))

# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50000
github aws-samples / deep-learning-models / legacy / models / resnet / tensorflow / train_imagenet_resnet_hvd.py View on Github external
def rank0log(logger, *args, **kwargs):
    if hvd.rank() == 0:
        if logger:
            logger.info(''.join([str(x) for x in list(args)]))
        else:
            print(*args, **kwargs)
github aws-samples / deep-learning-models / models / vision / detection / tools / train_dali.py View on Github external
def main(cfg):
    ######################################################################################
    # Create Training Data
    ######################################################################################
    datasets = build_dataset(cfg.data.train)
    tf_datasets = [build_dataloader(datasets,
                         cfg.batch_size_per_device,
                         cfg.workers_per_gpu,
                         num_gpus=hvd.size(),
                         dist=True)]
    dali_tdf = dali.dali_dataset(cfg.data.train.dataset_dir, 
                                 batch_size=cfg.batch_size_per_device, 
                                 device_id=hvd.rank(), num_gpus=hvd.size())
    tf_datasets = [(dali_tdf, tf_datasets[0][1])]
    ######################################################################################
    # Build Model
    ######################################################################################
    model = build_detector(cfg.model,
                           train_cfg=cfg.train_cfg,
                           test_cfg=cfg.test_cfg)
    # Pass example through so tensor shapes are defined
    _ = model(next(iter(tf_datasets[0][0])), use_dali=True)
    model.layers[0].layers[0].load_weights(cfg.weights_path, by_name=False)
    ######################################################################################
    # Create Model Runner
    ######################################################################################
    runner = sagemaker_runner.Runner(model, batch_processor, name=cfg.model_name, 
                                     optimizer=cfg.optimizer, work_dir=cfg.work_dir,
                                     logger=get_root_logger(cfg.log_level), amp_enabled=cfg.fp16,
github yenchenlin / pix2pix-flow / tfops.py View on Github external
def print_act_stats(x, _str=""):
    if not do_print_act_stats:
        return x
    if hvd.rank() != 0:
        return x
    if len(x.get_shape()) == 1:
        x_mean, x_var = tf.nn.moments(x, [0], keep_dims=True)
    if len(x.get_shape()) == 2:
        x_mean, x_var = tf.nn.moments(x, [0], keep_dims=True)
    if len(x.get_shape()) == 4:
        x_mean, x_var = tf.nn.moments(x, [0, 1, 2], keep_dims=True)
    stats = [tf.reduce_min(x_mean), tf.reduce_mean(x_mean), tf.reduce_max(x_mean),
             tf.reduce_min(tf.sqrt(x_var)), tf.reduce_mean(tf.sqrt(x_var)), tf.reduce_max(tf.sqrt(x_var))]
    return tf.Print(x, stats, "["+_str+"] "+x.name)
github lsds / KungFu / srcs / python / kungfu / tensorflow / v1 / benchmarks / __main__.py View on Github external
def _rank(method):
    if method == 'HOROVOD':
        import horovod.tensorflow as hvd
        return hvd.rank()
    else:
        return current_rank()
github yyht / BERT / t2t_bert / porn_classification / base_train.py View on Github external
def train_fn(op, loss):
			i = 0
			total_loss = 0
			cnt = 0
			while True:
				try:
					[_, train_loss] = sess.run([op, loss])
					i += 1
					cnt += 1
					total_loss += train_loss
					# print("==device id {} global step {}".format(hvd.rank(), step))
					if np.mod(i, num_storage_steps) == 0:
						print(total_loss/cnt)
						if hvd.rank() == 0:
							model_io_fn.save_model(sess, FLAGS.model_output+"/oqmrc_{}.ckpt".format(int(i/num_storage_steps)))
						cnt = 0
						total_loss = 0
				except tf.errors.OutOfRangeError:
					print("End of dataset")
					break
		import time
github awslabs / sagemaker-debugger / examples / tensorflow / scripts / train_imagenet_resnet_hvd.py View on Github external
        label_element = nest.map_structure(
            lambda s: tf.constant(1, tf.int32, s), tf.TensorShape([1])
        )
        element = (input_element, label_element)
        ds = tf.data.Dataset.from_tensors(element).repeat()
    else:
        shuffle_buffer_size = 10000
        num_readers = 1
        if hvd.size() > len(filenames):
            assert (hvd.size() % len(filenames)) == 0
            filenames = filenames * (hvd.size() // len(filenames))

        ds = tf.data.Dataset.from_tensor_slices(filenames)
        if shard:
            # split the dataset into parts for each GPU
            ds = ds.shard(hvd.size(), hvd.rank())

        if not training:
            # make sure all ranks have the same amount
            ds = ds.take(take_count)

        if training:
            ds = ds.shuffle(1000, seed=7 * (1 + hvd.rank()))

        ds = ds.interleave(tf.data.TFRecordDataset, cycle_length=num_readers, block_length=1)
        counter = tf.data.Dataset.range(sys.maxsize)
        ds = tf.data.Dataset.zip((ds, counter))
        preproc_func = lambda record, counter_: parse_and_preprocess_image_record(
            record,
            counter_,
            height,
            width,