How to use the horovod.torch.init function in horovod

To help you get started, we've selected a few examples of horovod.torch.init based on popular ways it is used in public projects.
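
All of the snippets below follow the same opening move: call hvd.init() once per process before any other Horovod API, then use hvd.rank(), hvd.local_rank() and hvd.size() to decide what that process should do. Here is a minimal end-to-end sketch of the usual pattern; the tiny model and the learning rate are placeholders, not taken from any of the projects below.

import torch
import horovod.torch as hvd

hvd.init()                                  # must run before any other hvd.* call
torch.cuda.set_device(hvd.local_rank())     # pin this process to one GPU

model = torch.nn.Linear(10, 1).cuda()       # placeholder model
optimizer = torch.optim.SGD(model.parameters(),
                            lr=0.01 * hvd.size())  # scale lr with the number of workers

# Average gradients across all workers on every optimizer step.
optimizer = hvd.DistributedOptimizer(
    optimizer, named_parameters=model.named_parameters())

# Start every worker from the same weights and optimizer state.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)

Launched with, for example, horovodrun -np 4 python train.py, each of the four processes executes the same script and Horovod assigns the ranks.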

github microsoft / nlp-recipes / utils_nlp / azureml / azureml_bert_util.py
    def __init__(self, accumulation_step=1):
        hvd.init()
        self.local_rank = hvd.local_rank()
        self.world_size = hvd.size()
        self.rank = hvd.rank()
        self.n_gpu = torch.cuda.device_count()
        self.node_count = self.world_size // self.n_gpu
        self.accumulation_step = accumulation_step
        self.count_down = accumulation_step - 1
        self._multi_node = self.node_count > 1 
        if not self._multi_node:
            # use PyTorch built-in NCCL backend for single-node training
            torch.distributed.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:6000',
                                world_size=self.n_gpu,  rank=self.local_rank)
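
One detail worth noting: the snippet infers the node count from torch.cuda.device_count(), which assumes every node contributes all of its visible GPUs. Horovod also reports the number of processes on the local node directly, which avoids that assumption; a one-line alternative (not part of the original class):

import horovod.torch as hvd

hvd.init()
node_count = hvd.size() // hvd.local_size()  # total workers / workers on this node
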
github horovod / horovod / test / test_torch.py
    def test_horovod_allreduce_average(self):
        """Test that the allreduce correctly averages 1D, 2D, 3D tensors."""
        hvd.init()
        size = hvd.size()
        dtypes = self.filter_supported_types([torch.IntTensor, torch.LongTensor,
                     torch.FloatTensor, torch.DoubleTensor])
        if torch.cuda.is_available():
            dtypes += [torch.cuda.IntTensor, torch.cuda.LongTensor,
                       torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
            if _fp16_supported:
                dtypes += [torch.cuda.HalfTensor]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            torch.manual_seed(1234)
            tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100)
            tensor = self.cast_and_place(tensor, dtype)
            averaged = hvd.allreduce(tensor, average=True)
            max_difference = averaged.data.sub(tensor).max()
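
Outside the test suite, the same hvd.allreduce call is a convenient way to average a scalar metric (for example a validation loss) across workers after hvd.init(). A minimal sketch, with the local value hard-coded as a placeholder:

import torch
import horovod.torch as hvd

hvd.init()
local_loss = torch.tensor(0.42)                        # placeholder per-worker metric
avg_loss = hvd.allreduce(local_loss, name='avg_loss')  # averaged across workers by default
print('mean loss over {} workers: {:.4f}'.format(hvd.size(), avg_loss.item()))
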
github horovod / horovod / test / test_torch.py
    def test_horovod_allgather(self):
        """Test that the allgather correctly gathers 1D, 2D, 3D tensors."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
                  torch.IntTensor, torch.LongTensor, torch.FloatTensor, torch.DoubleTensor]
        if _fp16_supported:
            dtypes += [torch.HalfTensor]
        if torch.cuda.is_available():
            dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor, torch.cuda.ShortTensor,
                       torch.cuda.IntTensor, torch.cuda.LongTensor,
                       torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
            if _fp16_supported:
                dtypes += [torch.cuda.HalfTensor]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([17] * dim)).fill_(1).mul_(rank)
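
As the test suggests, hvd.allgather concatenates each worker's tensor along the first dimension so that every rank receives all of the data. A short sketch of gathering per-worker predictions (the shapes are illustrative only):

import torch
import horovod.torch as hvd

hvd.init()
preds = torch.full((4,), float(hvd.rank()))  # placeholder: 4 predictions on this worker
all_preds = hvd.allgather(preds)             # shape (4 * hvd.size(),), same result on every rank
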
github horovod / horovod / test / test_torch.py
    def test_delta_optimizer(self):
        """Test the delta optimizer."""
        hvd.init()
        # TODO support non-MPI Adasum operation
        # Only do this test if there are GPUs available.
        if not hvd.mpi_enabled() or not torch.cuda.is_available():
            return

        local_rank = hvd.local_rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return
        class Net(torch.nn.Module):
            def __init__(self):
                super(Net, self).__init__()
                self.conv1 = torch.nn.Conv2d(1, 100, 1).cuda(local_rank)
                self.conv2 = torch.nn.Conv2d(100, 1, 1).cuda(local_rank)
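
In training code (rather than this test, which only runs on an MPI-enabled build with GPUs available), Adasum is selected by passing op=hvd.Adasum when wrapping the optimizer. A minimal sketch with a placeholder model, assuming a Horovod build where Adasum is supported:

import torch
import horovod.torch as hvd

hvd.init()
torch.cuda.set_device(hvd.local_rank())

model = torch.nn.Conv2d(1, 100, 1).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Combine gradients with Adasum instead of plain averaging.
optimizer = hvd.DistributedOptimizer(
    optimizer, named_parameters=model.named_parameters(), op=hvd.Adasum)
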
github jzlianglu / pykaldi2 / bin / train_chain.py
print("pytorch version:{}".format(th.__version__))

    with open(args.data) as f:
        data = yaml.safe_load(f)
        config["source_paths"] = [j for i, j in data['clean_source'].items()]
        if 'dir_noise' in data:
            config["dir_noise_paths"] = [j for i, j in data['dir_noise'].items()]
        if 'rir' in data:
            config["rir_paths"] = [j for i, j in data['rir'].items()]
    config['data_path'] = args.dataPath

    print("Experiment starts with config {}".format(json.dumps(config, sort_keys=True, indent=4)))

    # Initialize Horovod
    hvd.init()

    th.cuda.set_device(hvd.local_rank())

    print("Run experiments with world size {}".format(hvd.size()))

    dataset = SpeechDataset(config)
    transform=None
    if args.transform is not None and os.path.isfile(args.transform):
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)
            dataset.transform = transform

    train_dataloader = SeqDataloader(dataset,
                                    batch_size=args.batch_size,
                                    num_workers = args.data_loader_threads,
                                    distributed=True,
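
pykaldi2's SeqDataloader shards the data itself when distributed=True; with a plain torch.utils.data.DataLoader the equivalent wiring uses a DistributedSampler driven by Horovod's size and rank. A generic sketch (the random dataset is a placeholder, nothing here comes from pykaldi2):

import torch
import horovod.torch as hvd
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

hvd.init()
dataset = TensorDataset(torch.randn(1000, 16))  # placeholder dataset
sampler = DistributedSampler(dataset, num_replicas=hvd.size(), rank=hvd.rank())
loader = DataLoader(dataset, batch_size=32, sampler=sampler)
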
github jzlianglu / pykaldi2 / bin / train_se2.py
(This snippet's Horovod initialization is identical, line for line, to the train_chain.py example above.)
github microsoft / nlp-recipes / examples / question_answering / distributed_question_answering_squad_transformers_HVD.py
from utils_nlp.eval.question_answering import evaluate_qa
from utils_nlp.common.timer import Timer

parser = argparse.ArgumentParser()
parser.add_argument("--cache_dir", type=str, default="./")
parser.add_argument("--model_name", type=str, default="distilbert-base-uncased")
parser.add_argument("--do_lower_case", type=bool, default=True)
parser.add_argument("--quick_run", type=bool, default=False)
parser.add_argument("--gradient_accumulation_steps", type=int, default=1)

args = parser.parse_args()

HOROVOD = True


hvd.init()

rank = hvd.rank()
local_rank = hvd.local_rank()
world_size = hvd.size()

print("rank: {}".format(rank))
print("local_rank: {}".format(local_rank))
print("world_size: {}".format(world_size))

MODEL_NAME = args.model_name
DO_LOWER_CASE = args.do_lower_case

TRAIN_DATA_USED_PERCENT = 1
DEV_DATA_USED_PERCENT = 1
NUM_EPOCHS = 2
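
The script also accepts --gradient_accumulation_steps. With Horovod, gradient accumulation is usually wired through the backward_passes_per_step argument of hvd.DistributedOptimizer, so cross-worker averaging only happens once per accumulated step. A hedged sketch with a placeholder model and optimizer (not the script's own objects):

import torch
import horovod.torch as hvd

hvd.init()
model = torch.nn.Linear(768, 2)                     # placeholder classifier head
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Only average gradients across workers every 4 backward passes
# (e.g. the value of --gradient_accumulation_steps).
optimizer = hvd.DistributedOptimizer(
    optimizer,
    named_parameters=model.named_parameters(),
    backward_passes_per_step=4)
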
github chenghuige / wenzheng / projects / feed / rank / tf / torch-only-train.py
import pyt.model as base
import torch
import text_dataset
from pyt.dataset import get_dataset
from pyt.model import *
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

flags = tf.app.flags
FLAGS = flags.FLAGS

logging = melt.logging

import horovod.torch as hvd
hvd.init()
# Horovod: pin GPU to local rank.
torch.cuda.set_device(hvd.local_rank())

def main(_):
  FLAGS.torch_only = True
  #FLAGS.valid_input = None
  melt.init()
  fit = melt.get_fit()

  FLAGS.eval_batch_size = 512 * FLAGS.valid_multiplier

  model_name = FLAGS.model
  model = getattr(base, model_name)() 

  loss_fn = nn.BCEWithLogitsLoss()
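
Because every process runs the same script after hvd.init(), side effects such as logging and checkpointing are commonly guarded so that only rank 0 performs them. A small generic sketch (the file name is arbitrary):

import torch
import horovod.torch as hvd

hvd.init()
model = torch.nn.Linear(8, 1)  # placeholder model

if hvd.rank() == 0:            # only the first worker touches the filesystem
    torch.save(model.state_dict(), 'checkpoint.pt')
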
github fhkingma / bitswap / model / imagenet_train.py
    schedule = True if args.schedule == 1 else False
    decay = args.decay
    assert nz > 0

    # setup seeds to maintain experiment reproducibility
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True

    # Distributed: set up horovod over multiple gpu's
    if distributed:
        import horovod.torch as hvd

        # initialize horovod
        hvd.init()

        # pin gpu to "local rank" (see Horovod documentation)
        torch.cuda.set_device(hvd.local_rank())
        print(f"My local rank is {hvd.local_rank()}")

        # distribute mini-batches over the different gpu's
        batch_size //= hvd.size()

    # string-tag for logging
    tag = f'nz{nz}'

    # define the "root process": only one of the gpu's has to log relevant values
    # set only one gpu as root process
    root_process = True
    if distributed and not hvd.rank() == 0:
        root_process = False
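
Note the data-parallel trade-off in this snippet: it divides the global batch size by hvd.size() so the effective batch stays fixed, whereas most of the official Horovod examples keep the per-worker batch size constant and scale the learning rate by hvd.size() instead. Both are valid choices; a one-line sketch of the latter convention, with base_lr as a placeholder:

import horovod.torch as hvd

hvd.init()
base_lr = 0.01                 # learning rate tuned for a single-GPU run
lr = base_lr * hvd.size()      # larger effective batch, proportionally larger lr
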
github horovod / horovod / examples / pytorch_synthetic_benchmark.py
                    help='number of warm-up batches that don\'t count towards benchmark')
parser.add_argument('--num-batches-per-iter', type=int, default=10,
                    help='number of batches per benchmark iteration')
parser.add_argument('--num-iters', type=int, default=10,
                    help='number of benchmark iterations')

parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')

parser.add_argument('--use-adasum', action='store_true', default=False,
                    help='use adasum algorithm to do reduction')

args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()

hvd.init()

if args.cuda:
    # Horovod: pin GPU to local rank.
    torch.cuda.set_device(hvd.local_rank())

cudnn.benchmark = True

# Set up standard model.
model = getattr(models, args.model)()

# By default, Adasum doesn't need scaling up learning rate.
lr_scaler = hvd.size() if not args.use_adasum else 1

if args.cuda:
    # Move model to GPU.
    model.cuda()
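
A common refinement to the lr_scaler logic above in Horovod's example scripts: when Adasum runs on a NCCL-enabled build, gradients are still averaged within each node, so the learning rate is scaled by the local worker count rather than left at 1. A hedged sketch of that adjustment, reusing the surrounding names (args, lr_scaler, hvd) from the snippet:

# If Adasum is combined with a NCCL build, scale the learning rate by the
# number of workers on this node instead of the global size.
if args.use_adasum and hvd.nccl_built():
    lr_scaler = hvd.local_size()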