How to use the horovod.torch.broadcast_parameters function in horovod

To help you get started, we’ve selected a few horovod examples based on popular ways it is used in public projects.
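At its core, hvd.broadcast_parameters copies a model's state_dict from the root rank to every other worker so all processes start training from identical weights; it is almost always paired with hvd.broadcast_optimizer_state and hvd.DistributedOptimizer. Before diving into the examples, here is a minimal, hedged sketch of the typical call sequence (the tiny nn.Linear model is a stand-in, not taken from any project below):

import torch
import torch.optim as optim
import horovod.torch as hvd

hvd.init()                                # start Horovod
torch.cuda.set_device(hvd.local_rank())   # pin each process to one GPU

model = torch.nn.Linear(10, 2).cuda()     # placeholder model
optimizer = optim.SGD(model.parameters(), lr=0.01 * hvd.size())

# Broadcast parameters and optimizer state from rank 0 so every
# worker starts from the same point.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)

# Wrap the optimizer so gradients are averaged across workers.
optimizer = hvd.DistributedOptimizer(
    optimizer, named_parameters=model.named_parameters())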


github jzlianglu / pykaldi2 / bin / train_se2.py
print("Number of minibatches: {}".format(len(train_dataloader)))

    if not os.path.isdir(args.exp_dir):
        os.makedirs(args.exp_dir)

    # create model
    model_config = config["model_config"]
    model = lstm.LSTMAM(model_config["feat_dim"], model_config["label_size"], model_config["hidden_size"], model_config["num_layers"], model_config["dropout"], True)

    model.cuda()

    # setup the optimizer
    optimizer = th.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    # Broadcast parameters and optimizer state from rank 0 to all other processes.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

    if os.path.isfile(args.seed_model):
        checkpoint = th.load(args.seed_model)
        state_dict = checkpoint['model']
        model.load_state_dict(state_dict)
        print("=> loaded checkpoint '{}'".format(args.seed_model))
    else:
        sys.stderr.write('ERROR: The model file %s does not exist!\n' % args.seed_model)
        sys.exit(1)

    HCLG = args.den_dir + "/HCLG.fst"
    words_txt = args.den_dir + "/words.txt"
github horovod / horovod / examples / pytorch_mnist.py
# By default, Adasum doesn't need scaling up learning rate.
lr_scaler = hvd.size() if not args.use_adasum else 1

if args.cuda:
    # Move model to GPU.
    model.cuda()
    # If using GPU Adasum allreduce, scale learning rate by local_size.
    if args.use_adasum and hvd.nccl_built():
        lr_scaler = hvd.local_size()

# Horovod: scale learning rate by lr_scaler.
optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler,
                      momentum=args.momentum)

# Horovod: broadcast parameters & optimizer state.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)

# Horovod: (optional) compression algorithm.
compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

# Horovod: wrap optimizer with DistributedOptimizer.
optimizer = hvd.DistributedOptimizer(optimizer,
                                     named_parameters=model.named_parameters(),
                                     compression=compression,
                                     op=hvd.Adasum if args.use_adasum else hvd.Average)


def train(epoch):
    model.train()
    # Horovod: set epoch to sampler for shuffling.
    train_sampler.set_epoch(epoch)
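The upstream MNIST example pairs this with averaging evaluation metrics across workers via hvd.allreduce; a minimal sketch of that helper (the name metric_average mirrors the upstream example):

import torch
import horovod.torch as hvd

def metric_average(val, name):
    # hvd.allreduce averages the value across all workers by default.
    tensor = torch.tensor(val)
    return hvd.allreduce(tensor, name=name).item()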
github mit-han-lab / ProxylessNAS / training / main.py
# Restore from a previous checkpoint, if initial_epoch is specified.
# Horovod: restore on the first worker which will broadcast weights to other workers.
if resume_from_epoch > 0 and hvd.rank() == 0:
    filepath = args.checkpoint_format.format(epoch=resume_from_epoch)
    checkpoint = torch.load(filepath)
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])

if args.label_smoothing:
    criterion = cross_encropy_with_label_smoothing
else:
    criterion = nn.CrossEntropyLoss()


# Horovod: broadcast parameters & optimizer state.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)


def train(epoch):
    model.train()
    train_sampler.set_epoch(epoch)
    train_loss = Metric('train_loss')
    train_accuracy = Metric('train_accuracy')

    with tqdm(total=len(train_loader),
              desc='Train Epoch     #{}'.format(epoch + 1),
              disable=not verbose) as t:
        for batch_idx, (data, target) in enumerate(train_loader):
            lr_cur = adjust_learning_rate(epoch, batch_idx, type=args.lr_scheduler)

            if args.cuda:
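Because the checkpoint is read on rank 0 only, the other workers also need to learn which epoch to resume from; the broadcast calls above only cover tensors registered in the model and optimizer. A hedged sketch of one way to share that scalar (resume_from_epoch mirrors the variable in the snippet):

import torch
import horovod.torch as hvd

resume_from_epoch = 0   # set from the checkpoint on rank 0
# Broadcast the epoch counter so all workers agree on it.
resume_from_epoch = hvd.broadcast(torch.tensor(resume_from_epoch),
                                  root_rank=0,
                                  name='resume_from_epoch').item()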
github chenghuige / wenzheng / utils / melt / eager / train.py
optimizer, is_dynamic_opt = get_torch_optimizer(optimizer, model, num_steps_per_epoch, param_groups)

    start_epoch = 0  
    latest_path = latest_checkpoint + '.pyt' if latest_checkpoint else os.path.join(FLAGS.model_dir, 'latest.pyt')
    if not os.path.exists(latest_path):
      latest_path = os.path.join(FLAGS.model_dir, 'latest.pyt')
    if os.path.exists(latest_path):
      # NOTICE: only rank 0 may load the checkpoint; loading on all ranks at once will OOM
      if not use_horovod or hvd.rank() == 0:
        logging.info('loading torch model from', latest_path)
        checkpoint = load_torch_model(model, latest_path)
        if FLAGS.torch_load_optimizer:
          optimizer.load_state_dict(checkpoint['optimizer']) 

      if use_horovod:  
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        optimizer_ = optimizer if not hasattr(optimizer, 'optimizer') else optimizer.optimizer 
        hvd.broadcast_optimizer_state(optimizer_, root_rank=0)
   
      if not FLAGS.torch_finetune:
        if not use_horovod or hvd.rank() == 0:
          start_epoch = checkpoint['epoch']
          step = checkpoint['step']
        else:
          start_epoch = 0
          step = 0
      
        if use_horovod:
          temp = np.array([start_epoch, step])
          comm.Bcast(temp, root=0)
          start_epoch, step = temp[0], temp[1]
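The snippet reaches for mpi4py's comm.Bcast to sync the scalar bookkeeping; newer Horovod releases can do the same without the extra dependency via hvd.broadcast_object. A hedged sketch:

import horovod.torch as hvd

start_epoch, step = 0, 0   # set from the checkpoint on rank 0
# broadcast_object pickles the tuple on rank 0 and unpickles it everywhere.
start_epoch, step = hvd.broadcast_object((start_epoch, step), root_rank=0)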
github jzlianglu / pykaldi2 / bin / train_ce.py
# create model
    model_config = config["model_config"]
    model = lstm.LSTMStack(model_config["feat_dim"], model_config["label_size"], model_config["hidden_size"], model_config["num_layers"], model_config["dropout"], True)

    # Start training
    th.backends.cudnn.enabled = True
    if th.cuda.is_available():
        model.cuda()

    # optimizer
    optimizer = th.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)

    if args.hvd:
        # Broadcast parameters and optimizer state from rank 0 to all other processes.
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        # Add Horovod Distributed Optimizer
        optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

    # criterion
    criterion = nn.CrossEntropyLoss(ignore_index=-100)

    start_epoch = 0
    if args.resume_from_model:

        assert os.path.isfile(args.resume_from_model), "ERROR: model file {} does not exist!".format(args.resume_from_model)

        checkpoint = th.load(args.resume_from_model)
        state_dict = checkpoint['model']
        start_epoch = checkpoint['epoch']
github RomanovMikeV / setka / scorch / train.py
net = model.Network(**model_kwargs)

    # Model CPU -> GPU
    if use_cuda:
        torch.cuda.set_device(hvd.local_rank())
        net = net.cuda()

    # Creating a Socket for a network
    socket = model.Socket(net)

    # Preparing Socket for Horovod training
    for opt_index in range(len(socket.optimizers)):
        socket.optimizers[opt_index].optimizer = hvd.DistributedOptimizer(
            socket.optimizers[opt_index].optimizer)

    hvd.broadcast_parameters(socket.model.state_dict(), root_rank=0)

    # Creating the datasets
    ds_index = dataset.DataSetIndex(**dataset_kwargs)

    train_dataset = dataset.DataSet(
        ds_index, mode='train')

    valid_dataset = dataset.DataSet(
        ds_index, mode='valid')

    test_dataset = dataset.DataSet(
        ds_index, mode='test')

    # Preparing the datasets for Horovod
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
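The DistributedSampler only takes effect once it is handed to a DataLoader, and it should be told the epoch so shuffling differs between epochs. A hedged sketch of the usual wiring (batch size and worker count are illustrative; train_dataset and train_sampler are the names from the snippet):

import torch

train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=32, sampler=train_sampler, num_workers=4)

for epoch in range(10):
    train_sampler.set_epoch(epoch)   # reshuffle differently each epoch
    for batch in train_loader:
        ...                          # training step goes here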
github tczhangzhi / pytorch-distributed / horovod_distributed.py
# create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    model.cuda()
    # When using a single GPU per process, we need to divide the
    # per-node batch size by the number of GPUs on the node
    args.batch_size = int(args.batch_size / ngpus_per_node)
    
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)
    compression = hvd.Compression.fp16

    optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters(), compression=compression)

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
github microsoft / nlp-recipes / utils_nlp / models / bert / question_answering_distributed_v1.py
"params": [
                    p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-6)

        if distributed:
            optimizer = hvd.DistributedOptimizer(
                optimizer,
                named_parameters=self.model.named_parameters(),
                backward_passes_per_step=gradient_accumulation_steps,
            )

            hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
            hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        if warmup_proportion:
            warmup_steps = t_total * warmup_proportion
        else:
            warmup_steps = 0

        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)

        global_step = 0
        tr_loss = 0.0
        # self.model.zero_grad()
        self.model.train()
        train_iterator = trange(int(num_epochs), desc="Epoch")
        for _ in train_iterator:
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", mininterval=60)):
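backward_passes_per_step tells the DistributedOptimizer how many backward() calls to accumulate locally before gradients are exchanged, so it has to match the training loop's accumulation logic. A hedged sketch of the corresponding loop body (compute_loss is a placeholder; the other names come from the snippet):

for step, batch in enumerate(train_dataloader):
    loss = compute_loss(batch)                       # placeholder forward pass
    (loss / gradient_accumulation_steps).backward()  # accumulate gradients
    if (step + 1) % gradient_accumulation_steps == 0:
        optimizer.step()        # synchronizes the allreduced gradients
        optimizer.zero_grad()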
github microsoft / DistributedDeepLearning / HorovodPytorch / src / imagenet_pytorch_horovod.py
kwargs = {'num_workers': 6, 'pin_memory': True}
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=_BATCHSIZE, sampler=train_sampler, **kwargs)

    # Autotune
    cudnn.benchmark = True

    logger.info("Loading model")
    # Load symbol
    model = models.__dict__['resnet50'](pretrained=False)

    model.cuda()

    if _DISTRIBUTED:
        # Horovod: broadcast parameters.
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    num_gpus = hvd.size() if _DISTRIBUTED else 1
    # Horovod: scale learning rate by the number of GPUs.
    optimizer = optim.SGD(model.parameters(), lr=_LR * num_gpus,
                          momentum=0.9)
    if _DISTRIBUTED:
        # Horovod: wrap optimizer with DistributedOptimizer.
        optimizer = hvd.DistributedOptimizer(
            optimizer, named_parameters=model.named_parameters())

    criterion = F.cross_entropy
    # Main training-loop
    logger.info("Training ...")
    for epoch in range(_EPOCHS):
        with Timer(output=logger.info, prefix="Training") as t:
            model.train()
github jzlianglu / pykaldi2 / bin / train_chain.py
print("Number of minibatches: {}".format(len(train_dataloader)))

    if not os.path.isdir(args.exp_dir):
        os.makedirs(args.exp_dir)

    # create model
    model_config = config["model_config"]
    model = lstm.LSTMAM(model_config["feat_dim"], model_config["label_size"], model_config["hidden_size"], model_config["num_layers"], model_config["dropout"], True)

    model.cuda()

    # setup the optimizer
    optimizer = th.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)

    # Broadcast parameters and optimizer state from rank 0 to all other processes.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

    if os.path.isfile(args.seed_model):
        checkpoint = th.load(args.seed_model)
        state_dict = checkpoint['model']
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            header = k[:7]
            name = k[7:]  # remove the 'module.' prefix added by DataParallel
            new_state_dict[name] = v
        if header == "module.":
            model.load_state_dict(new_state_dict)
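The excerpt cuts off above; for reference, a hedged sketch of the complete 'module.' prefix-stripping idiom it implements (the function name strip_dataparallel_prefix is illustrative):

from collections import OrderedDict

def strip_dataparallel_prefix(state_dict):
    # nn.DataParallel prefixes every key with 'module.'; remove it
    # so a bare (unwrapped) model can load the checkpoint.
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        name = k[7:] if k.startswith('module.') else k
        new_state_dict[name] = v
    return new_state_dict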