How to use the horovod.torch.DistributedOptimizer function in horovod

To help you get started, we’ve selected a few horovod examples, based on popular ways the function is used in public projects.
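Every example below follows the same four steps: initialize Horovod, build a standard torch.optim optimizer, broadcast the initial model and optimizer state from rank 0, and wrap the optimizer with hvd.DistributedOptimizer so that optimizer.step() averages gradients across workers. A minimal sketch of the pattern (the model and learning rate here are placeholders, not taken from any of the projects below):

import torch
import horovod.torch as hvd

hvd.init()  # one process per GPU
if torch.cuda.is_available():
    torch.cuda.set_device(hvd.local_rank())  # pin this process to its GPU

model = torch.nn.Linear(10, 2)  # placeholder model
if torch.cuda.is_available():
    model.cuda()

# Common convention: scale the learning rate by the number of workers.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01 * hvd.size())

# Start every worker from identical weights and optimizer state.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)

# Gradients are now allreduce-averaged across workers on each step().
optimizer = hvd.DistributedOptimizer(
    optimizer, named_parameters=model.named_parameters())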

github horovod / horovod / test / test_torch.py
class Net(torch.nn.Module):
            def __init__(self):
                super(Net, self).__init__()
                self.conv1 = torch.nn.Conv2d(1, 100, 1)
                self.conv2 = torch.nn.Conv2d(100, 1, 1)

            def forward(self, x):
                x = self.conv1(x)
                x = self.conv2(x)
                return x

        model = Net()
        opt = torch.optim.SGD(model.parameters(), lr=0.1)
        try:
            hvd.DistributedOptimizer(opt,
                named_parameters=list(model.named_parameters())[0:1])
            assert False, 'hvd.DistributedOptimizer did not throw error'
        except ValueError:
            pass
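This test passes only a slice of the named parameters on purpose: hvd.DistributedOptimizer checks that named_parameters covers every parameter the wrapped optimizer updates and raises ValueError when it does not. The correct call hands over the full iterator; a sketch against the same Net model:

opt = torch.optim.SGD(model.parameters(), lr=0.1)
opt = hvd.DistributedOptimizer(opt, named_parameters=model.named_parameters())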
github jzlianglu / pykaldi2 / bin / train_se.py
# create model
    model_config = config["model_config"]
    model = lstm.LSTMAM(model_config["feat_dim"],
                        model_config["label_size"],
                        model_config["hidden_size"],
                        model_config["num_layers"],
                        model_config["dropout"],
                        True)

    model.cuda()

    # setup the optimizer
    optimizer = th.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    # Broadcast parameters and optimizer state from rank 0 to all other processes.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

    if os.path.isfile(args.seed_model):
        checkpoint = th.load(args.seed_model)                                            
        state_dict = checkpoint['model']                                            
        model.load_state_dict(state_dict)                                           
        print("=> loaded checkpoint '{}' ".format(args.seed_model))                      
    else:
        sys.stderr.write('ERROR: The model file %s does not exist!\n'%(args.seed_model))
        sys.exit(1)  # non-zero exit status on error

    HCLG = args.den_dir + "/HCLG.fst"
    words_txt = args.den_dir + "/words.txt"
    silence_phones = args.den_dir + "/phones/silence.csl"

    if not os.path.isfile(HCLG):
        sys.stderr.write('ERROR: The HCLG file %s does not exist!\n'%(HCLG))
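After wrapping, the training loop is written exactly as in single-GPU PyTorch: the allreduce is launched from gradient hooks during backward(), and optimizer.step() blocks until it completes before applying the update. A sketch of one step, where feats, labels, and criterion are hypothetical stand-ins for this script's actual batch handling:

optimizer.zero_grad()
loss = criterion(model(feats), labels)
loss.backward()   # gradient hooks launch the allreduce in the background
optimizer.step()  # waits for the allreduce, then applies the update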
github microsoft / nlp-recipes / scenarios / sentence_similarity / gensen_train.py
Returns: hvd.DistributedOptimizer: Optimizer to use for computing
    gradients and applying updates.

    """
    # Horovod: scale learning rate by the number of GPUs.
    optimizer = optim.Adam(model.parameters(), lr=learning_rate * hvd.size())

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
    )

    return optimizer
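hvd.Compression.fp16 casts gradient tensors to float16 for the allreduce only; weights and optimizer state stay in full precision. One interaction to be aware of: if you clip gradients, the averaged gradients must be ready before clipping, which is what optimizer.synchronize() is for. A sketch of that pattern (not from gensen_train.py):

loss.backward()
optimizer.synchronize()  # finish the allreduce before touching gradients
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
with optimizer.skip_synchronize():  # step() without a second synchronize
    optimizer.step()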
github jzlianglu / pykaldi2 / bin / train_ce.py
# Start training
    th.backends.cudnn.enabled = True
    if th.cuda.is_available():
        model.cuda()

    # optimizer
    optimizer = th.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)

    if args.hvd:
        # Broadcast parameters and optimizer state from rank 0 to all other processes.
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        # Add Horovod Distributed Optimizer
        optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

    # criterion
    criterion = nn.CrossEntropyLoss(ignore_index=-100)

    start_epoch = 0
    if args.resume_from_model:

        assert os.path.isfile(args.resume_from_model), "ERROR: model file {} does not exist!".format(args.resume_from_model)

        checkpoint = th.load(args.resume_from_model)
        state_dict = checkpoint['model']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(state_dict)
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("=> loaded checkpoint '{}' ".format(args.resume_from_model))
github polyaxon / polyaxon / examples / in_cluster / horovod / pytorch / synthetic_benchmark.py
cudnn.benchmark = True

# Set up standard model.
model = getattr(models, args.model)()

if args.cuda:
    # Move model to GPU.
    model.cuda()

optimizer = optim.SGD(model.parameters(), lr=0.01)

# Horovod: (optional) compression algorithm.
compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

# Horovod: wrap optimizer with DistributedOptimizer.
optimizer = hvd.DistributedOptimizer(optimizer,
                                     named_parameters=model.named_parameters(),
                                     compression=compression)

# Horovod: broadcast parameters & optimizer state.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)

# Set up fixed fake data
data = torch.randn(args.batch_size, 3, 224, 224)
target = torch.LongTensor(args.batch_size).random_() % 1000
if args.cuda:
    data, target = data.cuda(), target.cuda()


def benchmark_step():
    optimizer.zero_grad()
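The excerpt cuts off inside benchmark_step; in Horovod's upstream synthetic benchmark the rest of the function is a plain forward/backward/update, roughly:

import torch.nn.functional as F

def benchmark_step():
    optimizer.zero_grad()
    output = model(data)
    loss = F.cross_entropy(output, target)
    loss.backward()   # allreduce runs during backward
    optimizer.step()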
github microsoft / nlp-recipes / utils_nlp / models / bert / question_answering_distributed_v2.py
for n, p in self.model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.01,
            },
            {
                "params": [
                    p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-6)

        if distributed:
            optimizer = hvd.DistributedOptimizer(
                optimizer,
                named_parameters=self.model.named_parameters(),
                backward_passes_per_step=gradient_accumulation_steps,
            )

            hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
            hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        if warmup_proportion:
            warmup_steps = t_total * warmup_proportion
        else:
            warmup_steps = 0

        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)

        global_step = 0
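backward_passes_per_step tells the wrapper how many backward() calls to expect between optimizer steps, so gradients from gradient accumulation are combined locally and only reduced across workers once per step(). The loop shape that matches this configuration, as a sketch with a hypothetical train_loader and a model that returns its loss first (transformers-style):

optimizer.zero_grad()
for step, batch in enumerate(train_loader):
    loss = model(**batch)[0] / gradient_accumulation_steps
    loss.backward()  # accumulated locally, no cross-worker reduce yet
    if (step + 1) % gradient_accumulation_steps == 0:
        optimizer.step()  # allreduce + parameter update
        scheduler.step()
        optimizer.zero_grad()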
github vlimant / mpi_learn / mpi_learn / train / optimizer.py
def build_torch(self, model):
        import torch
        lookup = {
            'sgd':      torch.optim.SGD,
            'adadelta': torch.optim.Adadelta,
            'rmsprop':  torch.optim.RMSprop,
            'adam':     torch.optim.Adam
            }
        if self.name not in lookup:
            logging.warning("No optimizer '{}' found, using SGD instead".format(self.name))
            self.name = 'sgd'
        opt = lookup[self.name](model.parameters(), **self.config)
        if self.horovod_wrapper:
            import horovod.torch as hvd
            opt = hvd.DistributedOptimizer(opt, named_parameters=model.named_parameters())
        return opt
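Because hvd.DistributedOptimizer accepts any torch.optim.Optimizer, a factory like this needs no per-optimizer Horovod branches. A hypothetical use, assuming the enclosing class is constructed with name, config, and horovod_wrapper attributes (the real mpi_learn constructor may differ):

builder = Optimizer(name='adam', config={'lr': 1e-3}, horovod_wrapper=True)
opt = builder.build_torch(model)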
github microsoft / seismic-deeplearning / experiments / interpretation / dutchf3_patch / horovod / train.py
class_weights = torch.tensor(
        config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False
    )

    criterion = torch.nn.CrossEntropyLoss(
        weight=class_weights, ignore_index=255, reduction="mean"
    )
    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if config.HOROVOD.FP16 else hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(optimizer,
                                         named_parameters=model.named_parameters(),
                                         compression=compression)

    # summary_writer = create_summary_writer(log_dir=config.LOG_DIR)
    snapshot_duration = scheduler_step * len(train_loader)
    warmup_duration = 5 * len(train_loader)
    warmup_scheduler = LinearCyclicalScheduler(
        optimizer,
        "lr",
        start_value=config.TRAIN.MAX_LR,
        end_value=config.TRAIN.MAX_LR * world_size,
        cycle_size=10 * len(train_loader),
    )
    cosine_scheduler = CosineAnnealingScheduler(
        optimizer,
        "lr",