How to use the horovod.torch.local_rank function in horovod

To help you get started, we’ve selected a few horovod.torch.local_rank examples, based on popular ways it is used in public projects.

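Each example below follows the same basic pattern: call hvd.init() once, then use hvd.local_rank() (the index of the process on its own machine, as opposed to hvd.rank(), its index across the whole job) to pin the process to a single GPU. A minimal sketch of that pattern, assuming horovod.torch is installed and the script is launched with horovodrun:

import torch
import horovod.torch as hvd

hvd.init()

if torch.cuda.is_available():
    # Pin this process to the GPU that matches its local rank on this node.
    torch.cuda.set_device(hvd.local_rank())

print("global rank {} of {}, local rank {}".format(
    hvd.rank(), hvd.size(), hvd.local_rank()))

Launched with, for example, horovodrun -np 4 python train.py, each process then drives one GPU on its node.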

github microsoft / seismic-deeplearning / experiments / interpretation / dutchf3_patch / horovod / train.py
        Options loaded from default.py will be overridden by options loaded from cfg file
        Options passed in through options argument will override option loaded from cfg file

    Args:
        *options (str, int, optional): Options used to override what is loaded from the config.
                                      To see what options are available consult default.py
        cfg (str, optional): Location of config file to load. Defaults to None.
    """

    update_config(config, options=options, config_file=cfg)
    hvd.init()
    silence_other_ranks = True
    logging.config.fileConfig(config.LOG_CONFIG)
    logger = logging.getLogger(__name__)
    torch.manual_seed(config.SEED)
    torch.cuda.set_device(hvd.local_rank())
    torch.cuda.manual_seed(config.SEED)
    rank, world_size = hvd.rank(), hvd.size()

    scheduler_step = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS
    torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.manual_seed(config.SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.SEED)
    np.random.seed(seed=config.SEED)
    # Setup Augmentations
    basic_aug = Compose(
        [
            Normalize(
                mean=(config.TRAIN.MEAN,),
                std=(config.TRAIN.STD,),
                max_pixel_value=1,
github horovod / horovod / test / test_torch.py
        if not _v2_api:
            return

        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        dims = [17] * 3
        tensor = torch.FloatTensor(*dims)

        if rank == 0:
            ret = hvd.join(hvd.local_rank())
        else:
            try:
                broadcasted_tensor = hvd.broadcast(tensor, 1)
                assert False, 'hvd.broadcast did not throw error'
            except (torch.FatalError, RuntimeError):
                pass

            if torch.cuda.is_available():
                ret = hvd.join(hvd.local_rank())
            else:
                ret = hvd.join()
github microsoft / DistributedDeepLearning / HorovodPytorch / src / imagenet_pytorch_horovod.py
def main():
    logger = _get_logger()
    if _DISTRIBUTED:
        # Horovod: initialize Horovod.

        hvd.init()
        logger.info("Runnin Distributed")
        torch.manual_seed(_SEED)
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(_SEED)

    logger.info("PyTorch version {}".format(torch.__version__))

    if _FAKE:
        logger.info("Setting up fake loaders")
        train_dataset = FakeData(n_classes=1000, data_transform=torch.FloatTensor)
    else:
        normalize = transforms.Normalize(_RGB_MEAN, _RGB_SD)

        train_X, train_y, valid_X, valid_y = _create_data_fn(os.getenv('AZ_BATCHAI_INPUT_TRAIN'), os.getenv('AZ_BATCHAI_INPUT_TEST'))

        logger.info("Setting up loaders")
        train_dataset = ImageNet(
            train_X,
            train_y,
github microsoft / nlp-recipes / utils_nlp / models / bert / question_answering_distributed_v1.py
            overwrite_model (bool, optional): Whether to overwrite an existing model.
                If `cache_dir` and `load_model_from_dir` are the same and
                `overwrite_model` is `False`, the fitted model is saved to
                "cache_dir/fine_tuned". Defaults to False.

        """
        # dist
        step_per_log = 100
        is_master = False

        if distributed:
            hvd.init()
            # run = Run.get_context()

            rank = hvd.rank()
            local_rank = hvd.local_rank()
            world_size = hvd.size()

            # torch.distributed.init_process_group(
            #     backend="nccl",
            #     init_method="tcp://127.0.0.1:6000",
            #     world_size=world_size,
            #     rank=local_rank,
            # )

            torch.cuda.set_device(local_rank)
            device = torch.device("cuda", local_rank)
            is_master = rank == 0

            self.cache_dir = self.cache_dir + "/distributed_" + str(rank)

            self.model = self.model.to(device)
github zomux / lanmt / run.py
# Experimental options
ap.add_argument("--opt_fp16", action="store_true")

# Paths
ap.add_argument("--model_path",
                default="{}/lanmt.pt".format(DATA_ROOT))
ap.add_argument("--result_path",
                default="{}/lanmt.result".format(DATA_ROOT))
OPTS.parse(ap)

# Determine the number of GPUs to use
horovod_installed = importlib.util.find_spec("horovod") is not None
if torch.cuda.is_available() and horovod_installed:
    import horovod.torch as hvd
    hvd.init()
    torch.cuda.set_device(hvd.local_rank())
    part_index = hvd.rank()
    part_num = hvd.size()
    gpu_num = hvd.size()
else:
    part_index = 0
    part_num = 1
    gpu_num = 1
if is_root_node():
    print("Running on {} GPUs".format(gpu_num))

# Get the path variables
(
    train_src_corpus,
    train_tgt_corpus,
    distilled_tgt_corpus,
    truncate_datapoints,
github kakaobrain / fast-autoaugment / FastAutoAugment / train.py
def train_and_eval(tag, dataroot, test_ratio=0.0, cv_fold=0, reporter=None, metric='last', save_path=None, only_eval=False, horovod=False):
    if horovod:
        import horovod.torch as hvd
        hvd.init()
        device = torch.device('cuda', hvd.local_rank())
        torch.cuda.set_device(device)

    if not reporter:
        reporter = lambda **kwargs: 0

    max_epoch = C.get()['epoch']
    trainsampler, trainloader, validloader, testloader_ = get_dataloaders(C.get()['dataset'], C.get()['batch'], dataroot, test_ratio, split_idx=cv_fold, horovod=horovod)

    # create a model & an optimizer
    model = get_model(C.get()['model'], num_class(C.get()['dataset']), data_parallel=(not horovod))

    criterion = nn.CrossEntropyLoss()
    if C.get()['optimizer']['type'] == 'sgd':
        optimizer = optim.SGD(
            model.parameters(),
            lr=C.get()['lr'],
github microsoft / nlp-recipes / scenarios / sentence_similarity / gensen_train.py
import torch.optim as optim

from utils_nlp.gensen.multi_task_model import MultitaskModel
from utils_nlp.gensen.utils import (
    BufferedDataIterator,
    NLIIterator,
    compute_validation_loss,
)

cudnn.benchmark = True
logger = logging.getLogger(__name__)

hvd.init()
if torch.cuda.is_available():
    # Horovod: pin GPU to local rank.
    torch.cuda.set_device(hvd.local_rank())


def metric_average(value, name):
    """
    Average a metric value (such as the validation loss) across all workers.

    :param value: metric value computed on this worker
    :param name: name used for the Horovod allreduce operation
    :return: the metric value averaged across all workers
    """
    tensor = torch.tensor(value)
    avg_tensor = hvd.allreduce(tensor, name=name)
    return avg_tensor.item()


def setup_horovod(model, learning_rate):
    """ Setup for Horovod usage.
github jzlianglu / pykaldi2 / bin / train_transformer_se.py
    config['data_path'] = args.dataPath
    config["sweep_size"] = args.sweep_size

    print("pytorch version:{}".format(th.__version__))

    with open(args.data) as f:
        data = yaml.safe_load(f)
        config["source_paths"] = [j for i, j in data['clean_source'].items()]

    print("Experiment starts with config {}".format(json.dumps(config, sort_keys=True, indent=4)))

    # Initialize Horovod
    hvd.init()

    th.cuda.set_device(hvd.local_rank())

    print("Run experiments with world size {}".format(hvd.size()))

    dataset = SpeechDataset(config)
    transform=None
    if args.transform is not None and os.path.isfile(args.transform):
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)
            dataset.transform = transform

    train_dataloader = SeqDataloader(dataset,
                                    batch_size=args.batch_size,
                                    num_workers = args.data_loader_threads,
                                    distributed=True,
                                    test_only=False)
github horovod / horovod / horovod / spark / torch / remote.py
            local_vars = locals()
            loss_fns = [loss_constructor(**local_vars) for loss_constructor in loss_constructors]

        # Horovod: initialize library.
        hvd.init()

        if not user_shuffle_buffer_size:
            shuffle_buffer_size = \
                calculate_shuffle_buffer_size(hvd, avg_row_size, train_rows / hvd.size())
        else:
            shuffle_buffer_size = user_shuffle_buffer_size

        cuda_available = torch.cuda.is_available()
        if cuda_available:
            # Horovod: pin GPU to local rank.
            torch.cuda.set_device(hvd.local_rank())
            # Move model to GPU.
            model.cuda()

        # The optimizer object needs to be re-instantiated. Internally, it uses the memory
        # addresses of objects as their identity, so it cannot simply be serialized and
        # deserialized: the deserialized optimizer would still refer to the parameters by
        # their old memory addresses, which no longer match the reconstructed model's
        # parameters, and that causes problems.
        # Learning rate is a required parameter of the SGD optimizer; it will be overridden
        # by load_state_dict.
        optimizer = optimizer_cls(model.parameters(), lr=1)
        optimizer_state = model_opt_state['optimizer']

        if last_checkpoint_state is not None:
            model.load_state_dict(last_checkpoint_state['model'])
            optimizer.load_state_dict(last_checkpoint_state['optimizer'])
github yngtodd / hyperspace / examples / horovod_torch.py
                        help='random seed (default: 42)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--fp16-allreduce', action='store_true', default=False,
                        help='use fp16 compression during allreduce')
    parser.add_argument('--results_path', type=str, help="Path to store results")
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(args.seed)

    if args.cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)


    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    train_dataset = \
        datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True,
                    transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ]))

    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)