How to use the horovod.mxnet.rank function in horovod

To help you get started, we’ve selected a few horovod examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github horovod / horovod / test / test_mxnet.py View on Github external
# Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3 or dtype in ['int32', 'int64']:
                threshold = 1
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                break

            if max_difference > threshold:
                print("average", count, dtype, dim, max_difference, threshold)
                print("tensor", hvd.rank(), tensor)
                print("averaged", hvd.rank(), averaged)
            assert max_difference <= threshold, 'hvd.allreduce produces \
                                                 incorrect results for average'
github horovod / horovod / test / test_mxnet.py View on Github external
broadcast_tensor = hvd.broadcast(tensor, root_rank=root_rank,
                                             name=str(count))
            if rank != root_rank:
                if same(tensor.asnumpy(), root_tensor.asnumpy()):
                    print("broadcast", count, dtype, dim,
                          mx.nd.max(tensor == root_tensor))
                    print("tensor", hvd.rank(), tensor)
                    print("root_tensor", hvd.rank(), root_tensor)
                    print("comparison", hvd.rank(), tensor == root_tensor)
                assert not same(tensor.asnumpy(), root_tensor.asnumpy()), \
                    'hvd.broadcast modifies source tensor'
            if not same(broadcast_tensor.asnumpy(), root_tensor.asnumpy()):
                print("broadcast", count, dtype, dim)
                print("broadcast_tensor", hvd.rank(), broadcast_tensor)
                print("root_tensor", hvd.rank(), root_tensor)
                print("comparison", hvd.rank(),
                      broadcast_tensor == root_tensor)
            assert same(broadcast_tensor.asnumpy(), root_tensor.asnumpy()), \
                'hvd.broadcast produces incorrect broadcasted tensor'
github dmlc / gluon-nlp / scripts / bert / finetune_squad.py View on Github external
fh = logging.FileHandler(os.path.join(args.output_dir, 'finetune_squad.log'),
                         mode='w')
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(formatter)
log.addHandler(console)
log.addHandler(fh)

log.info(args)

if args.comm_backend == 'horovod':
    import horovod.mxnet as hvd
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()
    local_rank = hvd.local_rank()
else:
    rank = 0
    size = 1
    local_rank = 0

if args.dtype == 'float16':
    from mxnet.contrib import amp
    amp.init()

model_name = args.bert_model
dataset_name = args.bert_dataset
only_predict = args.only_predict
model_parameters = args.model_parameters
pretrained_bert_parameters = args.pretrained_bert_parameters
github dmlc / gluon-nlp / scripts / pretraining / run_electra.py View on Github external
def init_comm(backend, gpus):
    """Init communication backend"""
    # backend specific implementation
    if backend == 'horovod':
        try:
            import horovod.mxnet as hvd  # pylint: disable=import-outside-toplevel
        except ImportError:
            logging.info('horovod must be installed.')
            sys.exit(1)
        hvd.init()
        store = None
        num_workers = hvd.size()
        rank = hvd.rank()
        local_rank = hvd.local_rank()
        is_master_node = rank == local_rank
        ctx_l = [mx.gpu(local_rank)]
        logging.info('GPU communication supported by horovod')
    else:
        store = mx.kv.create(backend)
        num_workers = store.num_workers
        rank = store.rank
        local_rank = 0
        is_master_node = rank == local_rank
        if gpus == '-1' or gpus == '':
            ctx_l = [mx.cpu()]
            logging.info('Runing on CPU')
        else:
            ctx_l = [mx.gpu(int(x)) for x in gpus.split(',')]
            logging.info('GPU communication supported by KVStore')
github dmlc / gluon-nlp / scripts / bert / run_pretraining.py View on Github external
def init_comm(backend):
    """Init communication backend"""
    # backend specific implementation
    if backend == 'horovod':
        try:
            import horovod.mxnet as hvd  # pylint: disable=import-outside-toplevel
        except ImportError:
            logging.info('horovod must be installed.')
            sys.exit(1)
        hvd.init()
        store = None
        num_workers = hvd.size()
        rank = hvd.rank()
        local_rank = hvd.local_rank()
        is_master_node = rank == local_rank
        ctxs = [mx.gpu(local_rank)]
    else:
        # kvstore
        store = mx.kv.create(backend)
        num_workers = store.num_workers
        rank = store.rank
        local_rank = 0
        is_master_node = rank == local_rank
        ctxs = [mx.cpu()] if args.gpus is None or args.gpus == '' else \
               [mx.gpu(int(x)) for x in args.gpus.split(',')]
    return store, num_workers, rank, local_rank, is_master_node, ctxs
github hpi-xnor / BMXNet-v2 / example / distributed_training-horovod / gluon_mnist.py View on Github external
dirname=data_dir)
    with zipfile.ZipFile(zip_file_path) as zf:
        zf.extractall(data_dir)

    input_shape = (1, 28, 28)
    batch_size = args.batch_size

    train_iter = mx.io.MNISTIter(
        image="%s/train-images-idx3-ubyte" % data_dir,
        label="%s/train-labels-idx1-ubyte" % data_dir,
        input_shape=input_shape,
        batch_size=batch_size,
        shuffle=True,
        flat=False,
        num_parts=hvd.size(),
        part_index=hvd.rank()
    )

    val_iter = mx.io.MNISTIter(
        image="%s/t10k-images-idx3-ubyte" % data_dir,
        label="%s/t10k-labels-idx1-ubyte" % data_dir,
        input_shape=input_shape,
        batch_size=batch_size,
        flat=False,
    )

    return train_iter, val_iter
github hpi-xnor / BMXNet-v2 / example / distributed_training-horovod / module_mnist.py View on Github external
input_shape=input_shape,
        batch_size=batch_size,
        shuffle=True,
        flat=False,
        num_parts=hvd.size(),
        part_index=hvd.rank()
    )

    val_iter = mx.io.MNISTIter(
        image="%s/t10k-images-idx3-ubyte" % data_dir,
        label="%s/t10k-labels-idx1-ubyte" % data_dir,
        input_shape=input_shape,
        batch_size=batch_size,
        flat=False,
        num_parts=hvd.size(),
        part_index=hvd.rank()
    )

    return train_iter, val_iter
github awslabs / sagemaker-debugger / examples / mxnet / scripts / mnist_mxnet_hvd.py View on Github external
zip_file_path = download("http://data.mxnet.io/mxnet/data/mnist.zip", dirname=data_dir)
    with zipfile.ZipFile(zip_file_path) as zf:
        zf.extractall(data_dir)

    input_shape = (1, 28, 28)
    batch_size = args.batch_size

    train_iter = mx.io.MNISTIter(
        image="%s/train-images-idx3-ubyte" % data_dir,
        label="%s/train-labels-idx1-ubyte" % data_dir,
        input_shape=input_shape,
        batch_size=batch_size,
        shuffle=True,
        flat=False,
        num_parts=hvd.size(),
        part_index=hvd.rank(),
    )

    val_iter = mx.io.MNISTIter(
        image="%s/t10k-images-idx3-ubyte" % data_dir,
        label="%s/t10k-labels-idx1-ubyte" % data_dir,
        input_shape=input_shape,
        batch_size=batch_size,
        flat=False,
    )

    return train_iter, val_iter
github hpi-xnor / BMXNet-v2 / example / distributed_training-horovod / module_mnist.py View on Github external
dirname=data_dir)
    with zipfile.ZipFile(zip_file_path) as zf:
        zf.extractall(data_dir)

    input_shape = (1, 28, 28)
    batch_size = args.batch_size

    train_iter = mx.io.MNISTIter(
        image="%s/train-images-idx3-ubyte" % data_dir,
        label="%s/train-labels-idx1-ubyte" % data_dir,
        input_shape=input_shape,
        batch_size=batch_size,
        shuffle=True,
        flat=False,
        num_parts=hvd.size(),
        part_index=hvd.rank()
    )

    val_iter = mx.io.MNISTIter(
        image="%s/t10k-images-idx3-ubyte" % data_dir,
        label="%s/t10k-labels-idx1-ubyte" % data_dir,
        input_shape=input_shape,
        batch_size=batch_size,
        flat=False,
        num_parts=hvd.size(),
        part_index=hvd.rank()
    )

    return train_iter, val_iter