How to use the gluonnlp.data.batchify module in gluonnlp

To help you get started, we’ve selected a few gluonnlp.data.batchify examples, based on popular ways it is used in public projects. The batchify module provides collate functions such as Stack, Pad, Tuple, List and NamedTuple that turn lists of individual samples into mini-batches, typically passed to a Gluon DataLoader as its batchify_fn.

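Before the longer examples, here is a minimal sketch of the core API. The toy samples and shapes are made up for illustration, and it assumes the gluonnlp 0.x API used by most of the snippets below.

import mxnet as mx
import gluonnlp as nlp

# Each sample is a (token_ids, label) pair; token_ids vary in length.
samples = [([1, 2, 3, 4], 0), ([5, 6], 1), ([7, 8, 9], 0)]

# Pad the variable-length field, stack the scalar field, and zip them together.
batchify_fn = nlp.data.batchify.Tuple(
    nlp.data.batchify.Pad(axis=0, pad_val=0),  # token_ids -> (batch, max_len)
    nlp.data.batchify.Stack())                 # label     -> (batch,)

token_ids, labels = batchify_fn(samples)       # shapes (3, 4) and (3,)

# The same callable plugs into a DataLoader as its batchify_fn.
dataset = mx.gluon.data.SimpleDataset(samples)
loader = mx.gluon.data.DataLoader(dataset, batch_size=2, batchify_fn=batchify_fn)
for token_ids, labels in loader:
    pass  # token_ids and labels are batched NDArrays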

github eric-haibin-lin / AMLC19-GluonNLP / 04_contextual_representation / bert / finetune_classifier.py
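
This example fine-tunes BERT for sentence classification. Each transformed record is an (input_ids, length, segment_ids, label) tuple, so the batchify function pads the two variable-length fields and stacks the length and label, while a FixedBucketSampler groups records of similar length to keep padding to a minimum.
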
    trans = BERTDatasetTransform(tokenizer, max_len,
                                 class_labels=task.class_labels,
                                 label_alias=task.label_alias,
                                 pad=pad, pair=task.is_pair,
                                 has_label=True)

    # data train
    # task.dataset_train returns (segment_name, dataset)
    train_tsv = task.dataset_train()[1]
    data_train = mx.gluon.data.SimpleDataset(pool.map(trans, train_tsv))
    data_train_len = data_train.transform(
        lambda input_id, length, segment_id, label_id: length, lazy=False)
    # bucket sampler for training
    batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Pad(axis=0), nlp.data.batchify.Stack(),
        nlp.data.batchify.Pad(axis=0), nlp.data.batchify.Stack(label_dtype))
    batch_sampler = nlp.data.sampler.FixedBucketSampler(
        data_train_len,
        batch_size=batch_size,
        num_buckets=10,
        ratio=0,
        shuffle=True)
    # data loader for training
    loader_train = gluon.data.DataLoader(
        dataset=data_train,
        num_workers=1,
        batch_sampler=batch_sampler,
        batchify_fn=batchify_fn)

    # data dev. For MNLI, more than one dev set is available
    dev_tsv = task.dataset_dev()
    dev_tsv_list = dev_tsv if isinstance(dev_tsv, list) else [dev_tsv]

github dmlc / gluon-nlp / scripts / language_model / large_word_language_model.py
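
This example prepares data for the large word language model. The tail of its _split_and_sample helper (shown first) loads the data, label and mask arrays with _load and draws candidate samples with sampler; StreamBPTTBatchify then turns the training and test data streams into truncated-BPTT batches, and PrefetchingStream overlaps test-data loading with computation.
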
    else:
        xs, ys, ms = [x], [y], [m]
    xs = _load(xs)
    ys = _load(ys)
    ms = _load(ms)
    ss = [sampler(y) for y in ys]
    ss = _load(ss)
    return xs, ys, ms, ss

train_batch_size = args.batch_size * len(context)
train_batchify = nlp.data.batchify.StreamBPTTBatchify(vocab, args.bptt, train_batch_size)
train_data = train_batchify(train_data_stream)
train_data = train_data.transform(_split_and_sample)

test_batch_size = args.batch_size
test_batchify = nlp.data.batchify.StreamBPTTBatchify(vocab, args.bptt, test_batch_size)
test_data = test_batchify(test_data_stream)
test_data = nlp.data.PrefetchingStream(test_data)

###############################################################################
# Build the model
###############################################################################

eval_model = nlp.model.language_model.BigRNN(ntokens, args.emsize, args.nhid,
                                             args.nlayers, args.nproj,
                                             embed_dropout=args.dropout,
                                             encode_dropout=args.dropout)
model = nlp.model.language_model.train.BigRNN(ntokens, args.emsize, args.nhid,
                                              args.nlayers, args.nproj, args.k,
                                              embed_dropout=args.dropout,
                                              encode_dropout=args.dropout)
loss = gluon.loss.SoftmaxCrossEntropyLoss()
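
StreamBPTTBatchify above produces truncated-BPTT batches lazily from a DataStream. For a corpus that fits in memory, CorpusBPTTBatchify follows the same pattern; a minimal sketch, assuming gluonnlp 0.x and its built-in WikiText2 dataset (downloaded on first use; the bptt and batch_size values are arbitrary):

import gluonnlp as nlp

bptt, batch_size = 35, 20
train_dataset = nlp.data.WikiText2(segment='train')
vocab = nlp.Vocab(nlp.data.Counter(train_dataset))

# Concatenate the corpus, reshape it into batch_size streams, and cut it into
# (data, target) pairs where target is data shifted by one token.
bptt_batchify = nlp.data.batchify.CorpusBPTTBatchify(vocab, bptt, batch_size,
                                                     last_batch='discard')
train_data = bptt_batchify(train_dataset)
for data, target in train_data:
    pass  # data, target: shape (bptt, batch_size)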

github dmlc / gluon-nlp / scripts / language_model / run_squad.py
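
For XLNet SQuAD fine-tuning, the batchify function is a Tuple of per-field collators: the example id, valid length, answer positions and is_impossible flag are stacked, while input_ids, segment_ids and p_mask are padded with field-specific values (the vocabulary's padding id, 3 and 1), optionally rounding padded lengths up to a multiple of --round_to.
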
get_model_params = {
    'name': args.model,
    'dataset_name': args.dataset,
    'pretrained': get_pretrained,
    'ctx': ctx,
    'use_decoder': False,
    'dropout': args.dropout,
    'attention_dropout': args.attention_dropout
}

# model, vocabulary and tokenizer
xlnet_base, vocab, tokenizer = model.get_model(**get_model_params)

batchify_fn = nlp.data.batchify.Tuple(
    nlp.data.batchify.Stack('int32'),  # example_id
    nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], dtype='int32',
                          round_to=args.round_to),  # input_ids
    nlp.data.batchify.Pad(axis=0, pad_val=3, dtype='int32', round_to=args.round_to),  # segment_ids
    nlp.data.batchify.Stack('float32'),  # valid_length
    nlp.data.batchify.Pad(axis=0, pad_val=1, round_to=args.round_to),  # p_mask
    nlp.data.batchify.Stack('float32'),  # start_position
    nlp.data.batchify.Stack('float32'),  # end_position
    nlp.data.batchify.Stack('float32'))  # is_impossible

if pretrained_xlnet_parameters:
    # only load XLnetModel parameters
    nlp.utils.load_parameters(xlnet_base, pretrained_xlnet_parameters, ctx=ctx, ignore_extra=True,
                              cast_dtype=True)

units = xlnet_base._net._units
net = XLNetForQA(xlnet_base=xlnet_base, start_top_n=args.start_top_n, end_top_n=args.end_top_n,

github dmlc / gluon-nlp / scripts / machine_translation / dataprocessor.py
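
This machine-translation pipeline selects a bucketing scheme, builds the batchify function for training (padded source and target sequences plus their stacked lengths) or evaluation (the same fields plus one extra stacked field), and passes the sequence lengths to a FixedBucketSampler that drives a ShardedDataLoader.
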
    elif args.bucket_scheme == 'exp':
        bucket_scheme = nlp.data.ExpWidthBucket(bucket_len_step=1.2)
    else:
        raise NotImplementedError

    data_lengths = get_data_lengths(data_set)

    if dataset_type == 'train':
        train_batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0),
                                      btf.Stack(dtype='float32'), btf.Stack(dtype='float32'))

    else:
        data_lengths = list(map(lambda x: x[-1], data_lengths))
        test_batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0),
                                     btf.Stack(dtype='float32'), btf.Stack(dtype='float32'),
                                     btf.Stack())

    batch_sampler = nlp.data.FixedBucketSampler(lengths=data_lengths,
                                                batch_size=(args.batch_size \
                                                            if dataset_type == 'train' \
                                                            else args.test_batch_size),
                                                num_buckets=args.num_buckets,
                                                ratio=args.bucket_ratio,
                                                shuffle=(dataset_type == 'train'),
                                                use_average_length=use_average_length,
                                                num_shards=num_shards,
                                                bucket_scheme=bucket_scheme)

    if dataset_type == 'train':
        logging.info('Train Batch Sampler:\n%s', batch_sampler.stats())
        data_loader = nlp.data.ShardedDataLoader(data_set,
                                                 batch_sampler=batch_sampler,

github dmlc / gluon-nlp / scripts / question_answering / run_squad.py
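
This SQuAD example from the newer gluon-nlp collates namedtuple features: NamedTuple pairs each ChunkFeature field with its own collator, so qas_id is kept as a plain Python list, the variable-length fields are padded, and the scalar fields are stacked.
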
    # TODO(sxjscience) Consider to combine the NamedTuple and batchify functionality.
    ChunkFeature = collections.namedtuple('ChunkFeature',
                                          ['qas_id',
                                           'data',
                                           'valid_length',
                                           'segment_ids',
                                           'masks',
                                           'is_impossible',
                                           'gt_start',
                                           'gt_end',
                                           'context_offset',
                                           'chunk_start',
                                           'chunk_length'])
    BatchifyFunction = bf.NamedTuple(ChunkFeature,
                                     {'qas_id': bf.List(),
                                      'data': bf.Pad(),
                                      'valid_length': bf.Stack(),
                                      'segment_ids': bf.Pad(),
                                      'masks': bf.Pad(val=1),
                                      'is_impossible': bf.Stack(),
                                      'gt_start': bf.Stack(),
                                      'gt_end': bf.Stack(),
                                      'context_offset': bf.Stack(),
                                      'chunk_start': bf.Stack(),
                                      'chunk_length': bf.Stack()})

    def __init__(self, tokenizer, doc_stride, max_seq_length, max_query_length):
        """

        Parameters
        ----------
        tokenizer
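
The NamedTuple collator above keeps the ChunkFeature field names on the batched output. A minimal sketch of the same pattern on made-up data, assuming the same gluon-nlp version as the snippet (where Pad takes val= rather than pad_val=):

import collections
import gluonnlp.data.batchify as bf

# A toy record with one variable-length field and one scalar field.
Record = collections.namedtuple('Record', ['tokens', 'label'])
batchify_fn = bf.NamedTuple(Record, {'tokens': bf.Pad(val=0),
                                     'label': bf.Stack()})

samples = [Record(tokens=[1, 2, 3], label=0),
           Record(tokens=[4, 5], label=1)]
batch = batchify_fn(samples)
# batch.tokens has shape (2, 3) with the shorter row padded with 0;
# batch.label has shape (2,).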

github dmlc / gluon-nlp / scripts / machine_translation / dataprocessor.py
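
A second excerpt from the same machine-translation dataprocessor.py, this time starting at the top of the data-loader builder, with the assertion on dataset_type and the full bucket-scheme selection that precede the code shown in the earlier example.
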
"""Create data loaders for training/validation/test."""
    assert dataset_type in ['train', 'val', 'test']

    if args.bucket_scheme == 'constant':
        bucket_scheme = nlp.data.ConstWidthBucket()
    elif args.bucket_scheme == 'linear':
        bucket_scheme = nlp.data.LinearWidthBucket()
    elif args.bucket_scheme == 'exp':
        bucket_scheme = nlp.data.ExpWidthBucket(bucket_len_step=1.2)
    else:
        raise NotImplementedError

    data_lengths = get_data_lengths(data_set)

    if dataset_type == 'train':
        train_batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0),
                                      btf.Stack(dtype='float32'), btf.Stack(dtype='float32'))

    else:
        data_lengths = list(map(lambda x: x[-1], data_lengths))
        test_batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0),
                                     btf.Stack(dtype='float32'), btf.Stack(dtype='float32'),
                                     btf.Stack())

    batch_sampler = nlp.data.FixedBucketSampler(lengths=data_lengths,
                                                batch_size=(args.batch_size \
                                                            if dataset_type == 'train' \
                                                            else args.test_batch_size),
                                                num_buckets=args.num_buckets,
                                                ratio=args.bucket_ratio,
                                                shuffle=(dataset_type == 'train'),
                                                use_average_length=use_average_length,

github dmlc / gluon-nlp / scripts / bert / finetune_classifier.py
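
The BERT fine-tuning script in gluon-nlp itself follows the same pattern as the first example: token ids are padded with the vocabulary's padding id, segment ids with 0, lengths and labels are stacked, padded lengths are rounded up to a multiple of --round_to, and the bucketed DataLoader uses four workers.
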
    trans = partial(convert_examples_to_features, tokenizer=tokenizer,
                    truncate_length=truncate_length,
                    cls_token=vocab.cls_token if not use_roberta else vocab.bos_token,
                    sep_token=vocab.sep_token if not use_roberta else vocab.eos_token,
                    class_labels=task.class_labels, label_alias=task.label_alias, vocab=vocab)

    # data train
    # task.dataset_train returns (segment_name, dataset)
    train_tsv = task.dataset_train()[1]
    data_train = mx.gluon.data.SimpleDataset(list(map(trans, train_tsv)))
    data_train_len = data_train.transform(lambda _, segment_ids, valid_length, label: valid_length,
                                          lazy=False)
    # bucket sampler for training
    pad_val = vocabulary[vocabulary.padding_token]
    batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Pad(axis=0, pad_val=pad_val, round_to=args.round_to),  # input
        nlp.data.batchify.Pad(axis=0, pad_val=0, round_to=args.round_to),  # segment
        nlp.data.batchify.Stack(),  # length
        nlp.data.batchify.Stack(label_dtype))  # label
    batch_sampler = nlp.data.sampler.FixedBucketSampler(data_train_len, batch_size=batch_size,
                                                        num_buckets=10, ratio=0, shuffle=True)
    # data loader for training
    loader_train = gluon.data.DataLoader(dataset=data_train, num_workers=4,
                                         batch_sampler=batch_sampler, batchify_fn=batchify_fn)

    # data dev. For MNLI, more than one dev set is available
    dev_tsv = task.dataset_dev()
    dev_tsv_list = dev_tsv if isinstance(dev_tsv, list) else [dev_tsv]
    loader_dev_list = []
    for segment, data in dev_tsv_list:
        data_dev = mx.gluon.data.SimpleDataset(list(map(trans, data)))
        loader_dev = mx.gluon.data.DataLoader(data_dev, batch_size=dev_batch_size, num_workers=4,