How to use the gluonnlp.data.batchify.Tuple function in gluonnlp

To help you get started, we’ve selected a few gluonnlp examples based on popular ways it is used in public projects.

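Before working through the examples, here is a minimal, self-contained sketch of the core idea (the toy data is made up for illustration): batchify.Tuple wraps one batchify function per field of a dataset sample, so variable-length fields can be padded while fixed-size fields are stacked.

    import numpy as np
    import gluonnlp as nlp

    # Each sample is a (token_ids, label) pair with variable-length token_ids.
    samples = [(np.array([1, 2, 3]), 0),
               (np.array([4, 5]), 1),
               (np.array([6]), 0)]

    # Tuple applies its i-th batchify function to the i-th field of every sample:
    # Pad() pads token ids to the longest sequence in the batch, Stack() stacks labels.
    batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(pad_val=0),
                                          nlp.data.batchify.Stack())
    token_ids, labels = batchify_fn(samples)
    print(token_ids.shape)  # (3, 3)
    print(labels.shape)     # (3,)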

github dmlc / gluon-nlp / tests / unittest / batchify / test_batchify.py (View on GitHub)
for pad_index in [[0], [1], [2], [0, 1], [1, 2], [0, 1, 2]]:
    shapes = [[[2 for _ in range(ndim)] for _ in range(batch_size)]
              for _ in range(TOTAL_ELE_NUM)]
    for j in pad_index:
        for i in range(batch_size):
            shapes[j][i][axis] = np.random.randint(length_min, length_max)
    random_data_npy = [tuple(np.random.normal(0, 1, shapes[j][i]).astype(dtype)
                             for j in range(TOTAL_ELE_NUM)) for i in range(batch_size)]
    batchify_fn = []
    for j in range(TOTAL_ELE_NUM):
        if j in pad_index:
            batchify_fn.append(batchify.Pad(axis=axis, pad_val=pad_val, ret_length=True,
                                            dtype=_dtype))
        else:
            batchify_fn.append(batchify.Stack(dtype=_dtype))
    batchify_fn = batchify.Tuple(batchify_fn)
    ret_use_npy = batchify_fn(random_data_npy)
    with pytest.warns(UserWarning):
        # Using Pad with NDArrays is discouraged for speed reasons.
        ret_use_mx = batchify_fn([tuple(mx.nd.array(ele[i], dtype=dtype)
                                        for i in range(TOTAL_ELE_NUM))
                                  for ele in random_data_npy])
    for i in range(TOTAL_ELE_NUM):
        if i in pad_index:
            assert ret_use_npy[i][0].dtype == ret_use_mx[i][0].dtype == dtype
            assert ret_use_npy[i][1].dtype == ret_use_mx[i][1].dtype == np.int32
            assert_allclose(ret_use_npy[i][0].asnumpy(),
                            ret_use_mx[i][0].asnumpy())
            assert_allclose(ret_use_npy[i][1].asnumpy(),
                            ret_use_mx[i][1].asnumpy())
            assert ret_use_npy[i][1].shape == (batch_size,)
        else:
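
This test builds one batchify function per tuple field (Pad with ret_length=True for the fields listed in pad_index, Stack for the rest), combines them with batchify.Tuple, and checks that numpy and mx.nd.NDArray inputs produce batches with identical dtypes and values. With ret_length=True, Pad returns a (padded_batch, valid_lengths) pair instead of a single array; a quick sketch of that behavior:

    import numpy as np
    import gluonnlp as nlp

    pad = nlp.data.batchify.Pad(pad_val=0, ret_length=True)
    batch, valid_lengths = pad([np.array([1, 2, 3]), np.array([4, 5])])
    # batch.shape == (2, 3); valid_lengths holds [3, 2]
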
github dmlc / gluon-nlp / scripts / machine_translation / evaluate_transformer.py (View on GitHub)
            args.tgt_corpus,
            sentence_normalizer=tgt_normalizer,
            base_tokenizer=base_tgt_tokenizer,
            bpe_tokenizer=tgt_tokenizer,
            add_bos=True,
            add_eos=True
        )
    else:  # when running inference, populate placeholder target tokens
        all_tgt_token_ids = all_tgt_lines = [[] for i in range(len(all_src_token_ids))]
    test_dataloader = gluon.data.DataLoader(
        list(zip(all_src_token_ids,
                 [len(ele) for ele in all_src_token_ids],
                 all_tgt_token_ids,
                 [len(ele) for ele in all_tgt_token_ids])),
        batch_size=32,
        batchify_fn=Tuple(Pad(), Stack(), Pad(), Stack()),
        shuffle=False)

    ctx = ctx_l[0]
    pred_sentences = []
    start_eval_time = time.time()
    # evaluate
    if not args.inference:
        avg_nll_loss = 0
        ntokens = 0
        for i, (src_token_ids, src_valid_length, tgt_token_ids, tgt_valid_length)\
                in enumerate(test_dataloader):
            src_token_ids = mx.np.array(src_token_ids, ctx=ctx, dtype=np.int32)
            src_valid_length = mx.np.array(src_valid_length, ctx=ctx, dtype=np.int32)
            tgt_token_ids = mx.np.array(tgt_token_ids, ctx=ctx, dtype=np.int32)
            tgt_valid_length = mx.np.array(tgt_valid_length, ctx=ctx, dtype=np.int32)
            tgt_pred = model(src_token_ids, src_valid_length, tgt_token_ids[:, :-1],
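
Here Tuple(Pad(), Stack(), Pad(), Stack()) mirrors the structure of each zipped sample: (source token ids, source length, target token ids, target length). The two Pad functions batch the variable-length token id lists and the two Stack functions batch the scalar lengths, so the DataLoader yields four arrays per batch that the evaluation loop unpacks directly. Note that some gluonnlp versions emit a warning when Pad() is constructed without an explicit pad_val; passing it explicitly, as later examples do, avoids any ambiguity about the fill value.
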
github dmlc / gluon-nlp / scripts / machine_translation / train_transformer.py (View on GitHub)
    base_lr = args.lr
    lr_scheduler = InverseSquareRootScheduler(warmup_steps=args.warmup_steps, base_lr=base_lr,
                                              warmup_init_lr=args.warmup_init_lr)
    trainer = gluon.Trainer(model.collect_params(), 'adam',
                            {'learning_rate': args.lr, 'beta1': 0.9,
                             'beta2': 0.98, 'epsilon': 1e-9, 'lr_scheduler': lr_scheduler})
    # Load Data
    if args.bucket_scheme == 'constant':
        bucket_scheme = ConstWidthBucket()
    elif args.bucket_scheme == 'linear':
        bucket_scheme = LinearWidthBucket()
    elif args.bucket_scheme == 'exp':
        bucket_scheme = ExpWidthBucket(bucket_len_step=1.2)
    else:
        raise NotImplementedError
    batchify_fn = bf.Tuple(bf.Pad(), bf.Pad(), bf.Stack(), bf.Stack(), bf.Stack())
    # TODO(sxjscience) Support auto-bucket-size tuning
    train_batch_sampler = FixedBucketSampler(lengths=[(ele[2], ele[3]) for ele in data_train],
                                             batch_size=args.batch_size,
                                             num_buckets=args.num_buckets,
                                             ratio=args.bucket_ratio,
                                             shuffle=True,
                                             use_average_length=True,
                                             bucket_scheme=bucket_scheme,
                                             seed=args.seed)
    train_data_loader = gluon.data.DataLoader(data_train,
                                              batch_sampler=train_batch_sampler,
                                              batchify_fn=batchify_fn,
                                              num_workers=0)
    logging.info(train_batch_sampler)
    val_data_loader = gluon.data.DataLoader(data_val,
                                            batch_size=args.val_batch_size,
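
The five-element Tuple matches the five fields of each training sample: the two Pad functions batch the variable-length source and target token ids, and the three Stack functions batch the remaining fixed-size fields. FixedBucketSampler complements this by grouping sequences of similar length into the same batch, which keeps the amount of padding Pad has to add small.
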
github dmlc / gluon-nlp / scripts / intent_cls_slot_labeling / finetune_icsl.py (View on GitHub)
    print('   #Intent         = {}'.format(len(intent_vocab)))
    print('   #Slot           = {}'.format(len(slot_vocab)))
    # Display an example
    print('Display a Sample')
    print_sample(test_data, 1)
    print('-' * 80)

    idsl_transform = IDSLSubwordTransform(subword_vocab=bert_vocab,
                                          subword_tokenizer=tokenizer,
                                          slot_vocab=slot_vocab,
                                          cased=args.cased)
    train_data_bert = train_data.transform(idsl_transform, lazy=False)
    dev_data_bert = dev_data.transform(idsl_transform, lazy=False)
    test_data_bert = test_data.transform(idsl_transform, lazy=False)
    # Construct the DataLoader
    batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(pad_val=0),  # Subword ID
                                          nlp.data.batchify.Pad(pad_val=0),  # Subword Mask
                                          nlp.data.batchify.Pad(pad_val=0),  # Beginning of subword
                                          nlp.data.batchify.Pad(pad_val=0),  # Tag IDs
                                          nlp.data.batchify.Stack(),  # Intent Label
                                          nlp.data.batchify.Stack())  # Valid Length
    train_batch_sampler = nlp.data.sampler.SortedBucketSampler(
        [len(ele) for ele in train_data_bert],
        batch_size=args.batch_size,
        mult=20,
        shuffle=True)
    train_loader = gluon.data.DataLoader(dataset=train_data_bert,
                                         num_workers=4,
                                         batch_sampler=train_batch_sampler,
                                         batchify_fn=batchify_fn)
    dev_loader = gluon.data.DataLoader(dataset=dev_data_bert,
                                       num_workers=4,
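
With six fields, the inline comments on each Tuple element (Subword ID, Subword Mask, and so on) are worth keeping: Tuple matches batchify functions to sample fields purely by position, so a misordered entry would silently pad or stack the wrong array.
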
github kenjewu / Structured-Self-Attentive-Sentence-Embedding / code / prepare_data.py (View on GitHub)
def get_dataloader(dataset, batch_size, is_train=True):

    # Construct the batchify function: pad the data and stack the labels
    batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Pad(axis=0),
        nlp.data.batchify.Stack())

    dataloader = None

    # dataloader for training
    if is_train:
        data_lengths = [len(sample[0]) for sample in dataset]

        # In this example, we use a FixedBucketSampler,
        # which assigns each data sample to a fixed bucket based on its length.
        batch_sampler = nlp.data.sampler.FixedBucketSampler(
            data_lengths,
            batch_size=batch_size,
            num_buckets=10,
            ratio=0.2,
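
The two-element Tuple again mirrors the (variable-length data, scalar label) structure of each sample. In FixedBucketSampler, ratio=0.2 scales up the batch size for buckets that hold shorter sequences, which evens out device utilization across batches.
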
github awslabs / autogluon / autogluon / task / text_classification / pipeline.py (View on GitHub)
    trans = BERTDatasetTransform(tokenizer, max_len,
                                 vocab=vocab,
                                 class_labels=task.class_labels,
                                 label_alias=task.label_alias,
                                 pad=pad, pair=task.is_pair,
                                 has_label=True)

    # data train
    # task.dataset_train returns (segment_name, dataset)
    train_tsv = task.dataset_train()[1]
    data_train = mx.gluon.data.SimpleDataset(pool.map(trans, train_tsv))
    data_train_len = data_train.transform(
        lambda input_id, length, segment_id, label_id: length, lazy=False)
    # bucket sampler for training
    pad_val = vocab[vocab.padding_token]
    batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Pad(axis=0, pad_val=pad_val),  # input
        nlp.data.batchify.Stack(),  # length
        nlp.data.batchify.Pad(axis=0, pad_val=0),  # segment
        nlp.data.batchify.Stack(label_dtype))  # label
    batch_sampler = nlp.data.sampler.FixedBucketSampler(
        data_train_len,
        batch_size=batch_size,
        num_buckets=10,
        ratio=0,
        shuffle=True)
    # data loader for training
    loader_train = gluon.data.DataLoader(
        dataset=data_train,
        num_workers=num_workers,
        batch_sampler=batch_sampler,
        batchify_fn=batchify_fn)
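
The same pattern extends to BERT-style inputs: Pad for the token ids and segment ids, Stack for the lengths and labels. Stack also accepts an explicit dtype, as in Stack(label_dtype) above, which matters when the label type differs between tasks (integer class ids versus float regression targets).
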
github dmlc / gluon-nlp / scripts / natural_language_inference / dataset.py (View on GitHub)
def prepare_data_loader(args, dataset, vocab, test=False):
    """
    Read data and build data loader.
    """
    # Preprocess
    dataset = dataset.transform(lambda s1, s2, label: (vocab(s1), vocab(s2), label),
                                lazy=False)

    # Batching
    batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0), btf.Stack(dtype='int32'))
    data_lengths = [max(len(d[0]), len(d[1])) for d in dataset]
    batch_sampler = nlp.data.FixedBucketSampler(lengths=data_lengths,
                                                batch_size=args.batch_size,
                                                shuffle=(not test))
    data_loader = gluon.data.DataLoader(dataset=dataset,
                                        batch_sampler=batch_sampler,
                                        batchify_fn=batchify_fn)
    return data_loader
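
A sketch of consuming the returned loader (the loop variable names are illustrative, not from the original script):

    for s1_ids, s2_ids, labels in data_loader:
        # s1_ids and s2_ids are padded to the longest sequence in the batch;
        # labels is int32 because of Stack(dtype='int32')
        ...
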
github dmlc / gluon-nlp / scripts / language_model / run_squad.py (View on GitHub)
get_pretrained = True

get_model_params = {
    'name': args.model,
    'dataset_name': args.dataset,
    'pretrained': get_pretrained,
    'ctx': ctx,
    'use_decoder': False,
    'dropout': args.dropout,
    'attention_dropout': args.attention_dropout
}

# model, vocabulary and tokenizer
xlnet_base, vocab, tokenizer = model.get_model(**get_model_params)

batchify_fn = nlp.data.batchify.Tuple(
    nlp.data.batchify.Stack('int32'),  # example_id
    nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], dtype='int32',
                          round_to=args.round_to),  # input_ids
    nlp.data.batchify.Pad(axis=0, pad_val=3, dtype='int32', round_to=args.round_to),  # segment_ids
    nlp.data.batchify.Stack('float32'),  # valid_length
    nlp.data.batchify.Pad(axis=0, pad_val=1, round_to=args.round_to),  # p_mask
    nlp.data.batchify.Stack('float32'),  # start_position
    nlp.data.batchify.Stack('float32'),  # end_position
    nlp.data.batchify.Stack('float32'))  # is_impossible

if pretrained_xlnet_parameters:
    # only load the XLNetModel parameters
    nlp.utils.load_parameters(xlnet_base, pretrained_xlnet_parameters, ctx=ctx, ignore_extra=True,
                              cast_dtype=True)

units = xlnet_base._net._units
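
Two details here are easy to miss. First, pad_val is chosen per field: the vocabulary's padding token for input_ids, 3 for XLNet-style segment_ids, and 1 for the p_mask. Second, round_to pads each batch's sequence length up to a multiple of args.round_to, which can improve kernel efficiency on some hardware at the cost of slightly more padding.
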
github dmlc / gluon-nlp / scripts / bert / pretraining_utils.py (View on GitHub)
"""
    num_files = len(nlp.utils.glob(data))
    logging.info('%d files are found.', num_files)
    assert num_files >= num_parts, \
        'The number of text files must be no less than the number of ' \
        'workers/partitions (%d). Only %d files at %s are found.'%(num_parts, num_files, data)
    dataset_params = {'tokenizer': tokenizer, 'max_seq_length': max_seq_length,
                      'short_seq_prob': short_seq_prob, 'masked_lm_prob': masked_lm_prob,
                      'max_predictions_per_seq': max_predictions_per_seq, 'vocab':vocab,
                      'whole_word_mask': whole_word_mask}
    sampler_params = {'batch_size': batch_size, 'shuffle': shuffle,
                      'num_ctxes': num_ctxes, 'num_buckets': num_buckets}
    dataset_fn = prepare_pretrain_text_dataset
    sampler_fn = prepare_pretrain_bucket_sampler
    pad_val = vocab[vocab.padding_token]
    batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Pad(pad_val=pad_val, round_to=8),  # input_id
        nlp.data.batchify.Pad(pad_val=pad_val),  # masked_id
        nlp.data.batchify.Pad(pad_val=0),  # masked_position
        nlp.data.batchify.Pad(pad_val=0),  # masked_weight
        nlp.data.batchify.Stack(),  # next_sentence_label
        nlp.data.batchify.Pad(pad_val=0, round_to=8),  # segment_id
        nlp.data.batchify.Stack())
    split_sampler = nlp.data.SplitSampler(num_files, num_parts=num_parts,
                                          part_index=part_idx, repeat=repeat)
    dataloader = nlp.data.DatasetLoader(data,
                                        file_sampler=split_sampler,
                                        dataset_fn=dataset_fn,
                                        batch_sampler_fn=sampler_fn,
                                        dataset_params=dataset_params,
                                        batch_sampler_params=sampler_params,
                                        batchify_fn=batchify_fn,
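
This seven-element Tuple batches a complete masked-language-model pretraining sample: padded input ids, masked ids, masked positions, masked weights, and segment ids, plus two stacked fields (the next-sentence label and, judging from the surrounding code, the valid length). round_to=8 rounds the padded length of the input and segment ids up to a multiple of 8, a common choice for float16 training.
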
github dmlc / gluon-nlp / scripts / bert / pretraining_utils.py (View on GitHub)
        we pop a cached processed dataset.
    num_max_dataset_cached : int, default is 0
        Maximum number of cached datasets. It is valid only if dataset_cached is True
    """
    num_files = len(nlp.utils.glob(data))
    logging.info('%d files are found.', num_files)
    assert num_files >= num_parts, \
        'The number of text files must be no less than the number of ' \
        'workers/partitions (%d). Only %d files at %s are found.'%(num_parts, num_files, data)
    dataset_params = {'allow_pickle': True}
    sampler_params = {'batch_size': batch_size, 'shuffle': shuffle,
                      'num_ctxes': num_ctxes, 'num_buckets': num_buckets}
    dataset_fn = prepare_pretrain_npz_dataset
    sampler_fn = prepare_pretrain_bucket_sampler
    pad_val = vocab[vocab.padding_token]
    batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Pad(pad_val=pad_val, round_to=8),  # input_id
        nlp.data.batchify.Pad(pad_val=pad_val),  # masked_id
        nlp.data.batchify.Pad(pad_val=0),  # masked_position
        nlp.data.batchify.Pad(pad_val=0),  # masked_weight
        nlp.data.batchify.Stack(),  # next_sentence_label
        nlp.data.batchify.Pad(pad_val=0, round_to=8),  # segment_id
        nlp.data.batchify.Stack())
    split_sampler = nlp.data.SplitSampler(num_files, num_parts=num_parts,
                                          part_index=part_idx, repeat=repeat)
    dataloader = nlp.data.DatasetLoader(data,
                                        file_sampler=split_sampler,
                                        dataset_fn=dataset_fn,
                                        batch_sampler_fn=sampler_fn,
                                        dataset_params=dataset_params,
                                        batch_sampler_params=sampler_params,
                                        batchify_fn=batchify_fn,
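
This second variant differs from the previous one only in its dataset function (pre-tokenized .npz shards instead of raw text); the batchify.Tuple itself is identical. Across all of these examples the rule is the same: pass Tuple exactly one batchify function per sample field, in field order. Tuple checks this at batch time and raises an error rather than silently misaligning fields.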