How to use the gluonnlp.data.batchify.Stack function in gluonnlp

To help you get started, we’ve selected a few gluonnlp examples, based on popular ways it is used in public projects.
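
Before looking at the project code, here is a minimal, standalone sketch of what Stack itself does (the values are invented for illustration): it takes a list of samples that already share a common shape and stacks them into a single batched array, optionally casting to an explicit dtype.

import gluonnlp as nlp

# Stack collates samples that already have a common shape, e.g. labels or lengths.
stack = nlp.data.batchify.Stack()
batch = stack([[1, 2, 3, 4], [5, 6, 7, 8]])   # two samples -> array of shape (2, 4)

# An explicit dtype can be requested, as several examples below do for float targets.
lengths = nlp.data.batchify.Stack(dtype='float32')([3, 4])   # shape (2,), float32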


github dmlc/gluon-nlp/scripts/language_model/run_glue.py
    dev_tsv = _task.dataset_dev()
    dev_tsv_list = dev_tsv if isinstance(dev_tsv, list) else [dev_tsv]
    loader_dev_list = []
    for segment, data in dev_tsv_list:
        data_dev = mx.gluon.data.SimpleDataset(list(map(trans, data)))
        loader_dev = mx.gluon.data.DataLoader(data_dev,
                                              batch_size=dev_batch_size,
                                              num_workers=4,
                                              shuffle=False,
                                              batchify_fn=batchify_fn)
        loader_dev_list.append((segment, loader_dev))

    # batchify for data test
    test_batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Pad(axis=0, pad_val=pad_val, round_to=args.round_to),
        nlp.data.batchify.Stack(),
        nlp.data.batchify.Pad(axis=0, pad_val=0, round_to=args.round_to))

    # transform for data test
    test_trans = partial(convert_examples_to_features,
                         tokenizer=_tokenizer,
                         truncate_length=max_len,
                         cls_token=_vocab.cls_token,
                         sep_token=_vocab.sep_token,
                         class_labels=None,
                         is_test=True,
                         vocab=_vocab)

    # data test. For MNLI, more than one test set is available
    test_tsv = _task.dataset_test()
    test_tsv_list = test_tsv if isinstance(test_tsv, list) else [test_tsv]
    loader_test_list = []
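
In this excerpt the lone Stack() presumably collates a scalar per-example field (the valid length produced by the transform) that sits between the padded token ids and segment ids. A rough, self-contained sketch of what such a Tuple produces; the sample values below are invented, not taken from the script:

import gluonnlp as nlp

# Hypothetical samples of (token_ids, valid_length, segment_ids)
samples = [([5, 6, 7], 3, [0, 0, 0]),
           ([5, 6, 7, 8, 9], 5, [0, 0, 1, 1, 1])]
batchify_fn = nlp.data.batchify.Tuple(
    nlp.data.batchify.Pad(axis=0, pad_val=0),   # pad token ids to the longest sample
    nlp.data.batchify.Stack(),                  # stack the scalar valid lengths
    nlp.data.batchify.Pad(axis=0, pad_val=0))   # pad segment ids the same way
token_ids, valid_lengths, segment_ids = batchify_fn(samples)
# token_ids: (2, 5) padded matrix, valid_lengths: (2,), segment_ids: (2, 5)
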
github dmlc/gluon-nlp/scripts/intent_cls_slot_labeling/finetune_icsl.py
    print_sample(test_data, 1)
    print('-' * 80)

    idsl_transform = IDSLSubwordTransform(subword_vocab=bert_vocab,
                                          subword_tokenizer=tokenizer,
                                          slot_vocab=slot_vocab,
                                          cased=args.cased)
    train_data_bert = train_data.transform(idsl_transform, lazy=False)
    dev_data_bert = dev_data.transform(idsl_transform, lazy=False)
    test_data_bert = test_data.transform(idsl_transform, lazy=False)
    # Construct the DataLoader
    batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(pad_val=0),  # Subword ID
                                          nlp.data.batchify.Pad(pad_val=0),  # Subword Mask
                                          nlp.data.batchify.Pad(pad_val=0),  # Beginning of subword
                                          nlp.data.batchify.Pad(pad_val=0),  # Tag IDs
                                          nlp.data.batchify.Stack(),  # Intent Label
                                          nlp.data.batchify.Stack())  # Valid Length
    train_batch_sampler = nlp.data.sampler.SortedBucketSampler(
        [len(ele) for ele in train_data_bert],
        batch_size=args.batch_size,
        mult=20,
        shuffle=True)
    train_loader = gluon.data.DataLoader(dataset=train_data_bert,
                                         num_workers=4,
                                         batch_sampler=train_batch_sampler,
                                         batchify_fn=batchify_fn)
    dev_loader = gluon.data.DataLoader(dataset=dev_data_bert,
                                       num_workers=4,
                                       batch_size=args.batch_size,
                                       batchify_fn=batchify_fn,
                                       shuffle=False)
    test_loader = gluon.data.DataLoader(dataset=test_data_bert,
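
In the intent/slot example, the two Stack() entries collate the per-utterance scalars (intent label and valid length) while the four Pad entries collate the variable-length token-level fields. The order of the Tuple entries is also the order in which the arrays come out of each mini-batch; a sketch of unpacking one batch, with variable names simply following the comments above:

# Sketch: each batch from train_loader unpacks in the same order as the Tuple entries
for subword_ids, subword_mask, subword_begin, tag_ids, intent_label, valid_length in train_loader:
    # subword_ids, subword_mask, subword_begin, tag_ids: padded (batch_size, max_seq_len) arrays
    # intent_label, valid_length: (batch_size,) arrays produced by Stack()
    pass
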
github eric-haibin-lin/AMLC19-GluonNLP/05_deployment/bert/finetune_squad.py
    use_pooler=False,
    use_decoder=False,
    use_classifier=False)

if args.sentencepiece:
    tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab, lower=lower)
else:
    tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=lower)

batchify_fn = nlp.data.batchify.Tuple(
    nlp.data.batchify.Stack(),
    nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]),
    nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]),
    nlp.data.batchify.Stack('float32'),
    nlp.data.batchify.Stack('float32'),
    nlp.data.batchify.Stack('float32'))

net = BertForQA(bert=bert)
if model_parameters:
    # load complete BertForQA parameters
    net.load_parameters(model_parameters, ctx=ctx, cast_dtype=True)
elif pretrained_bert_parameters:
    # only load BertModel parameters
    bert.load_parameters(pretrained_bert_parameters, ctx=ctx,
                         ignore_extra=True, cast_dtype=True)
    net.span_classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
elif pretrained:
    # only load BertModel parameters
    net.span_classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
else:
    # no checkpoint is loaded
    net.initialize(init=mx.init.Normal(0.02), ctx=ctx)
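
This batchify function mixes Stack() with Stack('float32'). The positional argument is the dtype of the resulting batch array, which is handy when a downstream loss expects float inputs. A minimal sketch of the difference, with invented values:

import gluonnlp as nlp

targets = [12, 7, 31]                                        # invented per-example scalars
default_batch = nlp.data.batchify.Stack()(targets)           # dtype inferred from the samples
float_batch = nlp.data.batchify.Stack('float32')(targets)    # explicitly cast to float32
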
github dmlc/gluon-nlp/scripts/machine_translation/dataprocessor.py
    assert dataset_type in ['train', 'val', 'test']

    if args.bucket_scheme == 'constant':
        bucket_scheme = nlp.data.ConstWidthBucket()
    elif args.bucket_scheme == 'linear':
        bucket_scheme = nlp.data.LinearWidthBucket()
    elif args.bucket_scheme == 'exp':
        bucket_scheme = nlp.data.ExpWidthBucket(bucket_len_step=1.2)
    else:
        raise NotImplementedError

    data_lengths = get_data_lengths(data_set)

    if dataset_type == 'train':
        train_batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0),
                                      btf.Stack(dtype='float32'), btf.Stack(dtype='float32'))

    else:
        data_lengths = list(map(lambda x: x[-1], data_lengths))
        test_batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0),
                                     btf.Stack(dtype='float32'), btf.Stack(dtype='float32'),
                                     btf.Stack())

    batch_sampler = nlp.data.FixedBucketSampler(lengths=data_lengths,
                                                batch_size=(args.batch_size \
                                                            if dataset_type == 'train' \
                                                            else args.test_batch_size),
                                                num_buckets=args.num_buckets,
                                                ratio=args.bucket_ratio,
                                                shuffle=(dataset_type == 'train'),
                                                use_average_length=use_average_length,
                                                num_shards=num_shards,
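
Here btf is presumably gluonnlp.data.batchify imported under an alias, so btf.Stack and btf.Pad are the same classes used elsewhere on this page. A sketch of that assumed import and of the kind of (source ids, target ids, source length, target length) samples the train batchify function would collate; the values are invented:

import gluonnlp.data.batchify as btf   # assumed alias used by the script

samples = [([2, 14, 3], [2, 9, 8, 3], 3, 4),
           ([2, 5, 6, 7, 3], [2, 4, 3], 5, 3)]
train_batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0),
                              btf.Stack(dtype='float32'), btf.Stack(dtype='float32'))
src, tgt, src_len, tgt_len = train_batchify_fn(samples)
# src: (2, 5) and tgt: (2, 4) padded matrices; src_len, tgt_len: (2,) float32 arrays
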
github awslabs/autogluon/autogluon/task/text_classification/pipeline.py
class_labels=task.class_labels,
                                 label_alias=task.label_alias,
                                 pad=pad, pair=task.is_pair,
                                 has_label=True)

    # data train
    # task.dataset_train returns (segment_name, dataset)
    train_tsv = task.dataset_train()[1]
    data_train = mx.gluon.data.SimpleDataset(pool.map(trans, train_tsv))
    data_train_len = data_train.transform(
        lambda input_id, length, segment_id, label_id: length, lazy=False)
    # bucket sampler for training
    pad_val = vocab[vocab.padding_token]
    batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Pad(axis=0, pad_val=pad_val),  # input
        nlp.data.batchify.Stack(),  # length
        nlp.data.batchify.Pad(axis=0, pad_val=0),  # segment
        nlp.data.batchify.Stack(label_dtype))  # label
    batch_sampler = nlp.data.sampler.FixedBucketSampler(
        data_train_len,
        batch_size=batch_size,
        num_buckets=10,
        ratio=0,
        shuffle=True)
    # data loader for training
    loader_train = gluon.data.DataLoader(
        dataset=data_train,
        num_workers=num_workers,
        batch_sampler=batch_sampler,
        batchify_fn=batchify_fn)

    # data dev. For MNLI, more than one dev set is available
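
The sampler and the batchify function play complementary roles in this pipeline: FixedBucketSampler only decides which sample indices form each mini-batch (grouping similar lengths, via data_train_len, to limit padding), while batchify_fn turns the selected samples into padded and stacked arrays. A small standalone sketch of that division of labour, with made-up data:

import gluonnlp as nlp
from mxnet import gluon

# Made-up variable-length samples of (token_ids, label)
dataset = gluon.data.SimpleDataset([([1, 2], 0), ([3, 4, 5], 1),
                                    ([6, 7, 8, 9], 0), ([1, 3, 5, 7, 9], 1)])
lengths = [len(tokens) for tokens, _ in dataset]

batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(axis=0, pad_val=0),  # token ids
                                      nlp.data.batchify.Stack('int32'))          # labels
batch_sampler = nlp.data.sampler.FixedBucketSampler(lengths, batch_size=2,
                                                    num_buckets=2, shuffle=True)
loader = gluon.data.DataLoader(dataset, batch_sampler=batch_sampler,
                               batchify_fn=batchify_fn)
for token_ids, labels in loader:
    print(token_ids.shape, labels.shape)   # e.g. (2, 3) (2,) for the shorter bucket
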
github dmlc/gluon-nlp/scripts/language_model/run_glue.py
    # data train
    # task.dataset_train returns (segment_name, dataset)
    train_tsv = _task.dataset_train()[1]
    data_train = list(map(trans, train_tsv))
    data_train = mx.gluon.data.SimpleDataset(data_train)
    data_train_len = data_train.transform(
        lambda _, valid_length, segment_ids, label: valid_length, lazy=False)

    # bucket sampler for training
    pad_val = _vocab[_vocab.padding_token]
    batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Pad(axis=0, pad_val=pad_val, round_to=args.round_to),  # input
        nlp.data.batchify.Stack(),  # length
        nlp.data.batchify.Pad(axis=0, pad_val=4, round_to=args.round_to),  # segment
        nlp.data.batchify.Stack(label_dtype))  # label
    batch_sampler = nlp.data.sampler.FixedBucketSampler(data_train_len,
                                                        batch_size=batch_size,
                                                        num_buckets=10,
                                                        ratio=0,
                                                        shuffle=True)
    # data loader for training
    loader_train = gluon.data.DataLoader(dataset=data_train,
                                         num_workers=4,
                                         batch_sampler=batch_sampler,
                                         batchify_fn=batchify_fn)

    # data dev. For MNLI, more than one dev set is available
    dev_tsv = _task.dataset_dev()
    dev_tsv_list = dev_tsv if isinstance(dev_tsv, list) else [dev_tsv]
    loader_dev_list = []
    for segment, data in dev_tsv_list:
github eric-haibin-lin/AMLC19-GluonNLP/05_deployment/bert/finetune_squad.py
    name=model_name,
    dataset_name=dataset_name,
    vocab=vocab,
    pretrained=pretrained,
    ctx=ctx,
    use_pooler=False,
    use_decoder=False,
    use_classifier=False)

if args.sentencepiece:
    tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab, lower=lower)
else:
    tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=lower)

batchify_fn = nlp.data.batchify.Tuple(
    nlp.data.batchify.Stack(),
    nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]),
    nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]),
    nlp.data.batchify.Stack('float32'),
    nlp.data.batchify.Stack('float32'),
    nlp.data.batchify.Stack('float32'))

net = BertForQA(bert=bert)
if model_parameters:
    # load complete BertForQA parameters
    net.load_parameters(model_parameters, ctx=ctx, cast_dtype=True)
elif pretrained_bert_parameters:
    # only load BertModel parameters
    bert.load_parameters(pretrained_bert_parameters, ctx=ctx,
                         ignore_extra=True, cast_dtype=True)
    net.span_classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
elif pretrained:
github kenjewu/Structured-Self-Attentive-Sentence-Embedding/code/prepare_data.py
def get_dataloader(dataset, batch_size, is_train=True):

    # Construct the batchify function for the DataLoader: pad the data, stack the labels and lengths
    batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Pad(axis=0),
        nlp.data.batchify.Stack())

    dataloader = None

    # dataloader for training
    if is_train:
        data_lengths = [len(sample[0]) for sample in dataset]

        # In this example, we use a FixedBucketSampler,
        # which assigns each data sample to a fixed bucket based on its length.
        batch_sampler = nlp.data.sampler.FixedBucketSampler(
            data_lengths,
            batch_size=batch_size,
            num_buckets=10,
            ratio=0.2,
            shuffle=True)
        dataloader = gluon.data.DataLoader(