How to use the gluonnlp.data module in gluonnlp

To help you get started, we’ve selected a few gluonnlp.data examples based on popular ways it is used in public projects.


github dmlc / gluon-nlp / tests / unittest / corpora / test_gbw.py View on Github external
import itertools

import gluonnlp as nlp


def test_gbw():
    batch_size = 80
    seq_len = 35

    # Stream the test split of the Google Billion Words corpus and count
    # token frequencies across all of its sentences.
    stream = nlp.data.GBWStream(segment='test')
    freq = nlp.data.utils.Counter(
        itertools.chain.from_iterable(itertools.chain.from_iterable(stream)))
    assert len(freq) == 21545
    assert sum(freq.values()) == 159658
    assert freq['English'] == 14
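
The frequency Counter built in this test is the usual starting point for a gluonnlp vocabulary. As a minimal sketch (the toy token list below is made up for illustration), nlp.data.count_tokens and nlp.Vocab cover the common case:

import gluonnlp as nlp

# Count tokens in a tiny corpus and build a vocabulary from the counts.
counter = nlp.data.count_tokens(['hello', 'world', 'hello', 'nice', 'world'])
vocab = nlp.Vocab(counter)
print(vocab['hello'], vocab['world'])  # token -> index lookup
print(vocab.idx_to_token[:6])          # special tokens come first
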
github eric-haibin-lin / AMLC19-GluonNLP / 04_contextual_representation / bert / run_pretraining_hvd.py View on Github external
        # (Excerpt: this block runs inside the branch handling --sentencepiece.)
        if args.dataset_name:
            warnings.warn('Both --dataset_name and --sentencepiece are provided. '
                          'The vocabulary will be loaded based on --sentencepiece')
            dataset_name = None
        vocab = nlp.vocab.BERTVocab.from_sentencepiece(args.sentencepiece)

    model, nsp_loss, mlm_loss, vocab = get_model_loss([ctx], args.model, args.pretrained,
                                                      dataset_name, vocab, args.dtype,
                                                      ckpt_dir=args.ckpt_dir,
                                                      start_step=args.start_step)
    logging.debug('Model created')
    data_eval = args.data_eval

    if args.raw:
        if args.sentencepiece:
            tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab,
                                                 num_best=args.sp_nbest,
                                                 alpha=args.sp_alpha, lower=not args.cased)
        else:
            tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=not args.cased)

        cache_dir = os.path.join(args.ckpt_dir, 'data_eval_cache')
        cache_file = os.path.join(cache_dir, 'part-000.npz')
        nlp.utils.mkdir(cache_dir)

        # generate dev dataset from the raw text if needed
        if not args.eval_use_npz:
            data_eval = cache_file
            if not os.path.isfile(cache_file) and rank == 0:
                generate_dev_set(tokenizer, vocab, cache_file, args)

    logging.debug('Random seed set to %d', random_seed)
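
The tokenizer construction above depends on command-line arguments from the surrounding script. As a stand-alone, hedged sketch of nlp.data.BERTTokenizer in gluonnlp 0.x (the model and dataset names below are just one common choice, not something this script requires):

import gluonnlp as nlp

# Obtain a BERT WordPiece vocabulary; pretrained weights are not needed
# just for tokenization.
_, bert_vocab = nlp.model.get_model('bert_12_768_12',
                                    dataset_name='book_corpus_wiki_en_uncased',
                                    pretrained=False)
tokenizer = nlp.data.BERTTokenizer(vocab=bert_vocab, lower=True)
tokens = tokenizer('gluonnlp makes BERT preprocessing straightforward')
print(tokens)                          # WordPiece tokens
print(bert_vocab.to_indices(tokens))   # corresponding token ids
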
github dmlc / gluon-nlp / scripts / word_embeddings / evaluation.py View on Github external
import gluonnlp as nlp


def add_parameters(parser):
    """Add evaluation-specific parameters to the parser."""
    group = parser.add_argument_group('Evaluation arguments')

    group.add_argument('--eval-batch-size', type=int, default=512)

    # Datasets
    group.add_argument(
        '--similarity-datasets', type=str,
        default=nlp.data.word_embedding_evaluation.word_similarity_datasets,
        nargs='*',
        help='Word similarity datasets to use for intrinsic evaluation.')
    group.add_argument(
        '--similarity-functions', type=str,
        default=nlp.embedding.evaluation.list_evaluation_functions(
            'similarity'), nargs='+',
        help='Word similarity functions to use for intrinsic evaluation.')
    group.add_argument(
        '--analogy-datasets', type=str, default=['GoogleAnalogyTestSet'],
        nargs='*',
        help='Word analogy datasets to use for intrinsic evaluation.')
    group.add_argument(
        '--analogy-functions', type=str,
        default=nlp.embedding.evaluation.list_evaluation_functions('analogy'),
        nargs='+',
        help='Word analogy functions to use for intrinsic evaluation. ')
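
Because add_parameters only registers arguments, a caller typically attaches it to an argparse.ArgumentParser and reads the defaults back; a small sketch (nothing beyond the snippet's own names is assumed):

import argparse

parser = argparse.ArgumentParser(description='Word embedding evaluation')
add_parameters(parser)
args = parser.parse_args([])        # fall back to the defaults defined above
print(args.eval_batch_size)         # 512
print(args.similarity_datasets)     # names from nlp.data.word_embedding_evaluation
print(args.analogy_functions)       # registered analogy functions, e.g. ThreeCosAdd
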
github dmlc / gluon-nlp / scripts / question_answering / data_pipeline.py View on Github external
    def __call__(self, example):
        """Map an example into distinct tokens.

        Parameters
        ----------
        example : dict
            Example to process, with 'context_tokens' and 'ques_tokens' keys.

        Returns
        -------
        mapped_values : List[Tuple]
            Result of the mapping process. Each tuple is in (token, count) format.
        """
        para_counter = data.count_tokens(example['context_tokens'] if not self._iterate_over_example
                                         else [c for tkn in example['context_tokens'] for c in tkn])
        ques_counter = data.count_tokens(example['ques_tokens'] if not self._iterate_over_example
                                         else [c for tkn in example['ques_tokens'] for c in tkn])
        counter = para_counter + ques_counter
        return list(counter.items())
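
To see what this mapper produces, here is a self-contained sketch of the same counting logic on a made-up example dict, using gluonnlp's count_tokens directly:

import gluonnlp as nlp

example = {'context_tokens': ['the', 'cat', 'sat', 'on', 'the', 'mat'],
           'ques_tokens': ['where', 'is', 'the', 'cat']}

para_counter = nlp.data.count_tokens(example['context_tokens'])
ques_counter = nlp.data.count_tokens(example['ques_tokens'])
counter = para_counter + ques_counter
print(list(counter.items()))  # e.g. [('the', 3), ('cat', 2), ('sat', 1), ...]
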
github dmlc / gluon-nlp / docs / examples / machine_translation / nmt / transformer.py View on Github external
import gluonnlp as nlp


def register_vocab(dataset, sha1):
    if dataset not in nlp.data.utils._vocab_sha1:
        nlp.data.utils._vocab_sha1[dataset] = sha1
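
_vocab_sha1 is a private lookup table in gluonnlp.data.utils mapping dataset names to vocabulary checksums, so this helper just adds an entry to it. A purely hypothetical usage sketch (both the dataset name and the sha1 below are placeholders):

# Hypothetical: make gluonnlp aware of a custom vocabulary checksum.
register_vocab('my_custom_dataset',
               '0123456789abcdef0123456789abcdef01234567')  # placeholder sha1
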
github eric-haibin-lin / AMLC19-GluonNLP / 04_contextual_representation / bert / pretraining_utils.py View on Github external
        if isinstance(dataset, nlp.data.NumpyDataset):
            lengths = dataset.get_field('valid_lengths')
        elif isinstance(dataset, BERTPretrainDataset):
            lengths = dataset.transform(lambda input_ids, segment_ids, masked_lm_positions, \
                                               masked_lm_ids, masked_lm_weights, \
                                               next_sentence_labels, valid_lengths: \
                                               valid_lengths, lazy=False)
        else:
            raise ValueError('unexpected dataset type: %s' % str(dataset))

        # A batch includes: input_id, masked_id, masked_position, masked_weight,
        #                   next_sentence_label, segment_id, valid_length
        batchify_fn = Tuple(Pad(), Pad(), Pad(), Pad(), Stack(), Pad(), Stack())
        if self._use_avg_len:
            # sharded data loader
            sampler = nlp.data.FixedBucketSampler(lengths=lengths,
                                                  # batch_size per shard
                                                  batch_size=self._batch_size,
                                                  num_buckets=self._num_buckets,
                                                  shuffle=self._shuffle,
                                                  use_average_length=True,
                                                  num_shards=self._num_ctxes)
            dataloader = nlp.data.ShardedDataLoader(dataset,
                                                    batch_sampler=sampler,
                                                    batchify_fn=batchify_fn,
                                                    num_workers=self._num_ctxes)
        else:
            sampler = nlp.data.FixedBucketSampler(lengths,
                                                  batch_size=self._batch_size * self._num_ctxes,
                                                  num_buckets=self._num_buckets,
                                                  ratio=0,
                                                  shuffle=self._shuffle)
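
nlp.data.FixedBucketSampler groups sequences of similar length into buckets so that batches need little padding. A minimal sketch with toy data (the lengths, bucket count, and batch size below are arbitrary):

import random

import mxnet as mx
import gluonnlp as nlp

# Toy variable-length integer sequences and their lengths.
toy_data = [[random.randint(0, 99) for _ in range(random.randint(5, 50))]
            for _ in range(1000)]
lengths = [len(seq) for seq in toy_data]

sampler = nlp.data.FixedBucketSampler(lengths=lengths, batch_size=32,
                                      num_buckets=10, shuffle=True)
print(sampler.stats())  # bucket sizes and batch statistics

# Pad sequences within each batch and iterate.
loader = mx.gluon.data.DataLoader(mx.gluon.data.SimpleDataset(toy_data),
                                  batch_sampler=sampler,
                                  batchify_fn=nlp.data.batchify.Pad())
for batch in loader:
    print(batch.shape)  # (batch_size, longest_sequence_in_batch)
    break
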
github dmlc / gluon-nlp / scripts / word_embedding_evaluation / word_embedding_evaluation.py View on Github external
# (Excerpt: the lines below close a preceding group.add_argument call, likely --embedding-name.)
', '.join(
                                     nlp.embedding.list_sources().keys()))))
    group.add_argument('--embedding-source', type=str, default='wiki.simple',
                       help=('Source from which to initialize the embedding. '
                             'Pass --list-embedding-sources to get a list of '
                             'valid sources for a given --embedding-name.'))
    group.add_argument('--list-embedding-sources', action='store_true')

    # Evaluation arguments
    group = parser.add_argument_group('Evaluation arguments')
    group.add_argument('--ignore-oov', action='store_true',
                       help='Drop OOV words from evaluation datasets.')
    ## Datasets
    group.add_argument(
        '--similarity-datasets', type=str,
        default=nlp.data.word_embedding_evaluation.word_similarity_datasets,
        nargs='*',
        help='Word similarity datasets to use for intrinsic evaluation.')
    group.add_argument(
        '--similarity-functions', type=str,
        default=nlp.embedding.evaluation.list_evaluation_functions(
            'similarity'), nargs='+',
        help='Word similarity functions to use for intrinsic evaluation.')
    group.add_argument(
        '--analogy-datasets', type=str,
        default=nlp.data.word_embedding_evaluation.word_analogy_datasets,
        nargs='*',
        help='Word analogy datasets to use for intrinsic evaluation.')
    group.add_argument(
        '--analogy-functions', type=str,
        default=nlp.embedding.evaluation.list_evaluation_functions('analogy'),
        nargs='+',
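
The defaults above come straight from gluonnlp's registries, which can be inspected directly; a quick sketch using only names already referenced in the snippet:

import gluonnlp as nlp

# Dataset names accepted by --similarity-datasets / --analogy-datasets.
print(nlp.data.word_embedding_evaluation.word_similarity_datasets)
print(nlp.data.word_embedding_evaluation.word_analogy_datasets)

# Registered intrinsic evaluation functions for each task.
print(nlp.embedding.evaluation.list_evaluation_functions('similarity'))
print(nlp.embedding.evaluation.list_evaluation_functions('analogy'))
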
github dmlc / gluon-nlp / scripts / language_model / transformer_xl.py View on Github external
    from transformer.model import get_model
    with open(args.vocab_file, 'r') as f:
        vocab = nlp.Vocab.from_json(f.read())

    ctx = mx.gpu(args.gpu) if args.gpu is not None else mx.cpu()
    model, vocab = get_model('transformerxl', vocab=vocab, dataset_name=args.dataset,
                             clamp_len=args.clamp_len)
    model.initialize(ctx=ctx)
    model.load_parameters(args.parameter_file, ignore_extra=False)
    model.hybridize()
    print(model)

    # Data
    if args.dataset == 'wt103':
        val_dataset, test_dataset = [
            nlp.data.WikiText103(segment=segment, skip_empty=False, bos=vocab.bos_token,
                                 eos=vocab.eos_token) for segment in ['val', 'test']
        ]
    elif args.dataset == 'lm1b':
        # bos=vocab.eos_token is not a typo: tf uses ['<s>'] + symbols + ['<s>']
        test_datasets = list(
            nlp.data.GBWStream(segment='test', skip_empty=True, bos=vocab.eos_token,
                               eos=vocab.eos_token))
        assert len(test_datasets) == 1
        test_dataset = mx.gluon.data.SimpleDataset(
            list(itertools.chain.from_iterable(test_datasets[0])))
        val_dataset = None
    elif args.dataset == 'text8':
        dataset = nlp.data.Text8(max_sentence_length=None)
        chars = list(itertools.chain.from_iterable(list(w) + ['_'] for w in dataset[0]))
        num_test_chars = 5000000
        val_dataset = mx.gluon.data.SimpleDataset(chars[-2 * num_test_chars:-num_test_chars])
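
The WikiText-103 and GBW loaders above download large corpora; for a lighter-weight, hedged sketch of the same pattern, the much smaller WikiText-2 plus gluonnlp's BPTT batchify helper works the same way (the sequence length and batch size below are illustrative):

import gluonnlp as nlp

# Load the validation split of WikiText-2 and build a vocabulary from it.
val_data = nlp.data.WikiText2(segment='val', skip_empty=False,
                              bos=None, eos='<eos>')
vocab = nlp.Vocab(nlp.data.Counter(val_data))

# Turn the flat token stream into (seq_len, batch_size) BPTT batches.
bptt_batchify = nlp.data.batchify.CorpusBPTTBatchify(vocab, seq_len=35,
                                                     batch_size=20,
                                                     last_batch='discard')
val_batches = bptt_batchify(val_data)
data_batch, target_batch = val_batches[0]
print(data_batch.shape, target_batch.shape)  # (35, 20) each
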
github dmlc / gluon-nlp / scripts / text_generation / sequence_sampling.py View on Github external
import gluonnlp as nlp


def get_tokenizer(lm_model):
    if lm_model.startswith('gpt2'):
        return nlp.data.GPT2BPETokenizer(), nlp.data.GPT2BPEDetokenizer()
    else:
        return nlp.data.SacreMosesTokenizer(), nlp.data.SacreMosesDetokenizer(return_str=True)
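
A small round-trip sketch for the non-GPT-2 branch (the model name below is only an example of a name that does not start with 'gpt2'; the SacreMoses tokenizers require the sacremoses package):

tokenizer, detokenizer = get_tokenizer('awd_lstm_lm_1150')
tokens = tokenizer("Gluon-NLP's tokenizers are easy to use.")
print(tokens)               # Moses-style word tokens
print(detokenizer(tokens))  # detokenized back into a single string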