How to use gluonnlp - 10 common examples

To help you get started, we’ve selected a few gluonnlp examples based on popular ways it is used in public projects.


dmlc/gluon-nlp: tests/unittest/corpora/test_gbw.py (view on GitHub)
def test_gbw():
    batch_size = 80
    seq_len = 35

    stream = nlp.data.GBWStream(segment='test')
    freq = nlp.data.utils.Counter(
        itertools.chain.from_iterable(itertools.chain.from_iterable(stream)))
    assert len(freq) == 21545
    assert sum(c for c in freq.values()) == 159658
    assert freq['English'] == 14
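
The test above assumes the One Billion Word benchmark has already been downloaded. A minimal sketch of the same pattern (count tokens, build a vocabulary) that avoids the large download by using a small in-memory corpus instead of GBWStream; the sentences here are made up for illustration:

import gluonnlp as nlp

# Toy corpus standing in for the GBW stream; each element is a tokenized sentence.
sentences = [['hello', 'world', '.'], ['hello', 'gluonnlp', '.']]
counter = nlp.data.count_tokens(tok for sent in sentences for tok in sent)
vocab = nlp.Vocab(counter)
print(len(vocab), counter['hello'])   # vocabulary size and frequency of 'hello'
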
dmlc/gluon-nlp: tests/unittest/test_vocab_embed.py (view on GitHub)
    assert v9.unknown_token == '<unk>'
    assert v9.reserved_tokens == ['b', 'a']
    assert v9.embedding is None
    assert 'a' in v9

    v10 = nlp.Vocab(counter, max_size=None, min_freq=100, unknown_token='<unk>',
                    padding_token=None, bos_token=None, eos_token=None, reserved_tokens=['b', 'c'])
    assert len(v10) == 3
    assert v10.token_to_idx == {'<unk>': 0, 'b': 1, 'c': 2}
    assert v10.idx_to_token[1] == 'b'
    assert v10.unknown_token == '<unk>'
    assert v10.reserved_tokens == ['b', 'c']
    assert v10.embedding is None
    assert 'a' not in v10

    v11 = nlp.Vocab(counter, max_size=1, min_freq=2, unknown_token='<unk>',
                    padding_token=None, bos_token=None, eos_token=None,
                    reserved_tokens=['<pad>', 'b'])
    assert len(v11) == 4
    assert v11.token_to_idx == {'<unk>': 0, '<pad>': 1, 'b': 2, 'c': 3}
    assert v11.idx_to_token[1] == '<pad>'
    assert v11.unknown_token == '<unk>'
    assert v11.reserved_tokens == ['<pad>', 'b']
    assert v11.embedding is None
    assert 'a' not in v11

    v12 = nlp.Vocab(counter, max_size=None, min_freq=2, unknown_token='b',
                    padding_token=None, bos_token=None, eos_token=None, reserved_tokens=['<pad>'])
    assert len(v12) == 3
    assert v12.token_to_idx == {'b': 0, '<pad>': 1, 'c': 2}
    assert v12.idx_to_token[1] == '<pad>'
    assert v12.unknown_token == 'b'
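
The assertions above start mid-test, so the counter they rely on is not shown. A rough, self-contained sketch of the same constructor options, with hypothetical token counts chosen so that min_freq and reserved_tokens have a visible effect:

import gluonnlp as nlp

# Hypothetical counts: 'c' appears 3 times, 'b' twice, 'a' once.
counter = nlp.data.count_tokens(['c', 'c', 'c', 'b', 'b', 'a'])

# Drop tokens seen fewer than 2 times and reserve an extra special token.
vocab = nlp.Vocab(counter, min_freq=2, unknown_token='<unk>',
                  padding_token=None, bos_token=None, eos_token=None,
                  reserved_tokens=['<pad>'])
print(vocab.token_to_idx)   # roughly {'<unk>': 0, '<pad>': 1, 'c': 2, 'b': 3}
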
dmlc/gluon-nlp: tests/unittest/test_vocab_embed.py (view on GitHub)
    assert v3.embedding.token_to_idx == {'<unk>': 0, 'c': 1, 'b': 2, 'a': 3, 'some_word$': 4}
    assert v3.embedding.idx_to_token == ['<unk>', 'c', 'b', 'a', 'some_word$']
    assert_almost_equal(v3.embedding.idx_to_vec.asnumpy(),
                        np.array([[1.1, 1.2, 1.3, 1.4, 1.5,
                                   0.11, 0.12, 0.13, 0.14, 0.15],
                                  [1.1, 1.2, 1.3, 1.4, 1.5,
                                   0.06, 0.07, 0.08, 0.09, 0.1],
                                  [0.6, 0.7, 0.8, 0.9, 1,
                                   0.11, 0.12, 0.13, 0.14, 0.15],
                                  [0.1, 0.2, 0.3, 0.4, 0.5,
                                   0.01, 0.02, 0.03, 0.04, 0.05],
                                  [1.1, 1.2, 1.3, 1.4, 1.5,
                                   0.11, 0.12, 0.13, 0.14, 0.15]])
                        )

    v4 = nlp.Vocab(counter, max_size=None, min_freq=1, unknown_token='<unk>', padding_token=None,
                   bos_token=None, eos_token=None, reserved_tokens=None)
    v4.set_embedding(my_embed3, my_embed4)
    assert v4.embedding.token_to_idx == {'<unk>': 0, 'c': 1, 'b': 2, 'a': 3, 'some_word$': 4}
    assert v4.embedding.idx_to_token == ['<unk>', 'c', 'b', 'a', 'some_word$']
    assert_almost_equal(v4.embedding.idx_to_vec.asnumpy(),
                        np.array([[1.1, 1.2, 1.3, 1.4, 1.5,
                                   0.11, 0.12, 0.13, 0.14, 0.15],
                                  [1.1, 1.2, 1.3, 1.4, 1.5,
                                   0.06, 0.07, 0.08, 0.09, 0.1],
                                  [0.6, 0.7, 0.8, 0.9, 1,
                                   0.11, 0.12, 0.13, 0.14, 0.15],
                                  [0.1, 0.2, 0.3, 0.4, 0.5,
                                   0.01, 0.02, 0.03, 0.04, 0.05],
                                  [1.1, 1.2, 1.3, 1.4, 1.5,
                                   0.11, 0.12, 0.13, 0.14, 0.15]])
                        )
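
The test above attaches two small file-based embeddings (my_embed3, my_embed4) to a vocabulary; set_embedding concatenates their vectors per token, which is why each row above has 10 columns. A minimal sketch of the same API with a single downloadable embedding (glove.6B.50d is one of the sources gluonnlp can fetch; the download happens on first use):

import gluonnlp as nlp

counter = nlp.data.count_tokens(['hello', 'world', 'hello'])
vocab = nlp.Vocab(counter)

# Attach pretrained 50-dimensional GloVe vectors to the vocabulary.
glove = nlp.embedding.create('glove', source='glove.6B.50d')
vocab.set_embedding(glove)
print(vocab.embedding['hello'].shape)   # (50,)
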
dmlc/gluon-nlp: tests/unittest/test_models.py (view on GitHub)
def test_big_text_models(wikitext2_val_and_counter):
    # use a small vocabulary for testing
    val, val_freq = wikitext2_val_and_counter
    vocab = nlp.Vocab(val_freq)
    text_models = ['big_rnn_lm_2048_512']

    for model_name in text_models:
        eprint('testing forward for %s' % model_name)
        model, _ = nlp.model.get_model(model_name, vocab=vocab)

        print(model)
        model.collect_params().initialize()
        batch_size = 10
        hidden = model.begin_state(batch_size=batch_size, func=mx.nd.zeros)
        output, state = model(mx.nd.arange(330).reshape((33, 10)), hidden)
        output.wait_to_read()
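
big_rnn_lm_2048_512 is a large model; a lighter sketch of the same forward pattern uses one of the smaller pretrained language models from the model zoo (standard_lstm_lm_200 on wikitext-2 here; any listed name should work the same way):

import mxnet as mx
import gluonnlp as nlp

model, vocab = nlp.model.get_model('standard_lstm_lm_200',
                                   dataset_name='wikitext-2', pretrained=True)
hidden = model.begin_state(batch_size=1, func=mx.nd.zeros)
inputs = mx.nd.array([[vocab['the']]])      # shape (seq_len=1, batch_size=1)
output, hidden = model(inputs, hidden)
print(output.shape)                         # (1, 1, len(vocab))
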
dmlc/gluon-nlp: tests/unittest/batchify/test_batchify.py (view on GitHub)
for pad_index in [[0], [1], [2], [0, 1], [1, 2], [0, 1, 2]]:
                                shapes = [[[2 for _ in range(ndim)] for _ in range(batch_size)]
                                          for _ in range(TOTAL_ELE_NUM)]
                                for j in pad_index:
                                    for i in range(batch_size):
                                        shapes[j][i][axis] = np.random.randint(length_min, length_max)
                                random_data_npy = [tuple(np.random.normal(0, 1, shapes[j][i]).astype(dtype)
                                                         for j in range(TOTAL_ELE_NUM)) for i in range(batch_size)]
                                batchify_fn = []
                                for j in range(TOTAL_ELE_NUM):
                                    if j in pad_index:
                                        batchify_fn.append(batchify.Pad(axis=axis, pad_val=pad_val, ret_length=True,
                                                                        dtype=_dtype))
                                    else:
                                        batchify_fn.append(batchify.Stack(dtype=_dtype))
                                batchify_fn = batchify.Tuple(batchify_fn)
                                ret_use_npy = batchify_fn(random_data_npy)
                                with pytest.warns(UserWarning):
                                    # Using Pad with NDArrays is discouraged for speed reasons.
                                    ret_use_mx = batchify_fn([tuple(mx.nd.array(ele[i], dtype=dtype)
                                                                    for i in range(TOTAL_ELE_NUM))
                                                              for ele in random_data_npy])
                                for i in range(TOTAL_ELE_NUM):
                                    if i in pad_index:
                                        assert ret_use_npy[i][0].dtype == ret_use_mx[i][0].dtype == dtype
                                        assert ret_use_npy[i][1].dtype == ret_use_mx[i][1].dtype == np.int32
                                        assert_allclose(ret_use_npy[i][0].asnumpy(),
                                                        ret_use_mx[i][0].asnumpy())
                                        assert_allclose(ret_use_npy[i][1].asnumpy(),
                                                        ret_use_mx[i][1].asnumpy())
                                        assert (ret_use_npy[i][1].shape == (batch_size,))
                                    else:
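
The test above sweeps many axes, dtypes, and padding configurations; the core API it exercises is much smaller. A minimal sketch (toy samples, hypothetical shapes) of padding variable-length sequences while stacking fixed-size labels:

import numpy as np
import gluonnlp.data.batchify as batchify

# Two (sequence, label) samples with different sequence lengths.
samples = [(np.array([1, 2, 3]), 0), (np.array([4, 5]), 1)]

batchify_fn = batchify.Tuple(batchify.Pad(pad_val=0), batchify.Stack())
data, labels = batchify_fn(samples)
print(data.shape, labels.shape)   # (2, 3) (2,)
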
dmlc/gluon-nlp: tests/unittest/test_bertvocab.py (view on GitHub)
def test_bertvocab():
    ctx = mx.cpu()

    bert_base1, vocab1 = nlp.model.get_model('bert_12_768_12',
                                             dataset_name='book_corpus_wiki_en_cased',
                                             pretrained=True, ctx=ctx, use_pooler=True,
                                             use_decoder=False, use_classifier=False)

    bert_base2, vocab2 = nlp.model.get_model('bert_12_768_12',
                                             dataset_name='book_corpus_wiki_en_uncased',
                                             pretrained=True, ctx=ctx, use_pooler=True,
                                             use_decoder=False, use_classifier=False)

    bert_base3, vocab3 = nlp.model.get_model('bert_12_768_12',
                                             dataset_name='wiki_multilingual_cased',
                                             pretrained=True, ctx=ctx, use_pooler=True,
                                             use_decoder=False, use_classifier=False)

    bert_base4, vocab4 = nlp.model.get_model('bert_12_768_12',
                                             dataset_name='wiki_multilingual_uncased',
                                             pretrained=True, ctx=ctx, use_pooler=True,
                                             use_decoder=False, use_classifier=False)

    bert_base5, vocab5 = nlp.model.get_model('bert_12_768_12',
                                             dataset_name='wiki_cn_cased',
                                             pretrained=True, ctx=ctx, use_pooler=True,
                                             use_decoder=False, use_classifier=False)

    bert_base6, vocab6 = nlp.model.get_model('bert_12_768_12',
                                                dataset_name='kobert_news_wiki_ko_cased',
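
The snippet is cut off mid-call for the KoBERT variant, but all six calls follow the same pattern: pick a dataset_name, and the matching vocabulary is returned alongside the model. A short sketch loading one variant and tokenizing a sentence with its vocabulary:

import mxnet as mx
import gluonnlp as nlp

model, vocab = nlp.model.get_model('bert_12_768_12',
                                   dataset_name='book_corpus_wiki_en_cased',
                                   pretrained=True, ctx=mx.cpu(), use_pooler=True,
                                   use_decoder=False, use_classifier=False)

# The cased English model keeps case, so lower=False for its tokenizer.
tokenizer = nlp.data.BERTTokenizer(vocab, lower=False)
print(tokenizer('GluonNLP is a toolkit for NLP.'))
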
dmlc/gluon-nlp: tests/unittest/test_elmo.py (view on GitHub)
def test_get_elmo_models():
    model_names = ['elmo_2x1024_128_2048cnn_1xhighway', 'elmo_2x2048_256_2048cnn_1xhighway',
                   'elmo_2x4096_512_2048cnn_2xhighway', 'elmo_2x4096_512_2048cnn_2xhighway']
    datasets = ['gbw', 'gbw', 'gbw', '5bw']

    for model_name, dataset in zip(model_names, datasets):
        print('testing forward for %s on dataset %s' % (model_name, dataset))
        model, _ = nlp.model.get_model(model_name,
                                       dataset_name=dataset,
                                       pretrained=dataset is not None,
                                       root='tests/data/model/')

        print(model)
        if not dataset:
            model.collect_params().initialize()
        begin_state = model.begin_state(mx.nd.zeros, batch_size=20)
        output, state = model(mx.nd.arange(35000).reshape(20, 35, 50), begin_state)
        del model
        mx.nd.waitall()
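
ELMo models consume character ids rather than word ids: the input in the test has shape (batch_size=20, seq_len=35, max_word_len=50). A smaller sketch of the same forward pass with the lightest pretrained variant, using dummy character ids, so only shape and availability are exercised:

import mxnet as mx
import gluonnlp as nlp

model, _ = nlp.model.get_model('elmo_2x1024_128_2048cnn_1xhighway',
                               dataset_name='gbw', pretrained=True)
begin_state = model.begin_state(mx.nd.zeros, batch_size=2)

char_ids = mx.nd.ones((2, 8, 50))          # (batch, seq_len, max_word_len)
outputs, state = model(char_ids, begin_state)
mx.nd.waitall()
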
dmlc/gluon-nlp: tests/unittest/test_models.py (view on GitHub)
if not has_missing_params:
                model, vocab = nlp.model.get_model(model_name, dataset_name=dataset,
                                                   pretrained=True)
            else:
                with pytest.raises(AssertionError):
                    model, vocab = nlp.model.get_model(model_name, dataset_name=dataset,
                                                       pretrained=True)

                if not disable_missing_parameters:
                    model, vocab = nlp.model.get_model(model_name, dataset_name=dataset,
                                                       pretrained=True,
                                                       pretrained_allow_missing=True)
                elif 'biobert' in dataset:
                    # Biobert specific test case
                    model, vocab = nlp.model.get_model(model_name, dataset_name=dataset,
                                                       pretrained=True,
                                                       pretrained_allow_missing=True,
                                                       use_decoder=False,
                                                       use_classifier=False)
                elif 'clinicalbert' in dataset:
                    # Clinicalbert specific test case
                    model, vocab = nlp.model.get_model(model_name, dataset_name=dataset,
                                                       pretrained=True,
                                                       pretrained_allow_missing=True,
                                                       use_decoder=False)
                else:
                    assert False, "Testcase needs to be adapted."

            assert len(vocab) == vocab_size[dataset]
            for token in special_tokens:
                assert token in vocab, "Token %s not found in the vocab" % token
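
The branching above handles checkpoints that are published without all parameters: BioBERT and ClinicalBERT ship encoder weights only, so loading them requires pretrained_allow_missing=True and disabling the heads whose weights are absent. A hedged sketch for the BioBERT case (the dataset_name below is taken from the gluon-nlp model zoo and may differ across versions):

import gluonnlp as nlp

# Decoder/classifier weights are not distributed with BioBERT, so those heads
# are disabled and missing parameters are explicitly allowed.
model, vocab = nlp.model.get_model('bert_12_768_12',
                                   dataset_name='biobert_v1.1_pubmed_cased',
                                   pretrained=True,
                                   pretrained_allow_missing=True,
                                   use_decoder=False, use_classifier=False)
print(len(vocab))
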
dmlc/gluon-nlp: tests/unittest/test_vocab_embed.py (view on GitHub)
def test_word_embedding_analogy_evaluation_models(analogy_function):
    dataset = nlp.data.GoogleAnalogyTestSet()
    dataset = [d for i, d in enumerate(dataset) if i < 10]

    embedding = nlp.embedding.create('fasttext', source='wiki.simple')
    counter = nlp.data.utils.Counter(embedding.idx_to_token)
    vocab = nlp.vocab.Vocab(counter)
    vocab.set_embedding(embedding)

    dataset_coded = [[vocab[d[0]], vocab[d[1]], vocab[d[2]], vocab[d[3]]]
                     for d in dataset]
    dataset_coded_nd = nd.array(dataset_coded, dtype=np.int64)

    for k in [1, 3]:
        for exclude_question_words in [True, False]:
            evaluator = nlp.embedding.evaluation.WordEmbeddingAnalogy(
                idx_to_vec=vocab.embedding.idx_to_vec,
                analogy_function=analogy_function, k=k,
                exclude_question_words=exclude_question_words)
            evaluator.initialize()
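
The snippet stops right before the evaluator is applied; in the full test the coded questions are split into three index vectors and passed to the evaluator, which returns the indices of the top-k predicted answers. A compact, self-contained sketch of one analogy query (downloads the wiki.simple fastText vectors on first use; the expected answer is only indicative):

import numpy as np
import mxnet as mx
import gluonnlp as nlp

embedding = nlp.embedding.create('fasttext', source='wiki.simple')
vocab = nlp.Vocab(nlp.data.count_tokens(embedding.idx_to_token))
vocab.set_embedding(embedding)

evaluator = nlp.embedding.evaluation.WordEmbeddingAnalogy(
    idx_to_vec=vocab.embedding.idx_to_vec, k=1)
evaluator.initialize()

# "man is to woman as king is to ?"
words = mx.nd.array([[vocab['man'], vocab['woman'], vocab['king']]], dtype=np.int64)
pred = evaluator(words[:, 0], words[:, 1], words[:, 2])
print(vocab.idx_to_token[int(pred[0][0].asscalar())])   # ideally 'queen'
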
dmlc/gluon-nlp: tests/unittest/test_vocab_embed.py (view on GitHub)
def test_token_embedding_from_file_S3_with_custom_unknown_token(unknown_token):
    nlp.embedding.create('glove', source='glove.6B.50d',
                         unknown_token=unknown_token)
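
A brief sketch of what that call gives you: with a custom unknown_token, lookups for out-of-vocabulary words fall back to that token's vector, which is zero-initialized by default (the query word below is deliberately nonsense):

import gluonnlp as nlp

emb = nlp.embedding.create('glove', source='glove.6B.50d', unknown_token='<unk>')
print(emb['hello'].shape)                      # (50,)
print(emb['definitely-not-in-glove'].sum())    # zero vector for OOV words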