How to use gluonnlp.data.utils.Counter in gluonnlp

To help you get started, we’ve selected a few gluonnlp.data.utils.Counter examples, based on popular ways it is used in public projects. The snippets below are taken from the dmlc/gluon-nlp test suite and scripts, and they assume import gluonnlp as nlp along with the other imports of their source files.

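Before the project snippets, here is a minimal sketch of the pattern they share: build a Counter from an iterable of tokens, then construct a Vocab from the counts. The token list is made up for illustration; Counter supports the familiar collections.Counter API, and the count_tokens helper shown in the tests can update an existing Counter.

import gluonnlp as nlp

# Made-up tokens for illustration.
tokens = 'life is good . life is great .'.split()

# Count token frequencies; Counter behaves like collections.Counter.
counter = nlp.data.utils.Counter(tokens)
assert counter['life'] == 2

# nlp.data.count_tokens builds the same kind of Counter and can update an
# existing one, as the test_vocab_embed snippets below demonstrate.
updated = nlp.data.count_tokens(tokens, counter=counter.copy())
assert updated['life'] == 4

# Build a vocabulary from the frequency counts, as the embedding examples do.
vocab = nlp.Vocab(updated)
assert 'life' in vocab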

github dmlc / gluon-nlp / tests / unittest / corpora / test_wikitext.py
def test_wikitext2_raw():
    train = nlp.data.WikiText2Raw(
        segment='train', root=os.path.join('tests', 'data', 'wikitext-2'))
    val = nlp.data.WikiText2Raw(
        segment='val', root=os.path.join('tests', 'data', 'wikitext-2'))
    test = nlp.data.WikiText2Raw(
        segment='test', root=os.path.join('tests', 'data', 'wikitext-2'))
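    # The raw WikiText datasets yield byte values, so these Counters record per-byte frequencies.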
    train_freq, val_freq, test_freq = [
        nlp.data.utils.Counter(x) for x in [train, val, test]
    ]
    assert len(train) == 10843541
    assert len(train_freq) == 192
    assert len(val) == 1136862
    assert len(val_freq) == 168
    assert len(test) == 1278983
    assert len(test_freq) == 177
    assert test_freq['a'.encode('utf-8')[0]] == 81512

github dmlc / gluon-nlp / tests / unittest / corpora / test_large_text_compression_benchmark.py
def test_text8():
    data = nlp.data.Text8()
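    # Text8 yields sequences of words; flatten them into one stream and count word frequencies.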
    freq = nlp.data.utils.Counter(itertools.chain.from_iterable(data))
    assert len(freq) == 253854
    assert sum(c for c in freq.values()) == 17005207
    assert freq['english'] == 11868

github dmlc / gluon-nlp / tests / unittest / test_vocab_embed.py
    tokens = list(simple_tokenize(source_str, token_delim, seq_delim))
    cnt1 = nlp.data.count_tokens(tokens, to_lower=False)
    assert cnt1 == nlp.data.utils.Counter(
        {'is': 2, 'life': 2, '.': 2, 'Life': 1, 'great': 1, '!': 1, 'good': 1, "isn't": 1,
         'bad': 1})

    cnt2 = nlp.data.count_tokens(tokens, to_lower=True)
    assert cnt2 == nlp.data.utils.Counter(
        {'life': 3, 'is': 2, '.': 2, 'great': 1, '!': 1, 'good': 1, "isn't": 1, 'bad': 1}), cnt2

    counter_to_update = nlp.data.utils.Counter({'life': 2})

    cnt3 = nlp.data.utils.count_tokens(tokens, to_lower=False,
                                       counter=counter_to_update.copy())
    assert cnt3 == nlp.data.utils.Counter(
        {'is': 2, 'life': 4, '.': 2, 'Life': 1, 'great': 1, '!': 1, 'good': 1, "isn't": 1,
         'bad': 1})

    cnt4 = nlp.data.count_tokens(tokens, to_lower=True,
                                 counter=counter_to_update.copy())
    assert cnt4 == nlp.data.utils.Counter(
        {'life': 5, 'is': 2, '.': 2, 'great': 1, '!': 1, 'good': 1, "isn't": 1, 'bad': 1})

github dmlc / gluon-nlp / tests / unittest / test_vocab_embed.py
def test_word_embedding_analogy_evaluation_models(analogy_function):
    dataset = nlp.data.GoogleAnalogyTestSet()
    dataset = [d for i, d in enumerate(dataset) if i < 10]

    embedding = nlp.embedding.create('fasttext', source='wiki.simple')
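    # Count every token known to the pretrained embedding to build a full-coverage vocabulary.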
    counter = nlp.data.utils.Counter(embedding.idx_to_token)
    vocab = nlp.vocab.Vocab(counter)
    vocab.set_embedding(embedding)

    dataset_coded = [[vocab[d[0]], vocab[d[1]], vocab[d[2]], vocab[d[3]]]
                     for d in dataset]
    dataset_coded_nd = nd.array(dataset_coded, dtype=np.int64)

    for k in [1, 3]:
        for exclude_question_words in [True, False]:
            evaluator = nlp.embedding.evaluation.WordEmbeddingAnalogy(
                idx_to_vec=vocab.embedding.idx_to_vec,
                analogy_function=analogy_function, k=k,
                exclude_question_words=exclude_question_words)
            evaluator.initialize()

            words1 = dataset_coded_nd[:, 0]

github dmlc / gluon-nlp / tests / unittest / corpora / test_wikitext.py
def test_wikitext103_raw():
    train = nlp.data.WikiText103Raw(
        segment='train', root=os.path.join('tests', 'data', 'wikitext-103'))
    val = nlp.data.WikiText103Raw(
        segment='val', root=os.path.join('tests', 'data', 'wikitext-103'))
    test = nlp.data.WikiText103Raw(
        segment='test', root=os.path.join('tests', 'data', 'wikitext-103'))
    train_freq, val_freq, test_freq = [
        nlp.data.utils.Counter(x) for x in [train, val, test]
    ]
    assert len(train) == 535800393
    assert len(train_freq) == 203
    assert len(val) == 1136862
    assert len(val_freq) == 168
    assert len(test) == 1278983
    assert len(test_freq) == 177
    assert test_freq['a'.encode('utf-8')[0]] == 81512

github dmlc / gluon-nlp / tests / unittest / batchify / test_batchify_language_model.py
def test_bptt_batchify_padding_token():
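    # A tiny vocabulary built from a Counter, deliberately created without a padding token.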
    vocab = nlp.Vocab(
        nlp.data.utils.Counter(['a', 'b', 'c']), padding_token=None)
    seq_len = 35
    batch_size = 80

    # Padding token must always be specified for StreamBPTTBatchify
    with pytest.raises(ValueError):
        nlp.data.batchify.StreamBPTTBatchify(
            vocab, seq_len, batch_size, last_batch='discard')

    with pytest.raises(ValueError):
        nlp.data.batchify.StreamBPTTBatchify(
            vocab, seq_len, batch_size, last_batch='keep')

    # Padding token must be specified for last_batch='keep' for CorpusBPTTBatchify
    with pytest.raises(ValueError):
        nlp.data.batchify.CorpusBPTTBatchify(
            vocab, seq_len, batch_size, last_batch='keep')

github dmlc / gluon-nlp / tests / unittest / test_vocab_embed.py
def counter():
    return nlp.data.utils.Counter(
        ['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])

github dmlc / gluon-nlp / scripts / word_embedding_evaluation / word_embedding_evaluation.py
def evaluate_similarity(args, token_embedding, dataset,
                        similarity_function='CosineSimilarity'):
    """Evaluation on similarity task."""
    # Closed vocabulary: only need the words occurring in the dataset
    counter = nlp.data.utils.Counter(w for wpair in dataset for w in wpair[:2])
    vocab = nlp.vocab.Vocab(counter)
    vocab.set_embedding(token_embedding)

    if args.ignore_oov:
        initial_length = len(dataset)
        dataset = [d for d in dataset if d[0] in vocab and d[1] in vocab]
        num_dropped = initial_length - len(dataset)
        if num_dropped:
            logging.warning('Dropped %s pairs from %s as they were OOV.',
                            num_dropped, dataset.__class__.__name__)

    dataset_coded = [[vocab[d[0]], vocab[d[1]], d[2]] for d in dataset]
    words1, words2, scores = zip(*dataset_coded)

    evaluator = nlp.embedding.evaluation.WordEmbeddingSimilarity(
        idx_to_vec=vocab.embedding.idx_to_vec,