How to use the gluonnlp.data.count_tokens function in gluonnlp

To help you get started, we’ve selected a few gluonnlp.data.count_tokens examples, based on popular ways it is used in public projects.

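Before diving into the project snippets, here is a minimal sketch of the pattern they all share: count_tokens turns an iterable of string tokens into a Counter-like object of token frequencies, which is then typically passed to gluonnlp.Vocab. The token list below is made up for illustration.

import gluonnlp as nlp

# count_tokens accepts any iterable of string tokens and returns a
# Counter-like object mapping each token to its frequency.
tokens = ['Life', 'is', 'great', '!', 'life', 'is', 'good', '.']
counter = nlp.data.count_tokens(tokens)
print(counter['is'])              # 2

# Case-folding before counting merges 'Life' and 'life'.
lower_counter = nlp.data.count_tokens(tokens, to_lower=True)
print(lower_counter['life'])      # 2

# The counter is the usual input for building a vocabulary.
vocab = nlp.Vocab(counter)
print(vocab['is'], vocab.idx_to_token[:5])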

github dmlc / gluon-nlp / tests / unittest / test_vocab_embed.py View on Github external
def _test_count_tokens(token_delim, seq_delim):
    source_str = _get_test_str_of_tokens(token_delim, seq_delim)

    tokens = list(simple_tokenize(source_str, token_delim, seq_delim))
    cnt1 = nlp.data.count_tokens(tokens, to_lower=False)
    assert cnt1 == nlp.data.utils.Counter(
        {'is': 2, 'life': 2, '.': 2, 'Life': 1, 'great': 1, '!': 1, 'good': 1, "isn't": 1,
         'bad': 1})

    cnt2 = nlp.data.count_tokens(tokens, to_lower=True)
    assert cnt2 == nlp.data.utils.Counter(
        {'life': 3, 'is': 2, '.': 2, 'great': 1, '!': 1, 'good': 1, "isn't": 1, 'bad': 1}), cnt2

    counter_to_update = nlp.data.utils.Counter({'life': 2})

    cnt3 = nlp.data.count_tokens(tokens, to_lower=False,
                                 counter=counter_to_update.copy())
    assert cnt3 == nlp.data.utils.Counter(
        {'is': 2, 'life': 4, '.': 2, 'Life': 1, 'great': 1, '!': 1, 'good': 1, "isn't": 1,
         'bad': 1})
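The counter argument exercised in this test is useful when tokens arrive in chunks: passing an existing counter makes count_tokens update it instead of starting from scratch. A small sketch with made-up batches:

import gluonnlp as nlp

batches = [['hello', 'world'], ['hello', 'gluon']]   # toy token batches

counter = None
for batch in batches:
    # Reuse the running counter so counts accumulate across batches.
    counter = nlp.data.count_tokens(batch, counter=counter)

print(counter)   # counts: 'hello' -> 2, 'world' -> 1, 'gluon' -> 1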
github aisolab / nlp_implementation / A_Structured_Self-attentive_Sentence_Embedding / preprocessing.py View on Github external
tr_data, val_data = train_test_split(data, test_size=.2)

tst_filepath = proj_dir / 'data/ratings_test.txt'
tst_data = pd.read_csv(tst_filepath, sep='\t').loc[:, ['document', 'label']]
tst_data = tst_data.loc[tst_data['document'].isna().apply(lambda elm: not elm), :]

# extracting morph in sentences
tokenizer = MeCab()
tokenized = tr_data['document'].apply(tokenizer.morphs)

plt.hist(list(map(lambda elm: len(elm), tokenized)))
plt.show()

# making the vocab
counter = nlp.data.count_tokens(itertools.chain.from_iterable(tokenized))
vocab = nlp.Vocab(counter=counter, min_freq=10, bos_token=None, eos_token=None)

# connecting SISG embedding with vocab
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
vocab.set_embedding(ptr_embedding)

# saving vocab
with open('./data/vocab.pkl', mode='wb') as io:
    pickle.dump(vocab, io)

# saving tr_data, val_data, tst_data
tr_data.to_csv('./data/train.txt', index=False, sep='\t')
val_data.to_csv('./data/val.txt', index=False, sep='\t')
tst_data.to_csv('./data/test.txt', index=False, sep='\t')
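The script above follows a common recipe for sentence-level data: flatten the tokenized sentences with itertools.chain.from_iterable, count the tokens, and let min_freq drop rare ones when building the vocab. A stripped-down sketch with toy English sentences (the pretrained-embedding and pickling steps are left out to keep it self-contained):

import itertools
import gluonnlp as nlp

# Toy tokenized sentences standing in for tr_data['document'].apply(tokenizer.morphs)
tokenized = [['the', 'movie', 'was', 'great'],
             ['the', 'movie', 'was', 'boring'],
             ['great', 'soundtrack']]

counter = nlp.data.count_tokens(itertools.chain.from_iterable(tokenized))
# min_freq drops tokens seen fewer than two times; bos/eos are disabled as above.
vocab = nlp.Vocab(counter, min_freq=2, bos_token=None, eos_token=None)

print(vocab.idx_to_token)   # special tokens plus tokens occurring at least twice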
github xwhan / Extremely-Fine-Grained-Entity-Typing / data_utils.py View on Github external
def build_vocab(file_list = ['crowd/dev.json', 'crowd/train_m.json', 'crowd/test.json', 'ontonotes/augmented_train.json', 'ontonotes/g_dev.json', 'ontonotes/g_test.json', 'distant_supervision/headword_train.json', 'distant_supervision/headword_dev.json', 'distant_supervision/el_dev.json', 'distant_supervision/el_train.json']):
  data_path = "data/release/"
  words = []
  for file in file_list:
    file_name = data_path + file
    with open(file_name) as f:
      line_elems = [json.loads(sent.strip()) for sent in f.readlines()]
      mention_seq = [line_elem["mention_span"].split() for line_elem in line_elems]
      left_seq = [line_elem['left_context_token'] for line_elem in line_elems]
      right_seq = [line_elem['right_context_token'] for line_elem in line_elems]
      for _ in mention_seq + right_seq + left_seq:
        words += [tok.lower() for tok in _]
  counter = gluonnlp.data.count_tokens(words)
  vocab = gluonnlp.Vocab(counter)
  with open('data/release/idx_to_token', 'w') as g:
    g.write('\n'.join(vocab.idx_to_token))
  with open('data/release/token_to_idx.json', 'w') as g:
    json.dump(vocab.token_to_idx, g)
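Once the vocab exists, its two mappings handle encoding and decoding directly; the files written above simply persist them. A quick sketch of the lookups with a made-up word list:

import gluonnlp

words = ['the', 'mention', 'the', 'span', 'type']   # toy lowercased words
vocab = gluonnlp.Vocab(gluonnlp.data.count_tokens(words))

print(vocab.token_to_idx['the'])          # index assigned to 'the'
print(vocab.idx_to_token)                 # special tokens first, then corpus tokens
print(vocab.to_indices(['the', 'span']))  # encode a token sequence
print(vocab.to_tokens([4, 5]))            # decode indices back to tokens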
github aisolab / nlp_implementation / Bidirectional_LSTM-CRF_Models_for_Sequence_Tagging / build_dataset_and_vocab.py View on Github external
                data = []
                continue

    except StopIteration:
        print('parsing is done')


label_counter = nlp.data.count_tokens(itertools.chain.from_iterable(map(lambda elm: elm[1], dataset)))
tmp_label_vocab = nlp.Vocab(label_counter, unknown_token=None)
label_vocab = Vocab(tmp_label_vocab.idx_to_token, unknown_token=None)

with open('./data/label_vocab.pkl', mode='wb') as io:
    pickle.dump(label_vocab, io)

tr, val = train_test_split(dataset, test_size=.1, random_state=777)
token_counter = nlp.data.count_tokens(itertools.chain.from_iterable(map(lambda elm: elm[0], tr)))
tmp_token_vocab = nlp.Vocab(token_counter, min_freq=10)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
tmp_token_vocab.set_embedding(ptr_embedding)
token_vocab = Vocab(tmp_token_vocab.idx_to_token)
token_vocab.embedding = tmp_token_vocab.embedding.idx_to_vec.asnumpy()

with open('./data/token_vocab.pkl', mode='wb') as io:
    pickle.dump(token_vocab, io)
with open('./data/train.pkl', mode='wb') as io:
    pickle.dump(tr, io)
with open('./data/validation.pkl', mode='wb') as io:
    pickle.dump(val, io)
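For tag sets like the labels above, the special tokens that make sense for word vocabularies are usually switched off. A minimal sketch, assuming a toy list of NER-style tag sequences:

import itertools
import gluonnlp as nlp

# Toy label sequences standing in for map(lambda elm: elm[1], dataset)
label_seqs = [['B-PER', 'I-PER', 'O'], ['O', 'B-LOC', 'O']]

label_counter = nlp.data.count_tokens(itertools.chain.from_iterable(label_seqs))
# No unknown/bos/eos token for a closed label set; padding is kept for batching.
label_vocab = nlp.Vocab(label_counter, unknown_token=None,
                        bos_token=None, eos_token=None)

print(label_vocab.idx_to_token)   # '<pad>' followed by the tags in frequency order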
github dmlc / gluon-nlp / scripts / language_model / conversion_utils / convert_transformer_xl.py View on Github external
special_tokens['unknown_token'] = '<unk>'

    # Discover special tokens
    if ['<unk>'] == corpus.vocab.special:
        if '<eos>' in sym2idx:  # Only include if special token is actually used
            special_tokens['eos_token'] = '<eos>'
    elif '<S>' in sym2idx:
        # Special case for model trained on Google 1 Billion Word LM dataset
        special_tokens['eos_token'] = '<S>'
    elif corpus.vocab.special:
        raise NotImplementedError('Provided TransformerXL cache.pkl uses an unknown special token. '
                                  'You must extend the `to_gluon_vocab` method to support it.')
    else:
        special_tokens['eos_token'] = None

    counter = nlp.data.count_tokens(sym2idx.keys())
    vocab = nlp.vocab.Vocab(counter, token_to_idx=sym2idx, **special_tokens)
    return vocab
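The conversion above leans on the token_to_idx argument so that the GluonNLP vocab reproduces the index assignment of the original TransformerXL vocabulary instead of the default frequency-based ordering. A small sketch with a made-up mapping:

import gluonnlp as nlp

# Pretend this mapping came from an existing model checkpoint.
sym2idx = {'<eos>': 0, 'hello': 1, 'world': 2}

counter = nlp.data.count_tokens(sym2idx.keys())
vocab = nlp.vocab.Vocab(counter, token_to_idx=sym2idx,
                        unknown_token=None, padding_token=None,
                        bos_token=None, eos_token='<eos>')

# Every token keeps the index it had in the checkpoint.
assert all(vocab[token] == idx for token, idx in sym2idx.items())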
github dmlc / gluon-nlp / scripts / word_embeddings / evaluate_pretrained.py View on Github external
tokens = evaluation.get_similarity_task_tokens(args_)
        vocab = nlp.Vocab(nlp.data.count_tokens(tokens))
        with utils.print_time('set {} embeddings'.format(len(tokens))):
            vocab.set_embedding(token_embedding_)
        evaluation.evaluate_similarity(
            args_, vocab.embedding, ctx, logfile=os.path.join(
                args_.logdir, 'similarity{}.tsv'.format(name)))
    if args_.analogy_datasets:
        with utils.print_time('extend open vocabulary with '
                              'OOV tokens for analogy'):
            tokens = evaluation.get_analogy_task_tokens(args_)
            if token_embedding_.unknown_token is not None:
                tokens.update(token_embedding_.idx_to_token[1:])
            else:
                tokens.update(token_embedding_.idx_to_token)
        vocab = nlp.Vocab(nlp.data.count_tokens(tokens))
        with utils.print_time('set {} embeddings'.format(len(tokens))):
            vocab.set_embedding(token_embedding_)
        evaluation.evaluate_analogy(
            args_, vocab.embedding, ctx, logfile=os.path.join(
                args_.logdir, 'analogy{}.tsv'.format(name)))
github dmlc / gluon-nlp / scripts / natural_language_inference / dataset.py View on Github external
def build_vocab(dataset):
    """
    Build vocab given a dataset.
    """
    counter = nlp.data.count_tokens([w for e in dataset for s in e[:2] for w in s],
                                    to_lower=True)
    vocab = nlp.Vocab(counter)
    return vocab
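Because count_tokens is called with to_lower=True, the vocabulary ends up holding only lowercased forms. A hypothetical call with a toy NLI-style dataset, where each example carries a premise token list, a hypothesis token list, and a label (this reuses the build_vocab function defined above):

dataset = [(['A', 'dog', 'runs'], ['The', 'dog', 'is', 'running'], 'entailment'),
           (['A', 'cat', 'sleeps'], ['The', 'cat', 'runs'], 'contradiction')]

vocab = build_vocab(dataset)
print(vocab.idx_to_token)   # special tokens plus lowercased corpus words
print(vocab['dog'])         # known token
print(vocab['Dog'])         # maps to <unk>: only lowercased forms were counted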
github dmlc / gluon-nlp / scripts / word_embeddings / data.py View on Github external
Returns
    -------
    gluonnlp.data.DataStream
        Each sample is a valid input to
        gluonnlp.data.EmbeddingCenterContextBatchify.
    gluonnlp.Vocab
        Vocabulary of all tokens in Text8 that occur at least min_freq times,
        truncated to at most max_vocab_size tokens.
    idx_to_counts : list of int
        Mapping from token indices to their occurrence-counts in the Text8
        dataset.

    """
    with print_time('count and construct vocabulary'):
        counter = nlp.data.count_tokens(itertools.chain.from_iterable(data))
        vocab = nlp.Vocab(counter, unknown_token=None, padding_token=None,
                          bos_token=None, eos_token=None, min_freq=min_freq,
                          max_size=max_vocab_size)
        idx_to_counts = [counter[w] for w in vocab.idx_to_token]

    def code(sentence):
        return [vocab[token] for token in sentence if token in vocab]

    with print_time('code data'):
        data = data.transform(code, lazy=False)
    data = nlp.data.SimpleDataStream([data])
    return data, vocab, idx_to_counts
github dmlc / gluon-nlp / scripts / question_answering / data_pipeline.py View on Github external
def __call__(self, example):
        """Maps examples into distinct tokens

        Parameters
        ----------
        example : dict
            Example to process with context_tokens and ques_tokens keys

        Returns
        -------
        mapped_values : List[Tuple]
            Result of the mapping process. Each tuple is in (token, count) format.
        """
        para_counter = data.count_tokens(example['context_tokens'] if not self._iterate_over_example
                                         else [c for tkn in example['context_tokens'] for c in tkn])
        ques_counter = data.count_tokens(example['ques_tokens'] if not self._iterate_over_example
                                         else [c for tkn in example['ques_tokens'] for c in tkn])
        counter = para_counter + ques_counter
        return list(counter.items())
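The final line works because count_tokens returns a Counter subclass, so counts from different fields can be merged with + before being flattened into (token, count) pairs. A toy illustration of that step:

import gluonnlp as nlp

para_counter = nlp.data.count_tokens(['the', 'cat', 'sat', 'on', 'the', 'mat'])
ques_counter = nlp.data.count_tokens(['where', 'is', 'the', 'cat'])

# Counter addition sums counts token-wise, e.g. 'the' -> 2 + 1 = 3.
counter = para_counter + ques_counter
print(sorted(counter.items()))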