How to use the gluonnlp.Vocab class in gluonnlp

To help you get started, we’ve selected a few gluonnlp.Vocab examples based on popular ways it is used in public projects.
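
All of the examples below follow the same basic pattern: count token frequencies with gluonnlp.data.count_tokens, build a Vocab from the counter, and optionally attach pretrained vectors with set_embedding. The short sketch below illustrates that pattern on a toy corpus; the token list and the GloVe source name are placeholders chosen purely for illustration, not drawn from the projects below.

import gluonnlp as nlp

# Toy corpus, used only for illustration.
tokens = ['hello', 'world', 'hello', 'gluon']

# Build a vocabulary from token counts. By default, Vocab reserves the special
# tokens '<unk>', '<pad>', '<bos>' and '<eos>' at the lowest indices.
counter = nlp.data.count_tokens(tokens)
vocab = nlp.Vocab(counter)

print(vocab['hello'])            # index assigned to 'hello'
print(vocab.idx_to_token[:6])    # special tokens first, then corpus tokens by frequency

# Optionally attach pretrained vectors (downloaded on first use).
embedding = nlp.embedding.create('glove', source='glove.6B.50d')
vocab.set_embedding(embedding)
print(vocab.embedding['hello'].shape)    # (50,)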


github dmlc / gluon-nlp / tests / unittest / test_vocab_embed.py
    assert v9.unknown_token == '<unk>'
    assert v9.reserved_tokens == ['b', 'a']
    assert v9.embedding is None
    assert 'a' in v9

    v10 = nlp.Vocab(counter, max_size=None, min_freq=100, unknown_token='<unk>',
                    padding_token=None, bos_token=None, eos_token=None, reserved_tokens=['b', 'c'])
    assert len(v10) == 3
    assert v10.token_to_idx == {'<unk>': 0, 'b': 1, 'c': 2}
    assert v10.idx_to_token[1] == 'b'
    assert v10.unknown_token == '<unk>'
    assert v10.reserved_tokens == ['b', 'c']
    assert v10.embedding is None
    assert 'a' not in v10

    v11 = nlp.Vocab(counter, max_size=1, min_freq=2, unknown_token='<unk>',
                    padding_token=None, bos_token=None, eos_token=None,
                    reserved_tokens=['<pad>', 'b'])
    assert len(v11) == 4
    assert v11.token_to_idx == {'<unk>': 0, '<pad>': 1, 'b': 2, 'c': 3}
    assert v11.idx_to_token[1] == '<pad>'
    assert v11.unknown_token == '<unk>'
    assert v11.reserved_tokens == ['<pad>', 'b']
    assert v11.embedding is None
    assert 'a' not in v11

    v12 = nlp.Vocab(counter, max_size=None, min_freq=2, unknown_token='b',
                    padding_token=None, bos_token=None, eos_token=None, reserved_tokens=['<pad>'])
    assert len(v12) == 3
    assert v12.token_to_idx == {'b': 0, '<pad>': 1, 'c': 2}
    assert v12.idx_to_token[1] == '<pad>'
    assert v12.unknown_token == 'b'
github dmlc / gluon-nlp / tests / unittest / test_vocab_embed.py
    assert v3.embedding.token_to_idx == {'<unk>': 0, 'c': 1, 'b': 2, 'a': 3, 'some_word$': 4}
    assert v3.embedding.idx_to_token == ['<unk>', 'c', 'b', 'a', 'some_word$']
    assert_almost_equal(v3.embedding.idx_to_vec.asnumpy(),
                        np.array([[1.1, 1.2, 1.3, 1.4, 1.5,
                                   0.11, 0.12, 0.13, 0.14, 0.15],
                                  [1.1, 1.2, 1.3, 1.4, 1.5,
                                   0.06, 0.07, 0.08, 0.09, 0.1],
                                  [0.6, 0.7, 0.8, 0.9, 1,
                                   0.11, 0.12, 0.13, 0.14, 0.15],
                                  [0.1, 0.2, 0.3, 0.4, 0.5,
                                   0.01, 0.02, 0.03, 0.04, 0.05],
                                  [1.1, 1.2, 1.3, 1.4, 1.5,
                                   0.11, 0.12, 0.13, 0.14, 0.15]])
                        )

    v4 = nlp.Vocab(counter, max_size=None, min_freq=1, unknown_token='<unk>', padding_token=None,
                   bos_token=None, eos_token=None, reserved_tokens=None)
    v4.set_embedding(my_embed3, my_embed4)
    assert v4.embedding.token_to_idx == {'<unk>': 0, 'c': 1, 'b': 2, 'a': 3, 'some_word$': 4}
    assert v4.embedding.idx_to_token == ['<unk>', 'c', 'b', 'a', 'some_word$']
    assert_almost_equal(v4.embedding.idx_to_vec.asnumpy(),
                        np.array([[1.1, 1.2, 1.3, 1.4, 1.5,
                                   0.11, 0.12, 0.13, 0.14, 0.15],
                                  [1.1, 1.2, 1.3, 1.4, 1.5,
                                   0.06, 0.07, 0.08, 0.09, 0.1],
                                  [0.6, 0.7, 0.8, 0.9, 1,
                                   0.11, 0.12, 0.13, 0.14, 0.15],
                                  [0.1, 0.2, 0.3, 0.4, 0.5,
                                   0.01, 0.02, 0.03, 0.04, 0.05],
                                  [1.1, 1.2, 1.3, 1.4, 1.5,
                                   0.11, 0.12, 0.13, 0.14, 0.15]])
                        )
github dmlc / gluon-nlp / tests / unittest / test_models.py
def test_big_text_models(wikitext2_val_and_counter):
    # use a small vocabulary for testing
    val, val_freq = wikitext2_val_and_counter
    vocab = nlp.Vocab(val_freq)
    text_models = ['big_rnn_lm_2048_512']

    for model_name in text_models:
        eprint('testing forward for %s' % model_name)
        model, _ = nlp.model.get_model(model_name, vocab=vocab)

        print(model)
        model.collect_params().initialize()
        batch_size = 10
        hidden = model.begin_state(batch_size=batch_size, func=mx.nd.zeros)
        output, state = model(mx.nd.arange(330).reshape((33, 10)), hidden)
        output.wait_to_read()
github dmlc / gluon-nlp / scripts / sentiment_analysis / process_data.py
def _build_vocab(data_name, train_dataset, test_dataset, dev_dataset):
    all_token = []
    max_len = 0
    for dataset in (train_dataset, dev_dataset, test_dataset):
        for line in dataset:
            line = _clean_str(line[0], data_name).split()
            max_len = max_len if max_len > len(line) else len(line)
            all_token.extend(line)
    vocab = nlp.Vocab(nlp.data.count_tokens(all_token))
    vocab.set_embedding(nlp.embedding.create('Word2Vec', source='GoogleNews-vectors-negative300'))
    for word in vocab.embedding._idx_to_token:
        if (vocab.embedding[word] == nd.zeros(300)).sum() == 300:
            vocab.embedding[word] = nd.random.uniform(0, 0.05, 300)
    # Special tokens: random vector for the unknown token, zero vectors for padding/BOS/EOS.
    vocab.embedding['<unk>'] = nd.random.uniform(0, 0.05, 300)
    vocab.embedding['<pad>'] = nd.zeros(300)
    vocab.embedding['<bos>'] = nd.zeros(300)
    vocab.embedding['<eos>'] = nd.zeros(300)
    print('maximum length (in tokens): ', max_len)
    return vocab, max_len
github xwhan / Extremely-Fine-Grained-Entity-Typing / label_corr.py
        word_vocab.set_embedding(embed)
        label_vectors = []
        for id_ in range(len(label2id.keys())):
            label = id2label[id_]
            label_words = label.split('_')
            label_vectors.append(word_vocab.embedding[label_words].asnumpy().sum(0))
        affinity = cosine_similarity(label_vectors)
    else:
        print("BOW features for ontonotes")
        words = []
        for label in label2id.keys():
            label = label.replace('/', ' ')
            labels = label.strip().split()
            words += labels
        word_counter = gluonnlp.data.count_tokens(words)
        word_vocab = gluonnlp.Vocab(word_counter)
        embed = gluonnlp.embedding.create(emb_name, source=emb_source)
        word_vocab.set_embedding(embed)

        label_list = []
        label_vectors = []
        for id_ in range(len(label2id.keys())):
            label = id2label[id_]
            label = label.replace('/', ' ')
            labels = label.strip().split()
            label_list.append(labels)
            label_vectors.append(word_vocab.embedding[labels].asnumpy().sum(0))
        label_vectors = np.array(label_vectors)
        affinity = cosine_similarity(label_vectors)

    matrix = np.zeros((len(label2id.keys()), len(label2id.keys())))
    if goal == 'onto':
github xwhan / Extremely-Fine-Grained-Entity-Typing / data_utils.py
def build_vocab(file_list = ['crowd/dev.json', 'crowd/train_m.json', 'crowd/test.json', 'ontonotes/augmented_train.json', 'ontonotes/g_dev.json', 'ontonotes/g_test.json', 'distant_supervision/headword_train.json', 'distant_supervision/headword_dev.json', 'distant_supervision/el_dev.json', 'distant_supervision/el_train.json']):
  data_path = "data/release/"
  words = []
  for file in file_list:
    file_name = data_path + file
    with open(file_name) as f:
      line_elems = [json.loads(sent.strip()) for sent in f.readlines()]
      mention_seq = [line_elem["mention_span"].split() for line_elem in line_elems]
      left_seq = [line_elem['left_context_token'] for line_elem in line_elems]
      right_seq = [line_elem['right_context_token'] for line_elem in line_elems]
      for _ in mention_seq + right_seq + left_seq:
        words += [tok.lower() for tok in _]
  counter = gluonnlp.data.count_tokens(words)
  vocab = gluonnlp.Vocab(counter)
  with open('data/release/idx_to_token', 'w') as g:
    g.write('\n'.join(vocab.idx_to_token))
  with open('data/release/token_to_idx.json', 'w') as g:
    json.dump(vocab.token_to_idx, g)
github dmlc / gluon-nlp / scripts / ner / data.py
        self.text_vocab = text_vocab
        self.seq_len = seq_len

        self.bert_tokenizer = nlp.data.BERTTokenizer(vocab=text_vocab, lower=not is_cased)

        train_sentences = [] if train_path is None else load_segment(train_path,
                                                                     self.bert_tokenizer)
        dev_sentences = [] if dev_path is None else load_segment(dev_path, self.bert_tokenizer)
        test_sentences = [] if test_path is None else load_segment(test_path, self.bert_tokenizer)
        all_sentences = train_sentences + dev_sentences + test_sentences

        if tag_vocab is None:
            logging.info('Indexing tags...')
            tag_counter = nlp.data.count_tokens(token.tag
                                                for sentence in all_sentences for token in sentence)
            self.tag_vocab = nlp.Vocab(tag_counter, padding_token=NULL_TAG,
                                       bos_token=None, eos_token=None, unknown_token=None)
        else:
            self.tag_vocab = tag_vocab
        self.null_tag_index = self.tag_vocab[NULL_TAG]

        if len(test_sentences) > 0:
            logging.info('example test sentences:')
            for i in range(2):
                logging.info(str(test_sentences[i]))

        self.train_inputs = [self._encode_as_input(sentence) for sentence in train_sentences]
        self.dev_inputs = [self._encode_as_input(sentence) for sentence in dev_sentences]
        self.test_inputs = [self._encode_as_input(sentence) for sentence in test_sentences]

        logging.info('tag_vocab: %s', self.tag_vocab)
github dmlc / gluon-nlp / scripts / question_answering / data_processing.py
    def _create_squad_vocab(all_tokens):
        """Provides vocabulary based on list of tokens

        Parameters
        ----------

        all_tokens: List[str]
            List of all tokens

        Returns
        -------
        Vocab
            Vocabulary
        """
        counter = data.count_tokens(all_tokens)
        vocab = Vocab(counter)
        return vocab