How to use the count_tokens function in gluonnlp

To help you get started, we’ve selected a few gluonnlp examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dmlc / gluon-nlp / tests / unittest / View on Github external
def _test_count_tokens(token_delim, seq_delim):
    """Check ``count_tokens`` on the shared test string for the given delimiters.

    Exercises three cases: case-sensitive counting, lower-cased counting, and
    counting on top of a pre-populated counter.

    Parameters
    ----------
    token_delim : str
        Delimiter passed to the test-string builder and to ``simple_tokenize``.
    seq_delim : str
        Sequence delimiter passed to the same two helpers.
    """
    # NOTE(review): the call targets were lost when this snippet was extracted.
    # `nlp.data.count_tokens` and `collections.Counter` are reconstructed from
    # the surviving arguments and expected values -- confirm the import aliases
    # against the original test module.
    from collections import Counter

    source_str = _get_test_str_of_tokens(token_delim, seq_delim)

    tokens = list(simple_tokenize(source_str, token_delim, seq_delim))
    cnt1 = nlp.data.count_tokens(tokens, to_lower=False)
    assert cnt1 == Counter(
        {'is': 2, 'life': 2, '.': 2, 'Life': 1, 'great': 1, '!': 1, 'good': 1, "isn't": 1,
         'bad': 1})

    cnt2 = nlp.data.count_tokens(tokens, to_lower=True)
    assert cnt2 == Counter(
        {'life': 3, 'is': 2, '.': 2, 'great': 1, '!': 1, 'good': 1, "isn't": 1, 'bad': 1}), cnt2

    counter_to_update = Counter({'life': 2})

    # Counting into a seeded counter adds to the existing counts, hence
    # 'life': 4 below (2 seeded + 2 counted). A copy is passed so the seed
    # itself is left untouched.
    cnt3 = nlp.data.count_tokens(tokens, to_lower=False,
                                 counter=counter_to_update.copy())
    assert cnt3 == Counter(
        {'is': 2, 'life': 4, '.': 2, 'Life': 1, 'great': 1, '!': 1, 'good': 1, "isn't": 1,
         'bad': 1})
github aisolab / nlp_implementation / A_Structured_Self-attentive_Sentence_Embedding / View on Github external
# Split the labelled data, build a vocabulary from the training split, and
# persist the vocabulary together with the train/validation/test tables.
tr_data, val_data = train_test_split(data, test_size=.2)

tst_filepath = proj_dir / 'data/ratings_test.txt'
tst_data = pd.read_csv(tst_filepath, sep='\t').loc[:, ['document', 'label']]
# drop rows whose document text is missing
tst_data = tst_data.loc[tst_data['document'].notna(), :]

# extracting morph in sentences
tokenizer = MeCab()
tokenized = tr_data['document'].apply(tokenizer.morphs)

# distribution of sentence lengths (in morphemes)
plt.hist(list(map(len, tokenized)))

# making the vocab
# NOTE(review): the right-hand side was lost in extraction; counting every
# morpheme of every tokenized sentence is the reconstruction -- confirm.
counter = nlp.data.count_tokens([morph for sent in tokenized for morph in sent])
vocab = nlp.Vocab(counter=counter, min_freq=10, bos_token=None, eos_token=None)

# connecting SISG embedding with vocab
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
# Without this call the embedding is created but never attached to the vocab.
vocab.set_embedding(ptr_embedding)

# saving vocab
with open('./data/vocab.pkl', mode='wb') as fh:
    pickle.dump(vocab, fh)

# saving tr_data, val_data, tst_data
tr_data.to_csv('./data/train.txt', index=False, sep='\t')
val_data.to_csv('./data/val.txt', index=False, sep='\t')
tst_data.to_csv('./data/test.txt', index=False, sep='\t')
github xwhan / Extremely-Fine-Grained-Entity-Typing / View on Github external
def build_vocab(file_list = ['crowd/dev.json', 'crowd/train_m.json', 'crowd/test.json', 'ontonotes/augmented_train.json', 'ontonotes/g_dev.json', 'ontonotes/g_test.json', 'distant_supervision/headword_train.json', 'distant_supervision/headword_dev.json', 'distant_supervision/el_dev.json', 'distant_supervision/el_train.json']):
  """Build a vocabulary over the given JSON-lines files and write it to disk.

  Every line of every file is parsed as JSON. The lower-cased tokens of its
  mention span, left context and right context are counted, and the resulting
  vocabulary is written under ``data/release/``.

  Parameters
  ----------
  file_list : list of str
      File paths relative to ``data/release/``. The default list is only
      read, never mutated.
  """
  data_path = "data/release/"
  words = []
  for file in file_list:
    file_name = data_path + file
    with open(file_name) as f:
      line_elems = [json.loads(sent.strip()) for sent in f]
    mention_seq = [line_elem["mention_span"].split() for line_elem in line_elems]
    left_seq = [line_elem['left_context_token'] for line_elem in line_elems]
    right_seq = [line_elem['right_context_token'] for line_elem in line_elems]
    for seq in mention_seq + right_seq + left_seq:
      words.extend(tok.lower() for tok in seq)
  # NOTE(review): the right-hand side was lost in extraction; counting the
  # collected words is the reconstruction -- confirm.
  counter = gluonnlp.data.count_tokens(words)
  vocab = gluonnlp.Vocab(counter)
  with open('data/release/idx_to_token', 'w') as g:
    # NOTE(review): the body of this `with` was lost in extraction. Writing one
    # token per line is an assumption -- confirm the expected file format.
    g.write('\n'.join(vocab.idx_to_token))
  with open('data/release/token_to_idx.json', 'w') as g:
    json.dump(vocab.token_to_idx, g)
github aisolab / nlp_implementation / Bidirectional_LSTM-CRF_Models_for_Sequence_Tagging / View on Github external
data = []

    except StopIteration:
        print('parsing is done')

# Build the label vocabulary from the second field of every example.
# NOTE(review): the front of this call chain was lost in extraction. The three
# closing parentheses imply count_tokens over the flattened per-example label
# sequences -- confirm.
label_counter = nlp.data.count_tokens([label for elm in dataset for label in elm[1]])
tmp_label_vocab = nlp.Vocab(label_counter, unknown_token=None)
label_vocab = Vocab(tmp_label_vocab.idx_to_token, unknown_token=None)

with open('./data/label_vocab.pkl', mode='wb') as fh:
    pickle.dump(label_vocab, fh)

# Build the token vocabulary from the training split only (first field).
tr, val = train_test_split(dataset, test_size=.1, random_state=777)
token_counter = nlp.data.count_tokens([token for elm in tr for token in elm[0]])
tmp_token_vocab = nlp.Vocab(token_counter, min_freq=10)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
# The embedding must be attached before `tmp_token_vocab.embedding` is read
# below; otherwise `ptr_embedding` is never used.
tmp_token_vocab.set_embedding(ptr_embedding)
token_vocab = Vocab(tmp_token_vocab.idx_to_token)
token_vocab.embedding = tmp_token_vocab.embedding.idx_to_vec.asnumpy()

with open('./data/token_vocab.pkl', mode='wb') as fh:
    pickle.dump(token_vocab, fh)
with open('./data/train.pkl', mode='wb') as fh:
    pickle.dump(tr, fh)
with open('./data/validation.pkl', mode='wb') as fh:
    pickle.dump(val, fh)
github dmlc / gluon-nlp / scripts / language_model / conversion_utils / View on Github external
special_tokens['unknown_token'] = ''

    # Discover special tokens
    if [''] == corpus.vocab.special:
        if '' in sym2idx:  # Only include if special token is actually used
            special_tokens['eos_token'] = ''
    elif '<s>' in sym2idx:
        # Special case for model trained on Google 1 Billion Word LM dataset
        special_tokens['eos_token'] = '<s>'
    elif corpus.vocab.special:
        raise NotImplementedError('Provided TransformerXL cache.pkl uses an unknown special token. '
                                  'You must extend the `to_gluon_vocab` method to support it.')
        special_tokens['eos_token'] = None

    counter =
    vocab = nlp.vocab.Vocab(counter, token_to_idx=sym2idx, **special_tokens)
    return vocab
github dmlc / gluon-nlp / scripts / word_embeddings / View on Github external
tokens = evaluation.get_similarity_task_tokens(args_)
        vocab = nlp.Vocab(
        with utils.print_time('set {} embeddings'.format(len(tokens))):
            args_, vocab.embedding, ctx, logfile=os.path.join(
                args_.logdir, 'similarity{}.tsv'.format(name)))
    if args_.analogy_datasets:
        with utils.print_time('extend open vocabulary with '
                              'OOV tokens for analogy'):
            tokens = evaluation.get_analogy_task_tokens(args_)
            if token_embedding_.unknown_token is not None:
        vocab = nlp.Vocab(
        with utils.print_time('set {} embeddings'.format(len(tokens))):
            args_, vocab.embedding, ctx, logfile=os.path.join(
                args_.logdir, 'analogy{}.tsv'.format(name)))
github dmlc / gluon-nlp / scripts / natural_language_inference / View on Github external
def build_vocab(dataset):
    """
    Build vocab given a dataset.

    Counts every word in the first two fields of each example.
    """
    # NOTE(review): the call target and the continuation argument were lost in
    # extraction (the original line ends with a trailing comma).
    # `to_lower=True` is the reconstruction -- confirm.
    counter = nlp.data.count_tokens([w for e in dataset for s in e[:2] for w in s],
                                    to_lower=True)
    vocab = nlp.Vocab(counter)
    return vocab
github dmlc / gluon-nlp / scripts / word_embeddings / View on Github external
        Each sample is a valid input to
        Vocabulary of all tokens in Text8 that occur at least min_freq times of
        maximum size max_vocab_size.
    idx_to_counts : list of int
        Mapping from token indices to their occurrence-counts in the Text8

    with print_time('count and construct vocabulary'):
        counter =
        vocab = nlp.Vocab(counter, unknown_token=None, padding_token=None,
                          bos_token=None, eos_token=None, min_freq=min_freq,
        idx_to_counts = [counter[w] for w in vocab.idx_to_token]

    def code(sentence):
        return [vocab[token] for token in sentence if token in vocab]

    with print_time('code data'):
        data = data.transform(code, lazy=False)
    data =[data])
    return data, vocab, idx_to_counts
github dmlc / gluon-nlp / scripts / question_answering / View on Github external
def __call__(self, example):
    """Maps examples into distinct tokens

    Parameters
    ----------
    example : dict
        Example to process with context_tokens and ques_tokens keys

    Returns
    -------
    mapped_values : List[Tuple]
        Result of mapping process. Each tuple of (token, count) format
    """
    # When `_iterate_over_example` is set, count the elements of every token
    # (its characters, when tokens are strings) instead of the tokens themselves.
    para_counter = data.count_tokens(example['context_tokens'] if not self._iterate_over_example
                                     else [c for tkn in example['context_tokens'] for c in tkn])
    ques_counter = data.count_tokens(example['ques_tokens'] if not self._iterate_over_example
                                     else [c for tkn in example['ques_tokens'] for c in tkn])
    # Merge the context and question counts into one counter.
    counter = para_counter + ques_counter
    return list(counter.items())