How to use the gluonnlp.embedding module in gluonnlp

To help you get started, we’ve selected a few gluonnlp.embedding examples based on popular ways it is used in public projects.

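Before the repository snippets, here is a minimal sketch of the typical gluonnlp.embedding workflow: create a pretrained embedding by name and attach it to a Vocab. The token list is illustrative, and downloading 'wiki.simple' requires network access.

import gluonnlp as nlp

# Download the pretrained fastText vectors for the small 'wiki.simple' source
# (also used by several of the snippets below).
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.simple')

# Build a vocabulary from token counts and attach the pretrained vectors to it.
counter = nlp.data.count_tokens(['hello', 'world', 'hello'])
vocab = nlp.Vocab(counter)
vocab.set_embedding(ptr_embedding)

# Rows of idx_to_vec line up with vocab.idx_to_token, special tokens included.
print(vocab.embedding.idx_to_vec.shape)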

github dmlc / gluon-nlp / tests / unittest / train / test_embedding.py (View on GitHub)
def test_word2vec_embedding_load_binary_format():
    test_dir = os.path.dirname(os.path.realpath(__file__))
    with pytest.warns(UserWarning):  # UserWarning: skipped likely header line
        word2vec_vec = nlp.embedding.Word2Vec.from_file(
            os.path.join(str(test_dir), 'test_embedding', 'lorem_ipsum_w2v.vec'), elem_delim=' ')
    word2vec_bin = nlp.embedding.Word2Vec.from_w2v_binary(
        os.path.join(str(test_dir), 'test_embedding', 'lorem_ipsum_w2v.bin')
    )
    idx_to_vec = word2vec_bin[word2vec_vec.idx_to_token]
    assert np.all(
        np.isclose(a=word2vec_vec.idx_to_vec.asnumpy(),
                   b=idx_to_vec.asnumpy(), atol=0.001))
    assert all(token in word2vec_bin for token in word2vec_vec.idx_to_token)
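
The test above leans on two TokenEmbedding conveniences: indexing with a token (or a list of tokens) returns the corresponding vector(s), and `in` tests membership in the loaded vocabulary. A small sketch, assuming `emb` is any loaded nlp.embedding.TokenEmbedding and the tokens are illustrative:

vec = emb['hello']               # one token  -> 1-D NDArray of shape (dim,)
mat = emb[['hello', 'world']]    # token list -> 2-D NDArray, one row per token
print('hello' in emb)            # True only if the token is in the loaded vocabulary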
github dmlc / gluon-nlp / tests / unittest / test_token_embedding.py (View on GitHub)
def test_serialization(emb, tmp_path=tmp_path):
        emb_path = os.path.join(str(tmp_path), "emb.npz")
        if unknown_lookup:
            with pytest.warns(UserWarning):  # UserWarning: Serialization of `unknown_lookup` is not supported
                emb.serialize(emb_path)
        else:
            emb.serialize(emb_path)
        loaded_emb = nlp.embedding.TokenEmbedding.deserialize(emb_path)
        assert loaded_emb == emb
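
Outside the parametrized test, the round trip is just two calls. A minimal sketch, assuming `emb` is a TokenEmbedding without an attached unknown_lookup (which, as the warning notes, is not serialized) and 'emb.npz' is an illustrative path:

emb.serialize('emb.npz')                                      # tokens and vectors are written to a NumPy .npz archive
loaded = nlp.embedding.TokenEmbedding.deserialize('emb.npz')
assert loaded == emb                                          # the round trip preserves tokens and vectors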
github aisolab / nlp_implementation / Bidirectional_LSTM-CRF_Models_for_Sequence_Tagging / build_dataset_and_vocab.py (View on GitHub)
    except StopIteration:
        print('parsing is done')


label_counter = nlp.data.count_tokens(itertools.chain.from_iterable(map(lambda elm: elm[1], dataset)))
tmp_label_vocab = nlp.Vocab(label_counter, unknown_token=None)
label_vocab = Vocab(tmp_label_vocab.idx_to_token, unknown_token=None)

with open('./data/label_vocab.pkl', mode='wb') as io:
    pickle.dump(label_vocab, io)

tr, val = train_test_split(dataset, test_size=.1, random_state=777)
token_counter = nlp.data.count_tokens(itertools.chain.from_iterable(map(lambda elm: elm[0], tr)))
tmp_token_vocab = nlp.Vocab(token_counter, min_freq=10)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
tmp_token_vocab.set_embedding(ptr_embedding)
token_vocab = Vocab(tmp_token_vocab.idx_to_token)
token_vocab.embedding = tmp_token_vocab.embedding.idx_to_vec.asnumpy()

with open('./data/token_vocab.pkl', mode='wb') as io:
    pickle.dump(token_vocab, io)
with open('./data/train.pkl', mode='wb') as io:
    pickle.dump(tr, io)
with open('./data/validation.pkl', mode='wb') as io:
    pickle.dump(val, io)
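
The pattern above (count tokens, build a Vocab, attach pretrained vectors, keep only the NumPy matrix) usually ends with loading that matrix into a model's embedding layer. A hedged sketch with mxnet.gluon, reusing tmp_token_vocab from the snippet; the layer itself is illustrative:

import mxnet as mx

embedding_matrix = tmp_token_vocab.embedding.idx_to_vec      # shape: (vocab_size, embedding_dim)
layer = mx.gluon.nn.Embedding(*embedding_matrix.shape)
layer.initialize()
layer.weight.set_data(embedding_matrix)                      # row i stays aligned with vocab index i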
github dmlc / gluon-nlp / scripts / word_embeddings / evaluate_pretrained.py (View on GitHub)
def validate_args(args):
    """Validate provided arguments and act on --help."""
    if args.list_embedding_sources:
        print('Listing all sources for {} embeddings.'.format(
            args.embedding_name))
        print('Specify --embedding-name if you wish to '
              'list sources of other embeddings')
        print('')
        if args.embedding_name not in nlp.embedding.list_sources().keys():
            print('Invalid embedding name.')
            print('Only {} are supported.'.format(', '.join(
                nlp.embedding.list_sources().keys())))
            sys.exit(1)
        print(' '.join(nlp.embedding.list_sources()[args.embedding_name]))
        sys.exit(0)

    if not (args.embedding_path or args.embedding_name):
        print('You must specify either --embedding-path or --embedding-name ')
        print('Use --embedding-path to load and evaluate '
              'word embeddings from a Word2Vec text format '
              'or fastText binary format file')
        print('Use --embedding-name to download one of '
              'the pre-trained embedding files included in GluonNLP.')
        sys.exit(1)

    if args.embedding_name and not args.embedding_source:
        print('Please also specify --embedding-source'
              ' to select the version of the pre-trained embedding. '
              'Use --list-embedding-sources to see all available sources')
        sys.exit(1)
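
The validation above leans on nlp.embedding.list_sources(), which, called without arguments, maps each embedding name to its available sources. A quick sketch of how it is typically inspected:

import gluonnlp as nlp

sources = nlp.embedding.list_sources()          # dict: embedding name -> list of source names
print(sorted(sources.keys()))                   # e.g. 'fasttext', 'glove', 'word2vec', ...
print(sources['fasttext'][:5])                  # a few fastText sources, e.g. 'wiki.simple'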
github dmlc / gluon-nlp / scripts / word_embeddings / evaluate_pretrained.py (View on GitHub)
        # Analogy task is open-vocabulary, so must keep all known words.
        # But if not evaluating analogy, no need to precompute now as all
        # words for closed vocabulary task can be obtained via the unknown
        # lookup
        if not args.analogy_datasets:
            idx_to_token = []
        elif args.analogy_datasets and args.analogy_max_vocab_size:
            idx_to_token = idx_to_token[:args.analogy_max_vocab_size]

        embedding[''] = mx.nd.zeros(model.weight.shape[1])
        if idx_to_token:
            with utils.print_time('compute vectors for {} known '
                                  'words.'.format(len(idx_to_token))):
                embedding[idx_to_token] = model[idx_to_token]
    else:
        embedding = nlp.embedding.TokenEmbedding.from_file(args.embedding_path)

    return embedding
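
The two branches above are the two ways this script can obtain a TokenEmbedding: download a named pretrained source, or read a local file. A brief sketch of both; the file path is illustrative:

# Download a named pretrained embedding.
emb = nlp.embedding.create('fasttext', source='wiki.simple')

# Or load vectors from a local Word2Vec-style text file.
emb = nlp.embedding.TokenEmbedding.from_file('my_vectors.vec', elem_delim=' ')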
github aisolab / nlp_implementation / Effective_Approaches_to_Attention-based_Neural_Machine_Translation / build_vocab.py (View on GitHub)
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()

vocab_ko = Vocab(tmp_vocab.idx_to_token, bos_token=None, eos_token=None)
vocab_ko.embedding = array
vocab_ko_filepath = sample_dir / "vocab_ko.pkl"
config.update({"source_vocab": str(vocab_ko_filepath)})

with open(vocab_ko_filepath, mode='wb') as io:
    pickle.dump(vocab_ko, io)

# english vocab
split_en = Stemmer(language='en')
count_en = Counter(itertools.chain.from_iterable(tr['en'].apply(split_en.extract_stem).tolist()))
tmp_vocab = nlp.Vocab(count_en)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.simple', load_ngrams=True)
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()

vocab_en = Vocab(tmp_vocab.idx_to_token)
vocab_en.embedding = array
vocab_en_filepath = sample_dir / "vocab_en.pkl"
config.update({"target_vocab": str(vocab_en_filepath)})

with open(vocab_en_filepath, mode='wb') as io:
    pickle.dump(vocab_en, io)

config.save("conf/dataset/sample.json")
github elitcloud / elit / elit / component / dep / common / data.py (View on GitHub)
def get_pret_embs(self, word_dims=None):
        """Read pre-trained embedding file

        Parameters
        ----------
        word_dims : int or None
            Vector size; use `None` to infer it automatically.

        Returns
        -------
        numpy.ndarray
            T x C numpy NDArray
        """
        assert (self._pret_embeddings is not None), "No pretrained file provided."
        pret_embeddings = gluonnlp.embedding.create(self._pret_embeddings[0], source=self._pret_embeddings[1])
        embs = [None] * len(self._id2word)
        for idx, vec in enumerate(pret_embeddings.idx_to_vec):
            embs[idx] = vec.asnumpy()
        if word_dims is None:
            word_dims = len(pret_embeddings.idx_to_vec[0])
        for idx, emb in enumerate(embs):
            if emb is None:
                embs[idx] = np.zeros(word_dims)
        pret_embs = np.array(embs, dtype=np.float32)
        return pret_embs / np.std(pret_embs)
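
Note that pret_embeddings.idx_to_vec is a single NDArray, so the per-row loop above can be collapsed into one conversion when the pretrained rows already line up one-to-one with self._id2word; a hedged sketch of that shortcut:

# Only valid when len(pret_embeddings.idx_to_vec) == len(self._id2word) and the orders match.
pret_embs = pret_embeddings.idx_to_vec.asnumpy().astype(np.float32)
pret_embs /= np.std(pret_embs)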
github dmlc / gluon-nlp / scripts / text_classification / evaluation.py (View on GitHub)
def evaluate_analogy(args, token_embedding, ctx, logfile=None, global_step=0):
    """Evaluate on specified analogy datasets.

    The analogy task is an open-vocabulary task, so make sure to pass a
    token_embedding with a sufficiently large number of supported tokens.

    """
    results = []
    exclude_question_words = not args.analogy_dont_exclude_question_words
    for analogy_function in args.analogy_functions:
        evaluator = nlp.embedding.evaluation.WordEmbeddingAnalogy(
            idx_to_vec=token_embedding.idx_to_vec,
            exclude_question_words=exclude_question_words,
            analogy_function=analogy_function)
        evaluator.initialize(ctx=ctx)
        if not args.no_hybridize:
            evaluator.hybridize()

        for (dataset_name, dataset_kwargs,
             dataset) in iterate_analogy_datasets(args):
            initial_length = len(dataset)
            dataset_coded = [[
                token_embedding.token_to_idx[d[0]],
                token_embedding.token_to_idx[d[1]],
                token_embedding.token_to_idx[d[2]],
                token_embedding.token_to_idx[d[3]]
            ] for d in dataset if d[0] in token_embedding.token_to_idx
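
Once initialized (and optionally hybridized), the evaluator is called with three columns of token indices and returns the indices of its top predicted answer words; a hedged sketch, assuming dataset_coded and ctx come from the surrounding loop:

import mxnet as mx

words = mx.nd.array(dataset_coded, ctx=ctx)                   # shape (N, 4): a, b, c, expected d
pred_idxs = evaluator(words[:, 0], words[:, 1], words[:, 2])  # top-k predicted indices per analogy
accuracy = (pred_idxs[:, 0] == words[:, 3]).mean().asscalar()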
github aisolab / nlp_implementation / A_Structured_Self-attentive_Sentence_Embedding / preprocessing.py (View on GitHub)
tst_data = pd.read_csv(tst_filepath, sep='\t').loc[:, ['document', 'label']]
tst_data = tst_data.loc[tst_data['document'].isna().apply(lambda elm: not elm), :]

# extracting morphemes from sentences
tokenizer = MeCab()
tokenized = tr_data['document'].apply(tokenizer.morphs)

plt.hist(list(map(lambda elm: len(elm), tokenized)))
plt.show()

# making the vocab
counter = nlp.data.count_tokens(itertools.chain.from_iterable(tokenized))
vocab = nlp.Vocab(counter=counter, min_freq=10, bos_token=None, eos_token=None)

# connecting the pretrained SISG (fastText) embedding with the vocab
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko')
vocab.set_embedding(ptr_embedding)

# saving vocab
with open('./data/vocab.pkl', mode='wb') as io:
    pickle.dump(vocab, io)

# saving tr_data, val_data, tst_data
tr_data.to_csv('./data/train.txt', index=False, sep='\t')
val_data.to_csv('./data/val.txt', index=False, sep='\t')
tst_data.to_csv('./data/test.txt', index=False, sep='\t')
github dmlc / gluon-nlp / scripts / word_embedding_evaluation / word_embedding_evaluation.py (View on GitHub)
def get_args():
    """Construct the argument parser."""
    parser = argparse.ArgumentParser(
        description='Word embedding training with Gluon.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Embeddings arguments
    group = parser.add_argument_group('Embedding arguments')
    group.add_argument('--embedding-name', type=str, default='fasttext',
                       help=('Name of embedding type to load. '
                             'Valid entries: {}'.format(
                                 ', '.join(
                                     nlp.embedding.list_sources().keys()))))
    group.add_argument('--embedding-source', type=str, default='wiki.simple',
                       help=('Source from which to initialize the embedding. '
                             'Pass --list-embedding-sources to get a list of '
                             'valid sources for a given --embedding-name.'))
    group.add_argument('--list-embedding-sources', action='store_true')

    # Evaluation arguments
    group = parser.add_argument_group('Evaluation arguments')
    group.add_argument('--ignore-oov', action='store_true',
                       help='Drop OOV words from evaluation datasets.')
    ## Datasets
    group.add_argument(
        '--similarity-datasets', type=str,
        default=nlp.data.word_embedding_evaluation.word_similarity_datasets,
        nargs='*',
        help='Word similarity datasets to use for intrinsic evaluation.')