How to use the gluonnlp.embedding.create function in gluonnlp

To help you get started, we’ve selected a few gluonnlp.embedding.create examples based on popular ways it is used in public projects.

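All of the snippets below revolve around one call: nlp.embedding.create(embedding_name, source=...), which downloads (or loads from the local cache) a pretrained TokenEmbedding. As a quick orientation before the real-world examples, here is a minimal sketch; the 'glove.6B.50d' source is used purely for illustration, and the available sources can be listed with nlp.embedding.list_sources.

import gluonnlp as nlp

# See which pretrained sources exist for a given embedding type
print(nlp.embedding.list_sources('glove')[:5])

# Download (or load from cache) the pretrained vectors
glove = nlp.embedding.create('glove', source='glove.6B.50d')
print(glove['hello'].shape)  # vector for a single token, here 50-dimensional

# Typical pattern in the examples below: attach the embedding to a Vocab
counter = nlp.data.count_tokens(['hello', 'world'])
vocab = nlp.Vocab(counter)
vocab.set_embedding(glove)
print(vocab.embedding['world'].shape)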

github dmlc / gluon-nlp / tests / unittest / test_vocab_embed.py
import numpy as np
import gluonnlp as nlp
from mxnet import nd


def test_word_embedding_analogy_evaluation_models(analogy_function):
    dataset = nlp.data.GoogleAnalogyTestSet()
    dataset = [d for i, d in enumerate(dataset) if i < 10]

    embedding = nlp.embedding.create('fasttext', source='wiki.simple')
    counter = nlp.data.utils.Counter(embedding.idx_to_token)
    vocab = nlp.vocab.Vocab(counter)
    vocab.set_embedding(embedding)

    dataset_coded = [[vocab[d[0]], vocab[d[1]], vocab[d[2]], vocab[d[3]]]
                     for d in dataset]
    dataset_coded_nd = nd.array(dataset_coded, dtype=np.int64)

    for k in [1, 3]:
        for exclude_question_words in [True, False]:
            evaluator = nlp.embedding.evaluation.WordEmbeddingAnalogy(
                idx_to_vec=vocab.embedding.idx_to_vec,
                analogy_function=analogy_function, k=k,
                exclude_question_words=exclude_question_words)
            evaluator.initialize()
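
The excerpt stops right after the evaluator is initialized. A minimal sketch of the remaining step, assuming the three question words sit in the first three columns of dataset_coded_nd (this continuation is illustrative, not part of the test):

            # Run the evaluator on the coded analogy questions
            words1, words2, words3 = (dataset_coded_nd[:, i] for i in range(3))
            pred_idxs = evaluator(words1, words2, words3)  # top-k candidate indices per question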

github dmlc / gluon-nlp / tests / unittest / test_vocab_embed.py
def test_token_embedding_from_file_S3_with_custom_unknown_token(unknown_token):
    nlp.embedding.create('glove', source='glove.6B.50d',
                         unknown_token=unknown_token)
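
This test only checks that a GloVe embedding loads from S3 with a user-supplied unknown token. As a rough illustration of what that parameter controls (not part of the test): any out-of-vocabulary lookup falls back to the unknown token's vector, which is zero-initialised by default.

emb = nlp.embedding.create('glove', source='glove.6B.50d', unknown_token='<my_unk>')
oov_vec = emb['token-not-in-glove']  # returns the (zero) vector assigned to '<my_unk>'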

github aisolab / nlp_implementation / Effective_Approaches_to_Attention-based_Neural_Machine_Translation / build_vocab.py
import itertools
import pickle
import pandas as pd
import gluonnlp as nlp
from collections import Counter
from pathlib import Path
from model.split import Stemmer
from model.utils import Vocab
from utils import Config

# loading dataset
sample_dir = Path('sample')
config = Config("conf/dataset/sample.json")
tr = pd.read_csv(config.train, sep='\t')

# korean vocab
split_ko = Stemmer(language='ko')
count_ko = Counter(itertools.chain.from_iterable(tr['ko'].apply(split_ko.extract_stem).tolist()))
tmp_vocab = nlp.Vocab(count_ko, bos_token=None, eos_token=None)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.ko', load_ngrams=True)
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()

vocab_ko = Vocab(tmp_vocab.idx_to_token, bos_token=None, eos_token=None)
vocab_ko.embedding = array
vocab_ko_filepath = sample_dir / "vocab_ko.pkl"
config.update({"source_vocab": str(vocab_ko_filepath)})

with open(vocab_ko_filepath, mode='wb') as io:
    pickle.dump(vocab_ko, io)

# english vocab
split_en = Stemmer(language='en')
count_en = Counter(itertools.chain.from_iterable(tr['en'].apply(split_en.extract_stem).tolist()))
tmp_vocab = nlp.Vocab(count_en)
ptr_embedding = nlp.embedding.create('fasttext', source='wiki.simple', load_ngrams=True)

github dmlc / gluon-nlp / scripts / word_embeddings / evaluate_pretrained.py
def load_embedding_from_gluonnlp(args):
    if args.embedding_name.lower() == 'fasttext':
        token_embedding = nlp.embedding.create(
            args.embedding_name,
            source=args.embedding_source,
            load_ngrams=args.fasttext_load_ngrams)
    else:
        token_embedding = nlp.embedding.create(
            args.embedding_name, source=args.embedding_source)
    return token_embedding
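
A small usage sketch for this helper; the argument values below are placeholders standing in for the script's argparse namespace, not values from the original code.

from types import SimpleNamespace

args = SimpleNamespace(embedding_name='fasttext',
                       embedding_source='wiki.simple',
                       fasttext_load_ngrams=True)
token_embedding = load_embedding_from_gluonnlp(args)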

github dmlc / gluon-nlp / scripts / question_answering / data_processing.py
Returns
        -------
        Vocab
            Word level vocabulary
        """

        if self._options.word_vocab_path and isfile(self._options.word_vocab_path):
            return pickle.load(open(self._options.word_vocab_path, 'rb'))

        all_words = []
        for dataset in self._datasets:
            all_words.extend(self._get_all_word_tokens(dataset))

        word_level_vocab = VocabProvider._create_squad_vocab(all_words)
        word_level_vocab.set_embedding(
            nlp.embedding.create('glove', source='glove.6B.{}d'.format(embedding_size)))

        count = 0

        for i in range(len(word_level_vocab)):
            if (word_level_vocab.embedding.idx_to_vec[i].sum() != 0).asscalar():
                count += 1

        print('{}/{} words have embeddings'.format(count, len(word_level_vocab)))

        if self._options.word_vocab_path:
            pickle.dump(word_level_vocab, open(self._options.word_vocab_path, 'wb'))

        return word_level_vocab

github aisolab / nlp_implementation / A_Structured_Self-attentive_Sentence_Embedding_ptc / build_vocab.py
import itertools
import pickle
import pandas as pd
import gluonnlp as nlp
from pathlib import Path
from collections import Counter
from model.split import split_morphs
from model.utils import Vocab
from utils import Config

qpair_dir = Path("qpair")
config = Config("conf/dataset/qpair.json")
train = pd.read_csv(config.train, sep="\t")

list_of_tokens_qa = train["question1"].apply(lambda sen: split_morphs(sen)).tolist()
list_of_tokens_qb = train["question2"].apply(lambda sen: split_morphs(sen)).tolist()
list_of_tokens = list_of_tokens_qa + list_of_tokens_qb

count_tokens = Counter(itertools.chain.from_iterable(list_of_tokens))
tmp_vocab = nlp.Vocab(counter=count_tokens, bos_token=None, eos_token=None)
ptr_embedding = nlp.embedding.create("fasttext", source="wiki.ko", load_ngrams=True)
tmp_vocab.set_embedding(ptr_embedding)

vocab = Vocab(tmp_vocab.idx_to_token, bos_token=None, eos_token=None)
vocab.embedding = tmp_vocab.embedding.idx_to_vec.asnumpy()

with open(qpair_dir / "vocab.pkl", mode="wb") as io:
    pickle.dump(vocab, io)

config.update({"vocab": str(qpair_dir / "vocab.pkl")})
config.save("conf/dataset/qpair.json")

github dmlc / gluon-nlp / scripts / question_answering / data_pipeline.py
        # (statement reconstructed to mirror the char-counter block below)
        word_counts = list(tqdm.tqdm(word_reducer.run_async(word_partitioned, pool),
                                     total=len(word_partitioned)))
        print('Word counters received in {:.3f} sec'.format(time.time() - tic))

        tic = time.time()
        print('Char counters receiving started.')
        char_mapper = SQuADAsyncVocabMapper(iterate_over_example=True)
        char_reducer = SQuADAsyncVocabReducer()
        char_mapped = list(
            tqdm.tqdm(char_mapper.run_async(itertools.chain(train_examples, dev_examples), pool),
                      total=len(train_examples) + len(dev_examples)))
        char_partitioned = SQuADDataPipeline._partition(itertools.chain(*char_mapped))
        char_counts = list(tqdm.tqdm(char_reducer.run_async(char_partitioned, pool),
                                     total=len(char_partitioned)))
        print('Char counters received in {:.3f} sec'.format(time.time() - tic))

        embedding = nlp.embedding.create('glove', source=emb_file_name)

        if is_cased_embedding:
            word_counts = itertools.chain(*[[(item[0], item[1]),
                                             (item[0].lower(), item[1]),
                                             (item[0].capitalize(), item[1]),
                                             (item[0].upper(), item[1])] for item in word_counts])
        else:
            word_counts = [(item[0].lower(), item[1]) for item in word_counts]

        word_vocab = Vocab({item[0]: item[1] for item in word_counts if
                            not shrink_word_vocab or item[0] in embedding.token_to_idx},
                           bos_token=None, eos_token=None)
        word_vocab.set_embedding(embedding)
        char_vocab = Vocab({item[0]: item[1] for item in char_counts},
                           bos_token=None, eos_token=None)

github aisolab / nlp_implementation / Convolutional_Neural_Networks_for_Sentence_Classification / build_vocab.py
import itertools
import pickle
import pandas as pd
import gluonnlp as nlp
from pathlib import Path
from collections import Counter
from model.split import split_morphs
from model.utils import Vocab
from utils import Config

# loading dataset
nsmc_dir = Path("nsmc")
config = Config("conf/dataset/nsmc.json")
tr = pd.read_csv(config.train, sep="\t").loc[:, ["document", "label"]]

# extracting morph in sentences
list_of_tokens = tr["document"].apply(split_morphs).tolist()

# generating the vocab
token_counter = Counter(itertools.chain.from_iterable(list_of_tokens))
tmp_vocab = nlp.Vocab(
    counter=token_counter, min_freq=10, bos_token=None, eos_token=None
)

# connecting SISG embedding with vocab
ptr_embedding = nlp.embedding.create("fasttext", source="wiki.ko")
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()

vocab = Vocab(
    tmp_vocab.idx_to_token,
    padding_token="<pad>",
    unknown_token="<unk>",
    bos_token=None,
    eos_token=None,
)
vocab.embedding = array

# saving vocab
with open(nsmc_dir / "vocab.pkl", mode="wb") as io:
    pickle.dump(vocab, io)
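
The pickled vocabulary carries the pretrained vectors as a plain numpy array on its embedding attribute. A minimal sketch, not part of the original script, of how that array might later initialise a Gluon embedding layer:

import pickle
import mxnet as mx
from mxnet.gluon import nn

with open(nsmc_dir / "vocab.pkl", mode="rb") as io:
    vocab = pickle.load(io)

# vocab.embedding is the (vocab_size, embedding_dim) array saved above
embedding_layer = nn.Embedding(*vocab.embedding.shape)
embedding_layer.initialize()
embedding_layer.weight.set_data(mx.nd.array(vocab.embedding))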