How to use the gensim.models.KeyedVectors.load_word2vec_format function in gensim

To help you get started, we've selected a few examples showing how gensim's KeyedVectors.load_word2vec_format is used in popular public projects.
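
load_word2vec_format reads vectors produced by the original word2vec tool, in either binary or plain-text form. As a quick orientation before the project examples, here is a minimal sketch; the file name is a placeholder, while binary and limit are real parameters of the function:

from gensim.models import KeyedVectors

# Load only the first 100,000 vectors of a binary word2vec file (path is a placeholder)
kv = KeyedVectors.load_word2vec_format('vectors.bin', binary=True, limit=100000)
print(kv.most_similar('king', topn=3))  # nearest neighbours by cosine similarity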


github totalgood / nlpia / src / nlpia / book / examples / ch06_nessvectors.py View on Github external
import os

from collections import OrderedDict

import pandas as pd
from nlpia.data.loaders import get_data, BIGDATA_PATH
from gensim.models import KeyedVectors


word_vectors = get_data('word2vec')  # not in book

wordvector_path = os.path.join(BIGDATA_PATH, 'GoogleNews-vectors-negative300.bin.gz')    # not in book, reader required to compose this path

if 'word_vectors' not in globals():  # not in book
    # (the original assigned word_vectors twice, immediately overwriting the first value)
    WV = word_vectors = KeyedVectors.load_word2vec_format(wordvector_path, binary=True)


###################################################
# Still need to create a class derived from gensim's Word2vec model instead of relying on word_vectors globals

COMPONENT_WORDS = OrderedDict([
    ('placeness', ('geography Geography geographic geographical geographical_location location ' +
                   'locale locations proximity').split()),
    ('peopleness', 'human Humans homo_sapiens peole people individuals humankind people men women'.split()),
    ('animalness', 'animal mammal carnivore animals Animal animal_welfare dog pet cats ani_mal'.split()),
    ('conceptness', 'concept concepts idea'.split()),
    ('femaleness', 'female Female females femal woman girl lady'.split()),
])


def component_vector(words):
    # (truncated on the source page; a minimal sketch of the likely body,
    # assuming it averages the vectors of the in-vocabulary component words)
    vectors = [word_vectors[word] for word in words if word in word_vectors]
    return sum(vectors) / len(vectors)
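
A hypothetical call, assuming word_vectors was loaded as above; since the function body is reconstructed, treat the output as illustrative only:

placeness = component_vector(COMPONENT_WORDS['placeness'])
print(word_vectors.similar_by_vector(placeness, topn=5))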
github lizaku / vec2graph / vec2graph / example_usage.py View on Github external
import json
import zipfile

from gensim import models


def load_embeddings(embeddings_file):
    # Detect the model format by its extension:
    # Binary word2vec format:
    if embeddings_file.endswith('.bin.gz') or embeddings_file.endswith('.bin'):
        emb_model = models.KeyedVectors.load_word2vec_format(embeddings_file, binary=True,
                                                             unicode_errors='replace')
    # Text word2vec format:
    elif embeddings_file.endswith('.txt.gz') or embeddings_file.endswith('.txt') \
            or embeddings_file.endswith('.vec.gz') or embeddings_file.endswith('.vec'):
        emb_model = models.KeyedVectors.load_word2vec_format(
            embeddings_file, binary=False, unicode_errors='replace')
    # ZIP archive from the NLPL vector repository:
    elif embeddings_file.endswith('.zip'):
        with zipfile.ZipFile(embeddings_file, "r") as archive:
            # Loading and showing the metadata of the model:
            metafile = archive.open('meta.json')
            metadata = json.loads(metafile.read())
            for key in metadata:
                print(key, metadata[key])
            print('============')
            # Loading the model itself:
            stream = archive.open("model.bin")  # or model.txt, if you want to look at the model
            emb_model = models.KeyedVectors.load_word2vec_format(
                stream, binary=True, unicode_errors='replace')
    else:
        # Native Gensim format? (snippet truncated here; the natural fallback,
        # shown as an assumption, is gensim's own serialization)
        emb_model = models.KeyedVectors.load(embeddings_file)
    return emb_model
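
A hypothetical call; the file name is a placeholder, and whichever of the extensions handled above it carries selects the matching branch:

emb = load_embeddings('english_skipgram_300.bin.gz')  # binary word2vec branch
print(emb.most_similar('tree', topn=5))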
github shibing624 / dialogbot / dialogbot / reader / data_helper.py View on Github external
import numpy as np
from gensim import models


def dump_word_embeddings(word2id, emb_size, word2vec_path, embeddings_path):
    vocab_size = len(word2id)
    word2vec = models.KeyedVectors.load_word2vec_format(
        word2vec_path, binary=False)
    embeddings = np.random.randn(vocab_size, emb_size)
    for word, idx in word2id.items():
        if word in word2vec:
            embeddings[idx, :] = word2vec[word]
        else:
            embeddings[idx, :] = np.random.randn(emb_size)
    np.save(embeddings_path, embeddings)
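
A sketch of reading the dumped matrix back. Note that np.save appends '.npy' when the target path has no extension, so the reload path must match what was actually written (the names below are assumptions):

embedding_matrix = np.load('embeddings.npy')       # placeholder path
hello_vector = embedding_matrix[word2id['hello']]  # hypothetical lookup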
github emanjavacas / seqmod / seqmod / loaders.py View on Github external
def load_from_model_w2v(self, words, maxwords=None, verbose=False):
        try:
            from gensim.models import KeyedVectors
        except ImportError:
            raise ValueError("No gensim installation found. Please install "
                             "`gensim` to load pretrained w2v embeddings.")
        start = time.time()
        model = KeyedVectors.load_word2vec_format(self.fname, binary=True)
        if verbose:
            print("Loaded model in {:.3f} secs".format(time.time()-start))

        if words is not None:
            vectors, outwords = [], []
            for word in words:
                try:
                    vectors.append(model[word])
                    outwords.append(word)
                except KeyError:
                    pass
        else:
            outwords = list(model.vocab.keys())
            if maxwords is not None:
                outwords = outwords[:min(maxwords, len(model.vocab)-1)]
            vectors = [model[w] for w in outwords]
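
The excerpt stops after building the parallel lists outwords and vectors; downstream code would typically stack them into a single matrix. A minimal sketch of that step (np.stack is standard NumPy; nothing here is from the original file):

import numpy as np

embedding_matrix = np.stack(vectors)  # shape: (len(outwords), model.vector_size)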
github iliaschalkidis / lmtc-eurlex57k / neural_networks / lmtc_networks / label_driven_classification.py View on Github external
def PretrainedEmbedding(self):
        # (method excerpt: Keras' Input, Embedding, Model and backend K, numpy
        # as np, and gensim's KeyedVectors are assumed imported at module level)
        inputs = Input(shape=(None,), dtype='int32')
        embeddings = KeyedVectors.load_word2vec_format(self.word_embedding_path, binary=False)
        word_embeddings_weights = K.cast_to_floatx(np.concatenate((np.zeros((1, embeddings.syn0.shape[-1]), dtype=np.float32), embeddings.syn0), axis=0))
        embeds = Embedding(len(word_embeddings_weights), word_embeddings_weights.shape[-1],
                           weights=[word_embeddings_weights], trainable=False)(inputs)

        return Model(inputs=inputs, outputs=embeds, name='embedding')
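
The zero row prepended to syn0 reserves embedding index 0, typically for a padding token, so real vocabulary ids start at 1. A standalone sketch of the trick, with made-up shapes:

import numpy as np

syn0 = np.random.rand(5, 3).astype(np.float32)  # stand-in for pretrained vectors
weights = np.concatenate((np.zeros((1, 3), dtype=np.float32), syn0), axis=0)
assert (weights[0] == 0).all()                  # id 0 maps to the zero vector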
github deepmipt / DeepPavlov / deeppavlov / models / ranking / emb_dict.py View on Github external
def init_from_scratch(self, tok2int_vocab):
        if self.embeddings == "fasttext":
            self.embeddings_model = FastText.load_fasttext_format(str(self.emb_model_file))
        elif self.embeddings == "word2vec":
            self.embeddings_model = KeyedVectors.load_word2vec_format(str(self.emb_model_file),
                                                                      binary=True)
        log.info("[initializing new `{}`]".format(self.__class__.__name__))
        self.build_int2emb_vocab(tok2int_vocab)
        self.build_emb_matrix(tok2int_vocab)
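
Newer gensim releases (3.8+) deprecate FastText.load_fasttext_format in favour of gensim.models.fasttext.load_facebook_model; a hedged compatibility sketch, with a placeholder path:

try:
    from gensim.models.fasttext import load_facebook_model
    embeddings_model = load_facebook_model('wiki.en.bin')  # placeholder path
except ImportError:  # older gensim without the newer loader
    embeddings_model = FastText.load_fasttext_format('wiki.en.bin')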
github jiangxinyang227 / NLP-Project / language_model / char_rnn / data_helper.py View on Github external
def get_word_vectors(self, vocab):
        """
        Load the pretrained character vectors and build the corresponding
        embedding matrix.
        :param vocab: character vocabulary
        :return:
        """
        word_vectors = (1 / np.sqrt(len(vocab)) * (2 * np.random.rand(len(vocab), self._embedding_size) - 1))
        if os.path.splitext(self._word_vectors_path)[-1] == ".bin":
            word_vec = gensim.models.KeyedVectors.load_word2vec_format(self._word_vectors_path, binary=True)
        else:
            word_vec = gensim.models.KeyedVectors.load_word2vec_format(self._word_vectors_path)

        for i in range(len(vocab)):
            try:
                # KeyedVectors is indexed directly; the original's word_vec.wv
                # only exists on full Word2Vec models
                vector = word_vec[vocab[i]]
                word_vectors[i, :] = vector
            except KeyError:  # narrowed from a bare except
                print(vocab[i] + " not found in the pretrained character vectors")

        return word_vectors
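
The fallback initializer above draws entries uniformly from [-1/sqrt(V), 1/sqrt(V)], where V = len(vocab), so rows without a pretrained vector start small; a quick check of that bound with toy shapes:

import numpy as np

V, d = 1000, 8
w = 1 / np.sqrt(V) * (2 * np.random.rand(V, d) - 1)
assert np.abs(w).max() <= 1 / np.sqrt(V)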
github IBM / Semantic-Search-for-Sustainable-Development / src / conductRIA.py View on Github external
# Here we create instances of our custom paragraph vector model.

# Set values for various parameters
num_features = 4000   # Word vector dimensionality
min_word_count = 30   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 30          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# We use the same parameters to create one instance that uses normalized
# bag-of-words scaling and another that uses tf-idf scaling
par_vec_nbow = CustomParVec(words_by_line, num_workers, num_features, min_word_count, context, downsampling, False)
par_vec_tfidf = CustomParVec(words_by_line, num_workers, num_features, min_word_count, context, downsampling, True)

# We will also experiment with Google's pre-trained word2vec model, which has 300 dimensions.
if USE_GOOGLE_NEWS:
    model_google = models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
    par_vec_google = CustomParVec(words_by_line, num_workers, 300, min_word_count, context, downsampling, True, model_google)

# Let's place our models in a single list
if USE_GOOGLE_NEWS:
    par_vecs = [par_vec_google, par_vec_nbow, par_vec_tfidf]
else:
    par_vecs = [par_vec_nbow, par_vec_tfidf]


# Data for any prior RIA we would like to test
policy_documents_liberia   = ['Liberia Agenda for Transformation.txt', 'Liberia Eco stabilization and recovery plan-april_2015.txt']
policy_documents_bhutan    = ['Eleventh-Five-Year-Plan_Vol-1.txt', '11th-Plan-Vol-2.txt']
policy_documents_namibia   = ['na-nbsap-v2-en.txt', 'Agri Book with cover1.txt', 'execution strategy for industrialisation.txt', 'INDC of Namibia Final pdf.txt', 'Namibia_Financial_Sector_Strategy.txt', 'Tourism Policy.txt', 'namibia_national_health_policy_framework_2010-2020.txt', 'nampower booklet_V4.txt', '826_Ministry of Education Strategic Plan 2012-17.txt', 'Namibia_NDP4_Main_Document.txt']
policy_documents_cambodia  = ['National Strategic Development Plan 2014-2018 EN Final.txt', 'Cambodia_EducationStrategicPlan_2014_2018.txt', 'Cambodia Climate Change Strategic Plan 2014_2023.txt', 'Cambodia Industrial Development Policy 2015_2025.txt', 'Cambodian Gender Strategic Plan - Neary Rattanak 4_Eng.txt', 'Draft_HealthStrategicPlan2016-2020.txt', 'Cambodia_national-disability-strategic-plan-2014-2018.txt', 'National_Policy_on_Green_Growth_2013_EN.txt', 'tourism_development_stategic_plan_2012_2020_english.txt', 'Labour Migration Policy for Cambodia 2015-2018.txt', 'kh-nbsap-v2-en.txt', 'financial-sector-development-strategy-2011-2020.txt', 'National_Social_Protection_Strategy_for_the_Poor_and_Vulnerable_Eng.txt']
policy_documents_mauritius = ['Agro-forestry Strategy 2016-2020.txt', 'VISION_14June2016Vision 2030DraftVersion4.txt', 'Updated Action Plan of the Energy Strategy 2011 -2025.txt', 'National Water Policy 2014.txt', 'National CC Adaptioin Policy Framework report.txt', 'MauritiusEnergy Strategy 2009-2025.txt', 'Mauritius Govertment programme 2015-2019.txt', 'CBD Strategy and Action Plan.txt']
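
The full GoogleNews model holds three million 300-dimensional vectors and takes several gigabytes of RAM to load. If memory is tight, load_word2vec_format's limit parameter keeps only the first (most frequent) N vectors; a sketch reusing the call above:

if USE_GOOGLE_NEWS:
    # keep only the 500,000 most frequent words to reduce memory use
    model_google = models.KeyedVectors.load_word2vec_format(
        'GoogleNews-vectors-negative300.bin', binary=True, limit=500000)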
github BrikerMan / Kashgari / kashgari / embedding / word2vec.py View on Github external
def __init__(self, model: Union[k.Word2VecModels, str], **kwargs):
        self.model = model
        self.model_path = k.get_model_path(model)
        self.keyed_vector: KeyedVectors = KeyedVectors.load_word2vec_format(self.model_path, **kwargs)
        self.embedding_size = self.keyed_vector.vector_size
        logging.debug('------------------------------------------------')
        logging.debug('Loaded gensim word2vec model')
        logging.debug('model        : {}'.format(self.model_path))
        logging.debug('word count   : {}'.format(len(self.keyed_vector.index2entity)))
        logging.debug('Top 50 word  : {}'.format(self.keyed_vector.index2entity[:50]))
        logging.debug('------------------------------------------------')
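
index2entity is the gensim 3.x name for the ordered vocabulary; gensim 4.x renamed it to index_to_key. A hedged compatibility line if this snippet needs to run on both:

vocab = getattr(self.keyed_vector, 'index_to_key', None) or self.keyed_vector.index2entity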
github flairNLP / flair / flair / embeddings.py View on Github external
            # (snippet truncated above: an earlier elif branch of this chain
            # resolves shorthand embedding names to a cached fastText download)
            embeddings = cached_path(
                f"{embeddings_path_v4}{embeddings[:2]}-crawl-fasttext-300d-1M",
                cache_dir=cache_dir,
            )

        elif not Path(embeddings).exists():
            raise ValueError(
                f'The given embeddings "{embeddings}" is not available or is not a valid path.'
            )

        self.name: str = str(embeddings)
        self.static_embeddings = True

        if str(embeddings).endswith(".bin"):
            self.precomputed_word_embeddings = gensim.models.KeyedVectors.load_word2vec_format(
                str(embeddings), binary=True
            )
        else:
            self.precomputed_word_embeddings = gensim.models.KeyedVectors.load(
                str(embeddings)
            )

        self.field = field

        self.__embedding_length: int = self.precomputed_word_embeddings.vector_size
        super().__init__()
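
The branch above separates the word2vec interchange format (.bin files, read with load_word2vec_format) from gensim's own serialization (written by KeyedVectors.save, read by KeyedVectors.load). A round-trip sketch with placeholder paths:

kv = gensim.models.KeyedVectors.load_word2vec_format('vectors.bin', binary=True)
kv.save('vectors.kv')                                 # native gensim format
kv_again = gensim.models.KeyedVectors.load('vectors.kv')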