How to use the gensim.models.Word2Vec.load function in gensim

To help you get started, we’ve selected a few examples showing popular ways gensim.models.Word2Vec.load is used in public projects.

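Before the project snippets below, here is a minimal sketch of the call itself. The path and the query word are placeholders; it assumes a model previously saved with model.save(), which is the on-disk format Word2Vec.load expects (plain word2vec text/binary files are loaded with load_word2vec_format instead).

import gensim

# Placeholder path: a model previously saved with model.save()
model = gensim.models.Word2Vec.load("my_word2vec.model")

# Word vectors live on the KeyedVectors object (model.wv in gensim >= 1.0)
vector = model.wv["example"]                        # raises KeyError if "example" is out of vocabulary
similar = model.wv.most_similar("example", topn=5)  # five nearest neighbours by cosine similarity
print(similar)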

github derekgreene / dynamic-nmf / find-window-topics.py
	random.seed( random_seed )
	log.info("Using random seed %s" % random_seed )

	# Will we use automatic model selection?
	validation_measure = None
	if len(kparts) == 1:
		kmax = kmin
	else:
		kmax = int(kparts[1])
		if kmax < kmin:
			kmax = kmin
		# any word2vec model specified?
		if options.model_path is not None:
			log.info( "Loading Word2Vec model from %s ..." % options.model_path )
			import gensim
			model = gensim.models.Word2Vec.load(options.model_path) 
			validation_measure = unsupervised.coherence.WithinTopicMeasure( unsupervised.coherence.ModelSimilarity(model) )

	# NMF implementation
	impl = unsupervised.nmf.SklNMF( max_iters = options.maxiter, init_strategy = "nndsvd" )

	# Process each specified time window document-term matrix
	selected_ks = []
	for matrix_filepath in args:
		# Load the cached corpus
		window_name = os.path.splitext( os.path.split( matrix_filepath )[-1] )[0]
		log.info( "- Processing time window matrix for '%s' from %s ..." % (window_name,matrix_filepath) )
		(X,terms,doc_ids) = text.util.load_corpus( matrix_filepath )
		log.info( "Read %dx%d document-term matrix" % ( X.shape[0], X.shape[1] ) )

		# Ensure that the values of kmin and kmax are not greater than the number of documents
		num_docs = len(doc_ids)
github h404bi / wende / wende / classification / features / question2vec.py
# -*- coding:utf-8 -*-
from __future__ import unicode_literals
import logging
import gensim
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import VectorizerMixin
from wende.classification.nlp import tokenize
from wende.config import WORD2VEC_MODEL_DIR, WORD2VEC_MODEL_SIZE

logging.info("loading word2vec model...")
w2v_model = gensim.models.Word2Vec.load(WORD2VEC_MODEL_DIR)


def gen_doc_vec(words, num_features):
    # remove unseen terms
    words = filter(lambda x: x in w2v_model, words)

    doc_vec = np.zeros(num_features, dtype="float32")
    word_count = 0

    for word in words:
        word_count += 1
        doc_vec += w2v_model[word]

    word_count = 1 if word_count == 0 else word_count
    doc_vec /= word_count
    return doc_vec
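A quick usage sketch for gen_doc_vec; the sample question is a placeholder, and it assumes WORD2VEC_MODEL_SIZE matches the dimensionality of the loaded model:

# tokenize and WORD2VEC_MODEL_SIZE are the imports from the snippet above
doc_vec = gen_doc_vec(tokenize("how tall is the eiffel tower"), WORD2VEC_MODEL_SIZE)
print(doc_vec.shape)  # -> (WORD2VEC_MODEL_SIZE,)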
github bluemonk482 / tdparse / src / tdparse.py
    def __init__(self, w2vf='../resources/wordemb/w2v/c10_w3_s100',
                 sswef='../resources/wordemb/sswe'):
        self.w2v = gensim.models.Word2Vec.load(w2vf)
        self.sswe = readTang(sswef)
        self.lexicons = lexicon()
github EdCo95 / scientific-paper-summarisation / DataTools / useful_functions.py
def load_word2vec():
    """
    Loads the word2vec model used in this work.
    :return: a word2vec model.
    """
    return Word2Vec.load(MODEL_SOURCE)
github alexeyev / abae-pytorch / reader.py
def get_w2v(path):
    """
        Read a word2vec model from the given path.
    """
    return gensim.models.Word2Vec.load(path)
github lgalke / vec4ir / ir_eval.py
def smart_load_embedding(model_path, doc2vec=False):
    print("Smart loading", model_path)
    if model_path is None:
        return None
    _, ext = os.path.splitext(model_path)
    if doc2vec:
        print("Loading Doc2Vec model:", model_path)
        model = Doc2Vec.load(model_path)
    elif ext == ".gnsm":  # Native format
        print("Loading embeddings in native gensim format: {}"
              .format(model_path))
        model = Word2Vec.load(model_path)
    else:  # either word2vec text or word2vec binary format
        binary = ".bin" in model_path
        print("Loading embeddings in word2vec format: {}".format(model_path))
        model = Word2Vec.load_word2vec_format(model_path, binary=binary)
    return model
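Note that Word2Vec.load_word2vec_format was removed in gensim 4.x; the text/binary branch now goes through KeyedVectors. A hedged sketch of the same extension-based dispatch against the current API (the function name below is made up for illustration):

import os
from gensim.models import Word2Vec, KeyedVectors

def smart_load_embedding_v4(model_path):
    # Hypothetical gensim >= 4.0 equivalent of the branching above,
    # returning KeyedVectors in both branches for a uniform interface.
    if model_path is None:
        return None
    _, ext = os.path.splitext(model_path)
    if ext == ".gnsm":  # native gensim format saved with model.save()
        return Word2Vec.load(model_path).wv
    # word2vec text or binary format
    return KeyedVectors.load_word2vec_format(model_path, binary=".bin" in model_path)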
github jamesmullenbach / caml-mimic / cnn-medical-text / dataproc / extract_wvs.py
def main(vocab_size, Y, data, vocab_min):
    wv_file = os.path.join(DATA_DIR, "raw.w2v") if data == "raw" else os.path.join(DATA_DIR, "processed_%d.w2v" % (Y))
    model = gensim.models.Word2Vec.load(wv_file)
    wv = model.wv
    #free up memory
    del model

    v_dict, _ = load_lookups(vocab_size, Y, vocab_min)

    #go through vocab in order
    #find vocab word in wv.index2word, then call wv.word_vec(wv.index2word[i])
    #put results into one big matrix
    W, words = build_matrix(v_dict, wv)

    #smash that save button
    outfile = os.path.join(DATA_DIR, "raw.embed") if data == "raw" else os.path.join(DATA_DIR, "processed_%d.embed" % (Y))
    save_embeddings(W, words, outfile)
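build_matrix is defined elsewhere in extract_wvs.py and is not shown in the excerpt. A minimal sketch of what the comments describe, assuming v_dict maps integer indices to vocabulary words; the body is an assumption, not the project's actual implementation:

import numpy as np

def build_matrix(v_dict, wv):
    # Assumed sketch: one embedding row per vocabulary word, in index order,
    # falling back to zeros for words the word2vec model never saw.
    dim = wv.vector_size
    words, rows = [], []
    for i in sorted(v_dict.keys()):
        word = v_dict[i]
        rows.append(wv.word_vec(word) if word in wv.vocab else np.zeros(dim, dtype="float32"))
        words.append(word)
    return np.vstack(rows), words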
github ArtistScript / FastTextRank / FastTextRank / FastTextRank4Sentence.py
        :param use_stopword: whether to filter out stop words
        :param stop_words_file: path to the stop words file
        :param use_w2v: whether to use word vectors to compute sentence similarity
        :param dict_path: path to the word vector (word2vec) model file
        :param max_iter: maximum number of iterations
        :param tol: convergence tolerance
        """
        if not use_w2v and dict_path is not None:
            raise RuntimeError("use_w2v must be set to True before word vectors can be used")
        self.__use_stopword = use_stopword
        self.__use_w2v = use_w2v
        self.__dict_path = dict_path
        self.__max_iter = max_iter
        self.__tol = tol
        if self.__use_w2v:
            self.__word2vec = Word2Vec.load(self.__dict_path)
        self.__stop_words = set()
        self.__stop_words_file = self.get_default_stop_words_file()
        if type(stop_words_file) is str:
            self.__stop_words_file = stop_words_file
        if use_stopword:
            for word in codecs.open(self.__stop_words_file, 'r', 'utf-8', 'ignore'):
                self.__stop_words.add(word.strip())
        np.seterr(all='warn')  # Print a RuntimeWarning for all types of floating-point errors

github dheeraj7596 / SCDV / 20news / SCDV.py
    model_type = sys.argv[3]

    model_name = str(num_features) + "features_" + str(min_word_count) + "minwords_" + str(
        context) + "context_len2alldata"

    # Load train data.
    train = pd.read_csv('data/train_v2.tsv', header=0, delimiter="\t")
    # Load test data.
    test = pd.read_csv('data/test_v2.tsv', header=0, delimiter="\t")
    all = pd.read_csv('data/all_v2.tsv', header=0, delimiter="\t")

    assert model_type in ["word2vec", "fasttext"]

    if model_type == "word2vec":
        # Load the trained Word2Vec model.
        model = Word2Vec.load(model_name)
        # Get wordvectors for all words in vocabulary.
        word_vectors = model.wv.vectors
        index2word = model.wv.index2word
    elif model_type == "fasttext":
        # Load the trained FastText model.
        model = FastText.load(model_name)
        # Get wordvectors for all words in vocabulary.
        word_vectors = model.wv.vectors
        index2word = model.wv.index2word

    # Set number of clusters.
    num_clusters = int(sys.argv[2])
    # Uncomment the line below to create new clusters.
    idx, idx_proba = cluster_GMM(num_clusters, word_vectors)

    # Uncomment the lines below to load saved cluster assignments and probabilities of cluster assignments.
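cluster_GMM is defined elsewhere in SCDV.py; a minimal sketch of what such a helper typically does with scikit-learn's GaussianMixture, returning both hard and soft cluster assignments (the covariance type and iteration count below are assumptions):

from sklearn.mixture import GaussianMixture

def cluster_GMM(num_clusters, word_vectors):
    # Assumed sketch: fit a GMM over the word vectors, then return hard cluster
    # labels and per-cluster membership probabilities for every word vector.
    gmm = GaussianMixture(n_components=num_clusters, covariance_type="tied", max_iter=50)
    gmm.fit(word_vectors)
    idx = gmm.predict(word_vectors)                # hard assignments
    idx_proba = gmm.predict_proba(word_vectors)    # soft assignments
    return idx, idx_proba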
github jayantj / w2vec-similarity / scripts / model2json.py
#Author: Phi Van Thuy
#Purpose: Convert word2vec models to JSON database by cosine distance metric

from gensim.models import Word2Vec

#Trained model
# model_path = "/cl/work/thuy-ph/word2vec/GoogleNews-vectors-negative300.bin"

print "Loading model..."
model = Word2Vec.load('emma-model')
# model = word2vec.Word2Vec.load_word2vec_format(model_path, binary=True)  # C binary format
print "Loading model: Done"

#Name of output file
f = open('en_data_cosine_skipgram_original.json','w')

f.write("{\n")

number_words = len(model.vocab)
#number_words = 10000
for i in range(0, number_words):
  stringA = model.vocab.items()[i][0]
  f.write("\n\"" + stringA.encode("utf-8") + "\":[\n")

  nearest_words = model.most_similar(positive=[stringA], negative=[],  topn=20)
  number_nearest_words = len(nearest_words)