How to use the gensim.models.word2vec.Word2Vec.load function in gensim

To help you get started, we’ve selected a few Word2Vec.load examples, based on popular ways the function is used in public projects.

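At its simplest, Word2Vec.load reads a model that was previously written with Word2Vec.save and returns a ready-to-use Word2Vec instance. A minimal sketch (the path "word2vec.model" is a placeholder, and the .wv lookups assume gensim 4.x; several of the projects below use the older 3.x attribute names):

from gensim.models import Word2Vec

model = Word2Vec.load("word2vec.model")            # file created earlier with model.save(...)

# Since gensim 4.0 the vectors live on the KeyedVectors object at model.wv
vector = model.wv["computer"]                      # raises KeyError if the word is out of vocabulary
print(model.wv.most_similar("computer", topn=5))   # nearest neighbours by cosine similarity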

github NLPOR / SMP2018-task1 / svm.py View on Github external
def get_word2vec(content):
    word2vec = Word2Vec.load('predictor/model/wiki.zh.seg_200d.model')
    res = np.zeros([200])
    count = 0
    # word_list = content.split()
    for word in content:
        if word in word2vec:
            res += word2vec[word]
            count += 1
    return pd.Series(res / count)
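This snippet targets gensim 3.x, where the model object itself can be queried like a set of vectors ("word in word2vec", "word2vec[word]"); np and pd are numpy and pandas. On gensim 4.x the equivalent lookups go through the keyed vectors, roughly:

    if word in word2vec.wv:
        res += word2vec.wv[word]
        count += 1

Also note that if no word is found in the vocabulary, count stays 0 and res / count produces NaNs, so a guard on count is worth adding.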
github MarkWuNLP / KEHNN / PreProcess.py View on Github external
def createtopicvec(word2vec_path):
    max_topicword = 20
    model = Word2Vec.load(word2vec_path)
    topicmatrix = np.zeros(shape=(100,max_topicword,100),dtype=theano.config.floatX)
    file = open(r"\\msra-sandvm-001\v-wuyu\Data\SemEvalCQA"
                r"\semeval2015-task3-english-data\pre-process\stemming_preservestop_cate\catedic.txt")
    i = 0
    miss = 0
    for line in file:
        tmp = line.strip().split(' ')
        for j in range(min(len(tmp),max_topicword)):
            if gensim.utils.to_unicode(tmp[j]) in model.vocab:
                topicmatrix[i,j,:] = model[gensim.utils.to_unicode(tmp[j])]
            else:
                miss = miss+1
        i= i+1
    print "miss word2vec", miss
    return topicmatrix
github juliakreutzer / quetch / src / QUETCH.py View on Github external
t2 = WMT15QETask2(languagePair2, "../WMT15-data/task2_"+languagePair+"_dev_comb", "../WMT15-data/task2_"+languagePair+"_train_comb", targetWindowSize=targetWindowSize, sourceWindowSize=sourceWindowSize, featureIndices=featureIndices, alignments=s2tAlignments, badWeight=badweight, lowercase=lowerCase, full=full)

		
		
		contextSize = t2.contextSize
		print "... context size", contextSize

		#print t2.wordDictionary
		vocabularySize = len(t2.wordDictionary)

		#load pretrained gensim word2vec model
		params = None
		if pretrainedModel is not None:
			print "... Loading pretrained model from file", pretrainedModel
			try:
				model = gensim.models.word2vec.Word2Vec.load(pretrainedModel)
				#print model["computer"]
				lc = False
				if ".lc." in pretrainedModel:
					lc = True
					print "... lowercasing"
					#construct initial lookup table from pretrained model
				params = constructLT(model,t2.wordDictionary,d_wrd,lc)
			except AttributeError: #full model, not only LT pretrained
				params = loadParams(pretrainedModel)



		#"translate" language pair notation for task 2
		languagePair2 = languagePair.upper().replace("-","_")

		#get instance vectors and binary labels for training
github OFAI / million-post-corpus / experiments / src / evaluate_lstm.py View on Github external
def evaluate(cat, fold, txt_train, txt_test, y_train, y_test):
    pool = multiprocessing.Pool()
    wordlists_train = pool.map(preprocess, txt_train)
    wordlists_test = pool.map(preprocess, txt_test)
    pool.close()
    pool.join()

    emb = Word2Vec.load(os.path.join(conf.W2V_DIR, 'model'))
    # add point at origin for unknown words
    emb.wv.syn0 = numpy.vstack((emb.wv.syn0,
        numpy.zeros(emb.wv.syn0.shape[1], dtype=numpy.float32)))

    # train data: replace words with embedding IDs, zero-padding and truncation
    X = numpy.zeros((len(y_train), conf.LSTM_MAXPOSTLEN), dtype=numpy.int32)
    X_lengths = numpy.zeros((len(y_train)))
    for i, words in enumerate(wordlists_train):
        X_lengths[i] = len(words)
        for j, w in enumerate(words):
            if j >= conf.LSTM_MAXPOSTLEN:
                break
            if w in emb:
                X[i,j] = emb.vocab[w].index
            else:
                X[i,j] = len(emb.vocab)
github gkeglevich / word-embedding-visualizer / word_to_vec.py View on Github external
print("We're gonna train the model now...")
    vec_model.build_vocab(sentences)

    # Pass in all of the necessary training variables
    vec_model.train(
        sentences,
        total_examples = vec_model.corpus_count,
        epochs = vec_model.iter
    )

    if not os.path.exists("trained"):
        os.makedirs("trained")

    vec_model.save(os.path.join("trained", "trained_model.w2v"))

    vec_model = w2v.Word2Vec.load(os.path.join("trained", "trained_model.w2v"))
    print("We're just gonna compress the dimensions... hang tight!")
    
    # Compress the words into a 2d Vector Space using t-distributed stochastic neighbour embedding
    tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)

    all_word_vectors_matrix = vec_model.wv.syn0

    all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)

    points = pd.DataFrame(
        [
            (word, coords[0], coords[1])
            for word, coords in [
                (word, all_word_vectors_matrix_2d[vec_model.wv.vocab[word].index])
                for word in vec_model.wv.vocab
            ]
github deepmipt / DeepPavlov / deeppavlov / models / vectorizers / word2vec_vectorizer.py View on Github external
    def load(self) -> None:
        """Load Word2vec model"""
        logger.info("Loading word2vec model from {}".format(self.load_path))
        self.model = Word2Vec.load(self.load_path)
github sheffieldnlp / stance-semeval2016 / word2vec_integration.py View on Github external
def applyWord2VecModel(modelname):
    model = word2vec.Word2Vec.load(modelname)
    for key in KEYWORDS_LONG['trump']:
        print("\n", key)
        for res in model.most_similar(key, topn=60):
            print(res)
github palmagro / gg2vec / node2vec.py View on Github external
        else:
            sents = self.sentences_array
        self.path = self.path +str(it)+".npy"
        print "Learning:" + self.path
        print "CCCC!"
        if not os.path.exists(self.path):
            print "Entra"
            entrada = []
            results = Parallel(n_jobs=num_cores, backend="threading")(delayed(generate_sample)(self.mode,sents,self.degree,self.w_size,i) for i in range(1,self.ns))
            for r in results:
                entrada.append(r) 
            self.w2v = word2vec.Word2Vec(entrada, size=self.ndim, window=self.w_size, min_count=1, workers=num_cores,sg=0) 
            self.w2v.save(self.path)
            print "TERMINO"   
        else:
            self.w2v = word2vec.Word2Vec.load(self.path)  
        self.get_nodes()
        self.get_rels([])
        self.delete_props()
github idio / wiki2vec / resources / gensim / convert_model.py View on Github external
def convert_model(prefix):
    ln.info("loading model")
    w2v = Word2Vec.load(prefix)

    ln.info("saving dict...")
    dict_file = prefix + ".wordids.txt"
    with open(dict_file, "w") as f:
        for word, voc_obj in w2v.vocab.items():
            f.write((u"%s\t%s\n" % (word, voc_obj.index)).encode("UTF-8"))

    ln.info("saving weights as csv...")
    weights_file = prefix+".syn0.csv"
    np.savetxt(weights_file, w2v.syn0, delimiter=",", header="%s\n%s" % w2v.syn0.shape)

    ln.info("all done. Saved converted model files: %s and %s." % (weights_file, dict_file))
github superhy / graph-mind / src / word_seg / word2vec / wordVecOpt.py View on Github external
    def loadModelfromFile(self, modelFilePath):
        '''
        load a model that already exists on disk
        can continue training with the loaded model (needs more testing)
        '''
        return Word2Vec.load(modelFilePath)
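As the docstring notes, a loaded model can keep training on new sentences. A minimal sketch of that pattern, assuming gensim 4.x and placeholder names for the path and corpus:

from gensim.models import Word2Vec

model = Word2Vec.load("wordvec.model")                 # placeholder path

more_sentences = [["new", "tokenized", "sentence"]]    # placeholder corpus
model.build_vocab(more_sentences, update=True)         # register any unseen words first
model.train(more_sentences,
            total_examples=model.corpus_count,
            epochs=model.epochs)
model.save("wordvec.model")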