How to use the gensim.models.word2vec module in gensim

To help you get started, we’ve selected a few gensim examples based on popular ways the module is used in public projects.

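Before diving into the project snippets, here is a minimal, self-contained sketch of the typical Word2Vec workflow (train, save, reload, query). The toy sentences and file name are placeholders, and the keyword arguments follow gensim 4.x (older releases use size instead of vector_size):

from gensim.models import word2vec

# Tiny toy corpus: a list of pre-tokenized sentences
sentences = [["human", "interface", "computer"],
             ["survey", "user", "computer", "system", "response", "time"],
             ["user", "interface", "system"]]

# Train a small model (min_count=1 only because the corpus is tiny)
model = word2vec.Word2Vec(sentences, vector_size=50, window=5,
                          min_count=1, workers=2)

model.save("toy_w2v.model")                      # persist the whole model
model = word2vec.Word2Vec.load("toy_w2v.model")  # reload it later

# Similarity queries live on the KeyedVectors at model.wv
print(model.wv.most_similar("computer", topn=3))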

github XXXalice / syamu_bot / engine / bot_include_emotion.py
from gensim.models import word2vec
import random

def load_w2v(word):
    # model_file is defined at module level elsewhere in the project
    model = word2vec.Word2Vec.load('../src/' + model_file)
    try:
        # Return a random choice among the words most similar to `word`
        # (in gensim >= 4.0 this lives on model.wv.most_similar)
        similar_words = model.most_similar(positive=[word])
        return random.choice([w[0] for w in similar_words])
    except KeyError:
        # Out-of-vocabulary input: fall back to the word itself
        return word
github uhh-lt / sensegram / experiment / wsd.py
def __init__(self, path_to_sense_model, path_to_context_model, window=10, method="sep", filter_ctx=False):
    # Word2Vec.load_word2vec_format was removed in gensim 4.0; the modern
    # equivalent is shown in the KeyedVectors sketch below
    self.vs = word2vec.Word2Vec.load_word2vec_format(path_to_sense_model, binary=True)
    self.vc = word2vec.Word2Vec.load_word2vec_format(path_to_context_model, binary=True)
    self.window = window
    self.ctx_method = method
    self.filter_ctx = filter_ctx

    print("Disambiguation method: " + self.ctx_method)
    print("Filter context: %s" % self.filter_ctx)
github sheffieldnlp / stance-semeval2016 / word2vec_eval.py
def extractFeaturesW2V(w2vmodel="skip_nostop_multi_300features_10minwords_10context", phrasemodel="phrase.model", useDev=False):

    if not useDev:
        # Train on the official training set, evaluate on the dev set
        tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2)
    else:
        # Fold the dev set into training and evaluate on the test set
        tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_origdev, targets_origdev, labels_origdev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2)
        tweets_train.extend(tweets_origdev)
        targets_train.extend(targets_origdev)
        labels_train.extend(labels_origdev)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILETEST, 'windows-1252', 2)

    # Replace the path arguments with the loaded phrase and word2vec models
    phmodel = Phrases.load(phrasemodel)
    w2vmodel = word2vec.Word2Vec.load(w2vmodel)
    features_train_w2v = extractW2VAggrFeatures(w2vmodel, phmodel, tweets_train, targets_train, labels_train)
    features_dev_w2v = extractW2VAggrFeatures(w2vmodel, phmodel, tweets_dev, targets_dev, labels_dev)

    return features_train_w2v, labels_train, features_dev_w2v, labels_dev
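The phrase.model loaded above is a gensim Phrases (collocation) model, which rewrites token lists so that frequent word pairs become single tokens. A minimal sketch of how such a model might be built and saved; the corpus, counts, and threshold here are placeholder assumptions:

from gensim.models.phrases import Phrases

# Placeholder corpus of tokenized tweets
tokenized_tweets = [["stance", "detection", "on", "social", "media"],
                    ["social", "media", "analysis", "of", "stance"]] * 50

# Learn bigram collocations from the corpus
phrases = Phrases(tokenized_tweets, min_count=5, threshold=1.0)
phrases.save("phrase.model")

# Pairs whose collocation score beats the threshold come back joined with "_"
print(phrases[["stance", "detection", "on", "social", "media"]])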
github jarvisqi / nlp_learning / gensim / gensim_nlpir.py
def mode_training():
    """
    Train the word2vec model.
    """
    # To read every file under a directory instead, use something like:
    # sentences = MySentences('/some/directory')
    # Pre-segmented (tokenized) text, one sentence per line
    sentences = word2vec.Text8Corpus('data/xuezhong_seg_1.txt')
    # size sets the dimensionality of the word vectors; 4000 is unusually
    # large (100-300 is typical).
    # workers sets the number of training threads (only effective when
    # Cython is installed).
    model = word2vec.Word2Vec(
        sentences, min_count=20, size=4000, window=10, workers=4)

    # model.sort_vocab()

    # Similarity / relatedness of two words:
    # simil_1 = model.wv.similarity(u"王仙芝", u"老怪物")
    # simil_2 = model.wv.similarity(u"徐凤年", u"殿下")
    # print("Similarity of 王仙芝 and 老怪物:", simil_1)
    # print("Similarity of 徐凤年 and 殿下:", simil_2)

    # Most related words for a given word:
    # lar = model.wv.most_similar(u"徐凤年", topn=20)  # top 20 most similar
github yongzhuo / nlg-yongzhuo / nlg_yongzhuo / text_summarization / extractive_sum / nous_base / keyword / keyword_word2vec.py
def predict_input():
    from collections import Counter
    import pandas as pd
    import numpy as np
    import gensim
    import jieba

    model = gensim.models.word2vec.Word2Vec.load('w2v_model_wiki_word')

    def keywords(s):
        """
        Adapted from 苏剑林. (2017, Apr 07). 《【不可思议的Word2Vec】 3.提取关键词 》[Blog post]. Retrieved from https://www.spaces.ac.cn/archives/4316
        :param s: a tokenized sentence (list of words)
        :return: the words of s ranked by how strongly they predict the rest of the sentence
        """
        def predict_proba(oword, iword):
            # log p(oword | iword) under hierarchical softmax: walk oword's
            # Huffman path (oword.point / oword.code) through the output
            # weights stored in model.syn1
            iword_vec = model[iword]
            oword = model.wv.vocab[oword]
            oword_l = model.syn1[oword.point].T
            dot = np.dot(iword_vec, oword_l)
            lprob = -sum(np.logaddexp(0, -dot) + oword.code * dot)
            return lprob

        # Keep only in-vocabulary words
        s = [w for w in s if w in model]
github sheffieldnlp / stance-conditional / stancedetection / conditional.py
    :param postprocess: force against/favor for tweets which contain the target
    :param shortenTargets: shorten the target text, see preprocess.transform_targets()
    :param useAutoTrump: use automatically annotated Trump tweets, experimental, not helping at the moment
    :param useClinton: add the Hillary Clinton dev data to the training data
    :param testSetting: evaluate on Trump
    """

    if word2vecmodel == "small":
        w2vmodel = word2vec.Word2Vec.load("../out/skip_nostop_single_100features_5minwords_5context")
    else:
        w2vmodel = word2vec.Word2Vec.load("../out/skip_nostop_single_100features_5minwords_5context_big")

    if usePhrases:
        # Switch to the phrase-aware (multi-word) embeddings
        phrasemodel = Phrases.load("../out/phrase_all.model")
        w2vmodel = word2vec.Word2Vec.load("../out/skip_nostop_multi_100features_5minwords_5context")

    if testSetting == "true":
        trainingdata = "../data/semeval2016-task6-train+dev.txt"
        testdata = "../data/SemEval2016-Task6-subtaskB-testdata-gold.txt"
    elif testSetting == "weaklySup":
        trainingdata = "../data/trump_autolabelled.txt"
        testdata = "../data/SemEval2016-Task6-subtaskB-testdata-gold.txt"
        enc = "utf-8"
    else:
        trainingdata = "../data/semeval2016-task6-trainingdata_new.txt"
        testdata = "../data/semEval2016-task6-trialdata_new.txt"
    if not useClinton:
        trainingdata = "../data/semeval2016-task6-trainingdata_new.txt"

    # enc is set in the truncated part of this function; only the
    # weaklySup branch rebinds it here
    tweets, targets, labels, ids = reader.readTweetsOfficial(trainingdata, encoding=enc)
github voicy-ai / DialogStateTracking / src / hcn / modules / embed.py
def create_model(self, fname='text8'):
    # Stream sentences from the corpus file named by fname
    sentences = word2vec.Text8Corpus('data/' + fname)
    model = word2vec.Word2Vec(sentences, size=self.dim)
    model.save('data/text8.model')
    print(':: model saved to data/text8.model')
github JasonKessler / scattertext / demo_gensim_similarity.py
def main():
    nlp = spacy.load('en')
    # nlp = whitespace_nlp_with_sentences
    convention_df = SampleCorpora.ConventionData2012.get_data()
    convention_df['parsed'] = convention_df.text.apply(nlp)
    corpus = (CorpusFromParsedDocuments(convention_df,
                                        category_col='party',
                                        parsed_col='parsed')
              .build()
              .get_unigram_corpus())
    # Skip-gram (sg=1) with hierarchical softmax (hs=1, negative=0).
    # No sentences are passed, so the model starts untrained; it is
    # presumably fitted later in the truncated portion of the script.
    model = word2vec.Word2Vec(size=100,
                              alpha=0.025,
                              window=5,
                              min_count=5,
                              max_vocab_size=None,
                              sample=0,
                              seed=1,
                              workers=1,
                              min_alpha=0.0001,
                              sg=1,
                              hs=1,
                              negative=0,
                              cbow_mean=0,
                              iter=10,
                              null_word=0,
                              trim_rule=None,
                              sorted_vocab=1)
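Under gensim 4.0 and later, several of the constructor arguments above were renamed, most notably size to vector_size and iter to epochs. A sketch of the same configuration against the newer API (assuming gensim >= 4.0):

from gensim.models import word2vec

# Same configuration under gensim >= 4.0 (renamed keyword arguments)
model = word2vec.Word2Vec(vector_size=100,   # was: size
                          alpha=0.025,
                          window=5,
                          min_count=5,
                          max_vocab_size=None,
                          sample=0,
                          seed=1,
                          workers=1,
                          min_alpha=0.0001,
                          sg=1,
                          hs=1,
                          negative=0,
                          cbow_mean=0,
                          epochs=10,         # was: iter
                          null_word=0,
                          trim_rule=None,
                          sorted_vocab=1)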
github hugochan / KATE / autoencoder / baseline / word2vec.py
def train(self, corpus):
    # Build the model without data, then grow the vocabulary and train in
    # separate steps (corpus is a callable returning a fresh iterator)
    self.model = word2vec.Word2Vec(size=self.dim, min_count=self.min_count,
        window=self.window, workers=multiprocessing.cpu_count(),
        sg=self.sg, hs=self.hs, negative=self.negative, iter=self.epoches)
    self.model.build_vocab(corpus())
    self.model.train(corpus())

    return self
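One caveat about the explicit build_vocab()/train() split above: since gensim 1.0, train() must be told how much data and how many passes to expect. A minimal self-contained sketch under the newer API, with placeholder toy sentences:

from gensim.models import word2vec

sentences = [["deep", "autoencoders", "compress", "text"],
             ["word", "vectors", "capture", "similarity"]]

model = word2vec.Word2Vec(vector_size=50, min_count=1, sg=1, hs=1, negative=0)
model.build_vocab(sentences)          # scans the corpus and sets corpus_count
model.train(sentences,
            total_examples=model.corpus_count,
            epochs=model.epochs)      # epochs replaces the old iter attribute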
github monikkinom / ner-lstm / embeddings / getWordEmbeddings.py
from gensim.models import word2vec
from RandomVec import RandomVec
import numpy as np
import random
import sys
import pickle as pkl

WORD_DIM = 300
# load_word2vec_format on Word2Vec is gone in gensim >= 4.0; use
# gensim.models.KeyedVectors.load_word2vec_format there instead
model = word2vec.Word2Vec.load_word2vec_format('../pickles/GoogleNews-vectors-negative300.bin', binary=True)
rvec = RandomVec(WORD_DIM)

def findMaxLength(FILE_NAME):
    temp = 0
    max_length = 0

    # Sentences are separated by blank lines; track the longest one
    for line in open(FILE_NAME):
        if line in ['\n', '\r\n']:
            if temp > max_length:
                max_length = temp
            temp = 0
        else:
            temp += 1

    return max_length