How to use the gensim.models.word2vec.Word2Vec class in gensim

To help you get started, we've selected a few gensim examples based on popular ways Word2Vec is used in public projects.

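Before the project examples, here is a minimal, self-contained sketch of the basic workflow against the gensim 4.x API (where the older size and iter parameters are named vector_size and epochs); the toy corpus is invented for illustration:

from gensim.models import Word2Vec

# toy corpus: one pre-tokenized sentence per list entry
corpus = [
    ["the", "quick", "brown", "fox"],
    ["the", "lazy", "dog"],
    ["a", "quick", "brown", "dog"],
]

# sg=1 selects skip-gram, sg=0 (the default) CBOW; min_count=1 keeps every toy token
model = Word2Vec(corpus, vector_size=50, window=5, min_count=1, sg=1, epochs=10)

# trained vectors live on model.wv (a KeyedVectors instance)
print(model.wv["fox"].shape)             # (50,)
print(model.wv.most_similar("quick"))    # nearest neighbours in the toy vector space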

github btaille / semantic-question-matching / starting_kit.py
###################
# Word Embeddings # 
###################

import gensim
import matplotlib.pyplot as plt
from tqdm import tqdm   # progress bar for the WMD loop below
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
my_path = 'C:\\Users\\ASUS\\Documents\\Telecom\\PRIM\\code\\w2v\\'

# quora corpus (df and the clean_sent() text-cleaning helper are defined earlier in the script)
corpus = [clean_sent(q).split() for q in df['question1']]
for q in df['question2']:
    corpus.append(clean_sent(q).split())

# initialize W2V model (pre-4.0 gensim API: `size` became `vector_size` in gensim 4)
my_model = gensim.models.word2vec.Word2Vec(size=300, min_count=2, sg=1)
my_model.build_vocab(corpus)

# update with GloVe: overwrite vectors of in-vocabulary words with pre-trained
# GloVe vectors; words missing from the GloVe file keep their random init
my_model.intersect_word2vec_format(my_path + "glove.6B.300d.txt", binary=False)

# fine-tune on the quora corpus (`iter` is the pre-4.0 name for `epochs`)
my_model.train(corpus, total_examples=my_model.corpus_count, epochs=my_model.iter)

# trim memory: replace raw vectors with L2-normalised ones (no further training possible afterwards)
my_model.init_sims(replace=True)

# Word Mover's Distance between known-duplicate question pairs
# (df_true_duplicate is a filtered view of df built earlier in the script)
wmd_true = []
for q1, q2 in tqdm(zip(df_true_duplicate['question1'], df_true_duplicate['question2'])):
    clean_q1 = clean_sent(q1).split()
    clean_q2 = clean_sent(q2).split()
    # completion of the truncated snippet: accumulate the distance for each pair
    # (wmdistance requires the pyemd package on gensim <= 3.x)
    wmd_true.append(my_model.wv.wmdistance(clean_q1, clean_q2))
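The snippet above targets pre-4.0 gensim. Below is a minimal sketch of the same pipeline under the gensim 4.x API, assuming the same corpus, my_path, and cleaned questions as above; note that size became vector_size, iter became epochs, and vector-level operations such as wmdistance moved to model.wv (verify where your gensim version exposes intersect_word2vec_format):

my_model = gensim.models.word2vec.Word2Vec(vector_size=300, min_count=2, sg=1)
my_model.build_vocab(corpus)
# gensim 4.x exposes intersect_word2vec_format on the KeyedVectors object
my_model.wv.intersect_word2vec_format(my_path + "glove.6B.300d.txt", binary=False)
my_model.train(corpus, total_examples=my_model.corpus_count, epochs=my_model.epochs)
dist = my_model.wv.wmdistance(clean_q1, clean_q2)   # Word Mover's Distance now lives on model.wv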
github superhy / graph-mind / src / word_seg / word2vec / wordVecOpt.py
        else:
            model = None
            if fileType == u'opened':
                print('training model from singleFile!')
                model = Word2Vec(LineSentence(corpusFilePath), size=self._size, window=self._window, min_count=self._minCount, workers=self._workers)
            elif fileType == u'file':
                corpusFile = open(corpusFilePath, u'r')
                print('training model from singleFile!')
                model = Word2Vec(LineSentence(corpusFile), size=self._size, window=self._window, min_count=self._minCount, workers=self._workers)
                corpusFile.close()
            elif fileType == u'directory':
                corpusFiles = localFileOptUnit.listAllFilePathInDirectory(corpusFilePath)
                print('training model from listFiles of directory!')
                
                sentences = localFileOptUnit.loadSetencesFromFiles(corpusFiles)
                model = Word2Vec(sentences, size=self._size, window=self._window, min_count=self._minCount, workers=self._workers)
            elif fileType == u'other':
                # TODO add sentences list directly
                pass
                
            model.save(self.modelPath)
            model.init_sims()
            print('producing word2vec model ... ok!')
            return model
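All three branches above rely on the fact that gensim's LineSentence wraps either a filesystem path or an already-open file object, yielding one whitespace-tokenised sentence per line. A minimal sketch ('corpus.txt' is a placeholder path; vector_size is the gensim 4.x spelling of size):

from gensim.models.word2vec import Word2Vec, LineSentence

# one sentence per line, tokens separated by whitespace
model = Word2Vec(LineSentence('corpus.txt'), vector_size=100, window=5, min_count=5, workers=4)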
github lxw0109 / NLP-Experiments / word2vec / src / 2_gensim_word2vec_train.py
def train_w2v():
    """
    Train the word2vec model and save it.
    :return: model
    """
    sentences = word2vec.Text8Corpus("../data/corpus_seg.txt")    # load the pre-segmented corpus
    # NOTE: the default sg=0 trains a CBOW model; pass sg=1 to train skip-gram as the original comment intended
    model = word2vec.Word2Vec(sentences, size=200, min_count=1, window=10)
    # NOTE: on the meaning and choice of word2vec parameters, see https://github.com/lxw0109/NLPExperiments/blob/master/word2vec/doc/Learning%20Notes%20on%20word2vec.ipynb
    return model

    """
    # OK
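The docstring promises that the model is saved, but the excerpt only returns it. A sketch of the save/load round-trip with gensim's native format (the path is a placeholder):

model = train_w2v()
model.save("../data/w2v.model")                       # native format; the model can be trained further after loading
model = word2vec.Word2Vec.load("../data/w2v.model")   # round-trip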
github nlpub / hyperstar / prepare.py
import argparse
import codecs
import csv
import random

from gensim.models import Word2Vec

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

parser = argparse.ArgumentParser(description='Preparation.')
parser.add_argument('--w2v', default='all.norm-sz100-w10-cb0-it1-min100.w2v', nargs='?',
                    help='Path to the word2vec model.')
parser.add_argument('--seed', default=228, type=int, nargs='?', help='Random seed.')
args = vars(parser.parse_args())

RANDOM_SEED = args['seed']
random.seed(RANDOM_SEED)

w2v = Word2Vec.load_word2vec_format(args['w2v'], binary=True, unicode_errors='ignore')
w2v.init_sims(replace=True)
print('Using %d word2vec dimensions from "%s".' % (w2v.layer1_size, args['w2v']))


def read_subsumptions(filename):
    subsumptions = []

    with codecs.open(filename, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)

        for row in reader:
            subsumptions.append((row[0], row[1]))

    return subsumptions
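Word2Vec.load_word2vec_format moved to KeyedVectors in gensim 1.0; on current gensim the same model loads as a KeyedVectors object, and the dimensionality is exposed as vector_size rather than layer1_size:

from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format(args['w2v'], binary=True, unicode_errors='ignore')
print('Using %d word2vec dimensions from "%s".' % (w2v.vector_size, args['w2v']))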
github happyflyingfish / cs-skill-tree / natural language process / language model / source / word2vec_gensim.py
from gensim.models import word2vec
import logging

# surface gensim's training progress on stderr
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

sentences = word2vec.Text8Corpus('/tmp/text8')    # the text8 Wikipedia sample corpus
model = word2vec.Word2Vec(sentences, size=200)    # pre-4.0 API: `size` (gensim 4: `vector_size`)
model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
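Trained on text8, this analogy query usually ranks 'queen' first, though the exact similarity score varies between runs. On gensim 4.x the call moves to the KeyedVectors object:

model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)   # e.g. [('queen', 0.6...)]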
github Cold-Winter / vqs / processData / processJson.py
import json

from nltk.tokenize import TweetTokenizer
from gensim.models import word2vec

tokenizer = TweetTokenizer()

# pre-1.0 gensim API; newer versions load this through KeyedVectors.load_word2vec_format
model = word2vec.Word2Vec.load_word2vec_format('model.bin', binary=True)

with open('mscoco_train2014_annotations.json', 'r') as f:
    dataAnno = json.load(f)
with open('MultipleChoice_mscoco_train2014_questions.json', 'r') as f:
    dataQuestion = json.load(f)

feaFile = open('trainRegionFea.txt', 'w')
choicelist = {}   # question_id -> ground-truth multiple-choice answer
answerlist = {}
# print(dataQuestion['num_choices'])
for annotation in dataAnno['annotations']:
    choicelist[annotation['question_id']] = annotation['multiple_choice_answer']
    # print(choicelist[annotation['question_id']])

# counters for words/answers/questions that cannot be processed (e.g. missing from the model)
errorword = 0
erroranswer = 0
errorquestion = 0
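The excerpt ends before the counters are used. A plausible (hypothetical) continuation that matches their names would count question tokens missing from the embedding vocabulary while collecting per-question vectors:

for question in dataQuestion['questions']:
    vecs = []
    for token in tokenizer.tokenize(question['question'].lower()):
        if token in model:           # membership test on the old-gensim model; gensim 4: `token in model.wv`
            vecs.append(model[token])
        else:
            errorword += 1           # out-of-vocabulary token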
github niitsuma / word2vec-keras-in-gensim / word2veckeras / scoreword2veckeras.py
    # dummy_score_vec is a scoring helper defined earlier in the original file;
    # it maps each word to a small score vector
    sws = list(LineScoredWordSentence(input_file, dummy_score_vec))
    # print(sws[0])
    from word2veckeras import Word2VecKeras
    
    parameters = [{'size': [5], 'hs': [0, 1], 'negative': [0, 5], 'sg': [0, 1]}]
    from sklearn.model_selection import ParameterGrid   # sklearn.grid_search in scikit-learn < 0.20
    for param in ParameterGrid(parameters):
        if param['hs'] == 0 and param['negative'] == 0:
            continue   # word2vec needs hierarchical softmax or negative sampling enabled
        print(param)
        svk=ScoreWord2VecKeras(sws,**param)
        vsk = Word2VecKeras(gensim.models.word2vec.LineSentence(input_file),**param)
        vs = gensim.models.word2vec.Word2Vec(gensim.models.word2vec.LineSentence(input_file),**param)
        print( svk.most_similar('the', topn=5))
        print( vsk.most_similar('the', topn=5))
        print( vs.most_similar('the', topn=5))
        print(svk['the'])
        print(vsk['the'])
        print(vs['the'])

    # #svk.save_word2vec_format('tmp.vec')
    # #svk.save('tmp.model')

    #print svk.score_vector_size

    scored_word_list=[
        ['This',[20*0.1,10*0.2]],
        ['is',[10*0.1,5*0.2]],
        ['a',[30*0.1,10*0.2]],
github uhh-lt / sensegram / experiment / sensegram.py
from operator import itemgetter
import os.path
import codecs
import math
import numpy as np
from gensim.models import word2vec

default_count = 100 # arbitrary, should be larger than min_count of vec object, which is 5 by default

class SenseGram(word2vec.Word2Vec):
    def __init__(self, *args, **kwargs):
        super(SenseGram, self).__init__(*args, **kwargs)
        self.probs = {} # mapping from a sense (String) to its probability
    
    def get_senses(self, word, ignore_case=False):
        """ returns a list of all available senses for a given word.
        example: 'mouse' -> ['mouse#0', 'mouse#1', 'mouse#2']
        Assumption: senses use continuous numbering"""
        words = [word]
        senses = []
        if ignore_case:
            words.append(word[0].upper() + word[1:])
            words.append(word[0].lower() + word[1:])
        
        words = set(words)
        for word in words:
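The method is cut off above. Given the documented 'mouse' -> ['mouse#0', ...] convention and the continuous-numbering assumption, a plausible (hypothetical) completion of the loop is:

        for word in words:
            i = 0
            while '%s#%d' % (word, i) in self.vocab:   # self.vocab in pre-1.0 gensim; later versions: self.wv.vocab
                senses.append('%s#%d' % (word, i))
                i += 1
        return senses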
github Times125 / QQ_Chat_Records_Analyse / src / classify.py
    @classmethod
    def get_predict_vecs(cls, words):
        n_dim = 300
        # load the pre-trained model saved earlier in gensim's native format
        imdb_w2v = Word2Vec.load('svm_data/w2v_model/w2v_model.pkl')
        # imdb_w2v.train(words)
        # build an n_dim feature vector from the words with the helper defined on the class
        train_vecs = cls.build_wordvector(words, n_dim, imdb_w2v)
        # print(train_vecs.shape)
        return train_vecs
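Word2Vec.load reads the whole model from disk, so calling get_predict_vecs repeatedly reloads it every time. A caching sketch (the _w2v_cache attribute and get_w2v helper are hypothetical additions, not part of the original class):

    _w2v_cache = None

    @classmethod
    def get_w2v(cls):
        # load the model at most once per process
        if cls._w2v_cache is None:
            cls._w2v_cache = Word2Vec.load('svm_data/w2v_model/w2v_model.pkl')
        return cls._w2v_cache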
github akb89 / word2vec / gensim / models / word2vec.py
    def save(self, *args, **kwargs):
        # don't bother storing the cached normalized vectors, recalculable table
        # TODO: after introducing KeyedVectors now syn0, vocab, id2word are saved TWO times. Once in word2vec and once in keyedvectors
        #       After keyedvectors are deprecated it will be only once
        Word2Vec.disable_keyed_vectors_warnings()
        kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'table', 'cum_table'])

        super(Word2Vec, self).save(*args, **kwargs)
        Word2Vec.enable_keyed_vectors_warnings()
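The ignore list passed through kwargs tells gensim's SaveLoad machinery to skip those attributes when pickling: the normalised-vector cache ('syn0norm') is rebuilt by init_sims() after loading, and the negative-sampling tables ('table', 'cum_table') can likewise be regenerated, so omitting them keeps the saved file small.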