How to use the gensim.models.Doc2Vec class in gensim

To help you get started, we’ve selected a few gensim examples, based on popular ways Doc2Vec is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
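
Before diving into the project excerpts below, here is a minimal, self-contained sketch of the current (gensim 4.x) Doc2Vec workflow. The toy corpus from gensim.test.utils and all parameter values are illustrative only, not taken from any of the projects:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts

# Each training document is wrapped in a TaggedDocument with a unique tag.
corpus = [TaggedDocument(words, tags=[i]) for i, words in enumerate(common_texts)]

# In gensim 4.x the dimensionality is `vector_size` and the number of training
# passes is `epochs` (older snippets use `size` and `iter` instead).
model = Doc2Vec(corpus, vector_size=50, window=2, min_count=1, epochs=40, workers=4)

# Infer a vector for an unseen document and query the most similar training docs.
vector = model.infer_vector(["human", "interface", "computer"])
print(model.dv.most_similar([vector], topn=3))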

github lgalke / vec4ir / tests / test_vec4ir.py
def test_doc2vec_inference_saveload():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1, vector_size=10)
    model.save(TEST_FILE)
    del model
    model = Doc2Vec.load(TEST_FILE)
    os.remove(TEST_FILE)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
github Lab41 / sunny-side-up / src / Baseline / Word2Vec / Sentiment140_W2V_Pipeline.py
            ''' Doc2Vec model takes in only this LabeledSentence data structure
            ex: LabeledSentence(['list', 'of', 'tokenized', 'words'], ['pos_0'])'''
            ls = LabeledSentence(preprocess_tweet(sentence).split(), [label + '_%d' % pos_count])
            pos_count += 1
        else:
            ls = LabeledSentence(preprocess_tweet(sentence).split(), [label + '_%d' % neg_count])
            neg_count += 1
        labeled_sent.append(ls)

    logging.info("Training on %d Positive and %d Negative tweets" % (pos_count, neg_count))
    logging.info("Building model...")

    '''Setting min_count > 1 can cause some tweets to "disappear" later
    from the Doc2Vec sentence corpus.
    ex: you could imagine a tweet containing only words whose count was low'''
    model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5,
                    workers=7)

    logging.info("Building Vocabulary...")
    model.build_vocab(labeled_sent)

    logging.info("Training model...")
    for epoch in xrange(epoch_num):
        logging.info("Epoch %s..." % epoch)
        # Temporarily sets logging level to show only if it's at least WARNING
        # This prevents model.train from overloading the log
        logging.getLogger().setLevel(logging.WARN)
        # np.random.permutation returns a shuffled copy of the data (it does not shuffle in place)
        # Shuffling the training order between epochs tends to improve the model
        model.train(np.random.permutation(labeled_sent))
        logging.getLogger().setLevel(logging.INFO)
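
This Sentiment140 pipeline targets the pre-1.0 gensim API: LabeledSentence, the size parameter, and repeated bare model.train() calls inside a manual xrange epoch loop. Under gensim 4.x the same training would look roughly like the sketch below, assuming labeled_sent holds TaggedDocument objects (the modern replacement for LabeledSentence) and reusing the script's epoch_num:

import random
from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec(min_count=1, window=10, vector_size=100, sample=1e-4,
                negative=5, workers=7)
model.build_vocab(labeled_sent)

# A single train() call streams the corpus `epochs` times; gensim does not
# shuffle the data itself, so shuffle once up front if ordering is a concern.
random.shuffle(labeled_sent)
model.train(labeled_sent, total_examples=model.corpus_count, epochs=epoch_num)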
github eellak / gsoc2018-3gm / 3gm / train_doc2vec.py
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

taggeddocs = []

with open(train_corpus, 'r') as f:
	docs = f.read().splitlines()

if tokenize:
	docs = nlp_clean(docs)

for label, doc in zip(labels, docs):
	td = TaggedDocument(words=tokenizer.tokenizer.split(doc.lower(), delimiter=' '), tags=[label])
	#print(td)
	taggeddocs.append(td)

model = g.Doc2Vec(taggeddocs, size=vector_size, window=window_size, min_count=min_count, sample=sampling_threshold, workers=worker_count, hs=0, dm=dm, negative=negative_size, dbow_words=1, dm_concat=1, pretrained_emb=None, iter=train_epoch)

#save model
model.save(saved_path)
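
The pretrained_emb argument in the call above is not part of stock gensim's Doc2Vec; it appears to come from a patched fork, and size/iter are the pre-4.0 parameter names. With an unmodified gensim 4.x install, the equivalent call (reusing the script's configuration variables and dropping the fork-only argument) would be roughly:

model = g.Doc2Vec(taggeddocs, vector_size=vector_size, window=window_size,
                  min_count=min_count, sample=sampling_threshold,
                  workers=worker_count, hs=0, dm=dm, negative=negative_size,
                  dbow_words=1, dm_concat=1, epochs=train_epoch)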
github shibing624 / dialogbot / dialogbot / searchdialog / vectormodel.py
def load_doc2vec_model(texts, model_path):
        if os.path.exists(model_path):
            # Load the cached model instead of retraining it.
            model = Doc2Vec.load(model_path)
        else:
            documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(texts)]
            # Passing `documents` to the constructor builds the vocabulary and trains
            # for `epochs` passes, so no separate train() call is needed.
            model = Doc2Vec(documents, vector_size=100, window=3, min_count=1,
                            workers=4, alpha=0.025, epochs=40)
            # model = Word2Vec(sg=1, sentences=texts, size=256, window=5, min_count=1, iter=40)
            # model.save(model_path)
        return model
github fnielsen / dasem / dasem / models.py
def train(self, size=100, window=8, min_count=5, workers=4):
        """Train Gensim Doc2Vec model.

        Parameters
        ----------
        size : int, optional
            Dimension of the doc2vec vector space.

        """
        tagged_documents = self.iterable_tagged_documents()
        self.model = gensim.models.Doc2Vec(
            tagged_documents, size=size, window=window, min_count=min_count,
            workers=workers)
github adityathakker / AcronymExpansion / checksim_2.py
import json
import gensim.models
from difflib import SequenceMatcher

filename = 'data/acronyms_best.json'
f = open(filename, 'r')
data = json.load(f)
model = gensim.models.Doc2Vec.load('models_context/IC.model')
# print(model.docvecs.similarity("Entity-Relationship Model", "entity-relationship models"))
print(model.docvecs.most_similar("Integrated Circuit", topn=20))
#
total =0
correct=0
error=0
def similar(a, b):
    a=a.lower()
    b=b.lower()
    return SequenceMatcher(None, a, b).ratio()
n=0
wrong =0
for k,v in data.items():
    try:
        model = gensim.models.Doc2Vec.load('models_context/'+k+'.model')
        if similar(v["full_form"],model.docvecs.most_similar(v["full_form"])[0][0])>0.80:
github laugustyniak / textlytics / textlytics / sentiment / sentiment.py
model : gensim.Doc2Vec
            Trained model.

        """
        times_epoch = []
        start = datetime.now()

        docs_all = list(docs) + list(docs_unsuperv)

        docs_all = self.labelize_tokenize_docs(docs_all, self.w2v_label)
        docs = self.labelize_tokenize_docs(docs, self.w2v_label)

        if model is None:
            cores = multiprocessing.cpu_count()
            model = gensim.models.Doc2Vec(min_count=3, window=10, size=100,
                                          sample=1e-3, negative=5,
                                          workers=cores)
        model.build_vocab(docs_all)
        docs_perm = docs_all
        for epoch in range(10):
            log.info('Doc-2-Vec epoch: {}'.format(epoch))
            start_epoch = datetime.now()
            random.shuffle(docs_perm)
            model.train(docs_perm)
            times_epoch.append((start_epoch, datetime.now()))
        self.results['d2v-training-times'] = {'start': start,
                                              'stop': datetime.now(),
                                              'epochs': times_epoch}
        r = self.get_doc_2_vec_vectors(model, docs)
        return r, model
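
As with the Sentiment140 example above, gensim 1.0 and later require an explicit corpus size and epoch count, so the bare model.train(docs_perm) call would fail on current versions. Inside the manual epoch loop, the rough equivalent on current gensim is:

model.train(docs_perm, total_examples=model.corpus_count, epochs=1)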
github SenticNet / CASCADE / users / generate_stylometric.py
#!/usr/bin/env python
import pandas as pd
import numpy as np
import csv
import gensim, os

doc2vec = gensim.models.Doc2Vec.load('./models/user_stylometric.model')
data = np.asarray(pd.read_csv('./train_balanced_user.csv', header=None))
DIM = 300

directory = "./user_embeddings"
if not os.path.exists(directory):
	os.makedirs(directory)
file = open(directory+"/user_stylometric.csv",'w')
wr = csv.writer(file, quoting=csv.QUOTE_ALL)

# Inferring paragraphVec vectors for each user
vectors = np.asarray([doc2vec.infer_vector(data[i][1]) for i in range(data.shape[0])])

users = data[:,0]	
for i in range(len(users)):
	ls=[]
	ls.append(users[i])
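
One caveat with the inference loop above: infer_vector expects a list of token strings, and recent gensim versions raise an error when handed a raw string. If the second CSV column contains untokenized text, a rough fix (reusing the data array and doc2vec model from the excerpt) would be:

from gensim.utils import simple_preprocess

# Tokenize each user's text before inferring its paragraph vector.
vectors = np.asarray([doc2vec.infer_vector(simple_preprocess(str(data[i][1])))
                      for i in range(data.shape[0])])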
github jayantj / w2vec-similarity / train / train.py
def train_and_save_doc2vec(docs, output_file, options = {}):
  print "Training model..."
  model = Doc2Vec(docs, **options)
  model.save(output_file)
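
A call to this helper might look like the following; raw_texts and the output filename are illustrative, and the options dict is forwarded straight into the Doc2Vec constructor:

from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess

docs = [TaggedDocument(simple_preprocess(text), [i]) for i, text in enumerate(raw_texts)]
train_and_save_doc2vec(docs, 'doc2vec.model',
                       options={'vector_size': 100, 'min_count': 2, 'epochs': 20})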