How to use gensim - 10 common examples

To help you get started, we’ve selected a few gensim examples based on popular ways it is used in public projects.

github lgalke / vec4ir / tests / test_vec4ir.py
def test_doc2vec_inference_saveload():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1, vector_size=10)
    model.save(TEST_FILE)
    del model
    model = Doc2Vec.load(TEST_FILE)
    os.remove(TEST_FILE)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
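
This test round-trips a Doc2Vec model through save() and load() and then plugs it into vec4ir's retrieval pipeline. The gensim part on its own can be reproduced with a minimal sketch like the one below (the toy corpus and file name are illustrative, not taken from the test):

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

documents = ["scientists study data", "dogs chase cats"]  # illustrative toy corpus
tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
               for i, doc in enumerate(documents)]

model = Doc2Vec(tagged_docs, epochs=1, min_count=1, vector_size=10)
model.save("doc2vec_test.model")            # persist the trained model
model = Doc2Vec.load("doc2vec_test.model")  # reload it from disk
vector = model.infer_vector(simple_preprocess("scientists"))  # embed an unseen query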
github davidsbatista / BREDS / BREDS / test-sentence-generation.py
def similarity_3_contexts(p, t):
        (bef, bet, aft) = (0, 0, 0)

        if t.bef_vector is not None and p.bef_vector is not None:
            bef = dot(matutils.unitvec(t.bef_vector), matutils.unitvec(p.bef_vector))

        if t.bet_vector is not None and p.bet_vector is not None:
            bet = dot(matutils.unitvec(t.bet_vector), matutils.unitvec(p.bet_vector))

        if t.aft_vector is not None and p.aft_vector is not None:
            aft = dot(matutils.unitvec(t.aft_vector), matutils.unitvec(p.aft_vector))

        return 0*bef + 1*bet + 0*aft
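
Here gensim is used only for matutils.unitvec: normalising each context vector to unit length so that a plain dot product gives the cosine similarity, with only the between-entities context weighted in the final score. A standalone illustration on toy NumPy vectors (the vectors are made up):

import numpy as np
from numpy import dot
from gensim import matutils

v1 = np.array([1.0, 2.0, 3.0])
v2 = np.array([2.0, 4.0, 6.0])

# unitvec rescales to unit L2 norm, so dot() of two unit vectors is their cosine similarity
cosine = dot(matutils.unitvec(v1), matutils.unitvec(v2))
print(cosine)  # ~1.0, since v2 is a scaled copy of v1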
github shilad / macademia / Macademia / scripts / algs / test_svd.py
def make_index():
    logging.info('loading dictionary')
    dictionary = gensim.corpora.Dictionary.load_from_text('svd/dictionary.txt')
    logging.info('loading corpus')
    corpus = gensim.corpora.MmCorpus('svd/corpus.mm')
    tfidf = gensim.models.TfidfModel(corpus)
    logging.info('loading model')
    model = gensim.models.ldamodel.LdaModel.load('svd/lda.txt')
    logging.info('building lda docs')
    lda_corpus = model[tfidf[corpus]]
    logging.info('building index')
    index = gensim.similarities.docsim.Similarity('/tmp/lda_index.txt', lda_corpus, 1000)
    index.save('svd/lda_index.txt')
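
The pipeline here is: load a Dictionary and an MmCorpus from disk, wrap the corpus in a TfidfModel, push the TF-IDF vectors through a previously trained LdaModel, and build a disk-backed Similarity index over the LDA vectors (the third argument to Similarity is the vector dimensionality). A self-contained sketch with an in-memory toy corpus (paths, topic count, and texts are illustrative):

import gensim

texts = [["human", "machine", "interface"], ["graph", "trees", "minors"]]  # toy corpus
dictionary = gensim.corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

tfidf = gensim.models.TfidfModel(corpus)
lda = gensim.models.ldamodel.LdaModel(tfidf[corpus], id2word=dictionary, num_topics=2)

# Similarity shards its index to disk under the given prefix;
# num_features must match the dimensionality of the indexed vectors (here: num_topics)
index = gensim.similarities.docsim.Similarity('/tmp/toy_lda_index', lda[tfidf[corpus]], num_features=2)
index.save('/tmp/toy_lda_index.saved')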
github fredriko / metacurate-lexicon / src / scripts / train_wordspace_models.py
def train_word2vec_model(input: str, output_directory: str, model_name: str) -> None:

    if not os.access(output_directory, os.W_OK):
        print("Cannot write to directory {}. Exiting!".format(output_directory))
        exit(1)

    if os.path.isdir(input):
        sentences = gensim.models.word2vec.PathLineSentences(input)
    else:
        sentences = gensim.models.word2vec.LineSentence(input)

    model = gensim.models.Word2Vec(sentences, sg=0, size=100, window=10, min_count=20, workers=10)
    model.train(sentences, total_examples=model.corpus_count, epochs=10)
    model.save(output_directory + model_name)
    # We want the vectors only to reduce memory footprint: this is the file(s) that the online lexicon should use.
    vectors = model.wv
    vectors.save(output_directory + model_name + ".vectors-only")
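
Two things are worth noting: PathLineSentences streams every file in a directory while LineSentence reads a single file, and this project targets the pre-4.0 gensim API, where the Word2Vec constructor still took size= (renamed to vector_size= in gensim 4.0). Since the constructor already trains the model, the extra model.train(...) call simply runs more epochs. Keeping only model.wv is the standard way to drop the training state and shrink the footprint. A minimal sketch against the current API (the corpus file name is illustrative):

import gensim

sentences = gensim.models.word2vec.LineSentence("corpus.txt")  # one sentence per line
model = gensim.models.Word2Vec(sentences, sg=0, vector_size=100, window=10,
                               min_count=20, workers=4)  # gensim >= 4.0: size -> vector_size
model.save("w2v.model")

vectors = model.wv                       # KeyedVectors only: query-friendly, much smaller
vectors.save("w2v.model.vectors-only")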
github ziqizhang / chase / python / src / ml / classifier_dnn.py
else:
        params["scoreperclass"] = True
    if "word_norm" not in params.keys():
        params["word_norm"] = 1
    if "oov_random" not in params.keys():
        params["oov_random"] = 0
    if "emb_model" in params.keys():
        emb_models = []
        print("===> use pre-trained embeddings...")
        model_str = params["emb_model"].split(',')
        for m_s in model_str:
            gensimFormat = ".gensim" in m_s
            if gensimFormat:
                emb_models.append(gensim.models.KeyedVectors.load(m_s, mmap='r'))
            else:
                emb_models.append(gensim.models.KeyedVectors. \
                                  load_word2vec_format(m_s, binary=True))
        print("<===loaded {} models".format(len(emb_models)))
    if "emb_dim" in params.keys():
        emb_dim = int(params["emb_dim"])
    if "gpu" in params.keys():
        if params["gpu"] == "1":
            print("using gpu...")
        else:
            print("using cpu...")
    if "wdist" in params.keys():
        wdist_file = params["wdist"]
    else:
        wdist_file = None


    use_mixed_data=False
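
The gensim pattern in this excerpt is picking the loader by file format: KeyedVectors.load for vectors saved with gensim's native save() (optionally memory-mapped read-only with mmap='r'), and KeyedVectors.load_word2vec_format for the binary word2vec C format. A minimal sketch (the path and the '.gensim' naming convention are illustrative):

import gensim

path = "embeddings.gensim"  # illustrative path

if ".gensim" in path:
    # native gensim format; mmap='r' maps the arrays read-only to save memory
    kv = gensim.models.KeyedVectors.load(path, mmap='r')
else:
    # original word2vec binary format (e.g. the GoogleNews vectors)
    kv = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)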
github svakulenk0 / semantic_coherence / load_embeddings.py
def load_embeddings_gensim(embeddings_config, label, vocabulary, save_to):
    # create a weight matrix for entities in training docs
    embedding_matrix = np.zeros((len(vocabulary), embeddings_config['dims']))
        
    # load embeddings binary model with gensim for word2vec and rdf2vec embeddings
    model = gensim.models.Word2Vec.load(embeddings_config['path'])
    #model = gensim.models.KeyedVectors.load_word2vec_format(embeddings_config['path'], binary=True)
    embedded_entities = model.wv
    missing = 0
    for entity, entity_id in vocabulary.items():
        # strip entity label format to rdf2vec label format
        #rdf2vec_entity_label = 'dbr:%s' % entity.split('/')[-1]
        #print rdf2vec_entity_label
        rdf2vec_entity_label = '<' + entity + '>'
        if rdf2vec_entity_label in embedded_entities:
            embedding_matrix[entity_id] = embedded_entities[rdf2vec_entity_label]
        else:
            missing += 1
    print "done loading gensim entities. %d missing" % missing
    # save embedding_matrix for entities in the training dataset
    np.save(save_to, embedding_matrix)
    # print embedding_matrix
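
This function builds a NumPy weight matrix for a fixed vocabulary by looking each entity up in a trained Word2Vec/rdf2vec model; entities missing from the model keep an all-zero row and are counted. A hedged sketch of the same idea (the vocabulary, dimensionality, and model path are illustrative):

import numpy as np
import gensim

dims = 100                                                 # illustrative embedding size
vocabulary = {"<http://dbpedia.org/resource/Berlin>": 0}   # illustrative entity -> row id map

kv = gensim.models.Word2Vec.load("rdf2vec.model").wv       # illustrative model path
embedding_matrix = np.zeros((len(vocabulary), dims))

missing = 0
for entity, entity_id in vocabulary.items():
    if entity in kv:                     # KeyedVectors supports membership tests
        embedding_matrix[entity_id] = kv[entity]
    else:
        missing += 1

np.save("embedding_matrix.npy", embedding_matrix)
print("done building the entity matrix. %d missing" % missing)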
github ChenglongChen / Kaggle_HomeDepot / Code / Igor&Kostia / word2vec.py
print "third vocab"   

#st conc pt conc pd conc br conc mr vocab w/o pars
t3 = list()
for i in range(len(st)):
    p = st1[i].split()+pt1[i].split()+pd1[i].split()+br1[i].split()+mr1[i].split()+ab1[i].split()+at1[i].split()
    t3.append(p)

print "fourth vocab" 

#trin models
model0 = gensim.models.Word2Vec(t, sg=1, window=10, sample=1e-5, negative=5, size=300)
model1 = gensim.models.Word2Vec(t1, sg=1, window=10, sample=1e-5, negative=5, size=300)
model2 = gensim.models.Word2Vec(t2, sg=1, window=10, sample=1e-5, negative=5, size=300)
model3 = gensim.models.Word2Vec(t3, sg=1, window=10, sample=1e-5, negative=5, size=300)
#model4 = gensim.models.Word2Vec(t, sg=0,  hs=1, window=10,   size=300)
#model5 = gensim.models.Word2Vec(t1, sg=0, hs=1,window=10,   size=300)
#model6 = gensim.models.Word2Vec(t2, sg=0, hs=1, window=10,   size=300)
#model7 = gensim.models.Word2Vec(t3, sg=0, hs=1,window=10,   size=300)

print "model prepared"


#for each model calculate features^ n_similarity between st and something else
model_list=[model0,model1,model2,model3]   #,model4  ,model5,model6,model7]
n_sim=list()

for model in model_list:

    n_sim_pt=list()
    for i in range(len(st)):
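
The excerpt breaks off inside the per-model feature loop; per the comment above it, the gensim call being built up is n_similarity, which returns the cosine similarity between the mean vectors of two lists of words. Also note the pre-4.0 size= argument in the Word2Vec constructors. A hedged sketch of the call itself on a toy corpus (the token lists are made up):

import gensim

sentences = [["red", "wooden", "chair"], ["blue", "metal", "table"]]  # toy corpus
model = gensim.models.Word2Vec(sentences, sg=1, window=10, sample=1e-5,
                               negative=5, vector_size=300, min_count=1)

# cosine similarity between the averaged vectors of two word lists
sim = model.wv.n_similarity(["red", "chair"], ["blue", "table"])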
github diegma / relation-autoencoder / learning / models / decoders / BilinearPlusSP.py
        CNP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, k, r)),
                         dtype=theano.config.floatX)  # @UndefinedVariable
        self.C = theano.shared(value=CNP, name='C')
        # self.C = theano.printing.Print("C = ")(self.C)

        # Selectional Preferences
        Ca1NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX)
        Ca2NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX)
        self.C1 = theano.shared(value=Ca1NP, name='C1')
        self.C2 = theano.shared(value=Ca2NP, name='C2')
        # argument embeddings
        ANP = np.asarray(rng.uniform(-0.01, 0.01, size=(a, k)), dtype=theano.config.floatX)  # @UndefinedVariable

        if ex_emb:
            import gensim
            external_embeddings = gensim.models.Word2Vec.load(settings.external_embeddings_path)

            for idArg in range(self.a):
                arg = data.id2Arg[idArg].lower().split(' ')
                new = np.zeros(k, dtype=theano.config.floatX)
                size = 0
                for ar in arg:
                    if ar in external_embeddings:
                        new += external_embeddings[ar]
                        size += 1
                if size > 0:
                    ANP[idArg] = new/size

        self.A = theano.shared(value=ANP, name='A')  # (a1, k)

        self.Ab = theano.shared(value=np.zeros(a,  dtype=theano.config.floatX),  # @UndefinedVariable
                                 name='Ab', borrow=True)
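
Within this Theano decoder, gensim only warm-starts the argument embedding matrix: each argument's row becomes the mean of the word2vec vectors of its tokens, when any of them are in the vocabulary. A hedged sketch of just that initialisation step, using the current API where the vectors live on model.wv (the model path and argument list are illustrative):

import numpy as np
import gensim

kv = gensim.models.Word2Vec.load("external_embeddings.model").wv  # illustrative path
arguments = ["new york", "barack obama"]                           # illustrative arguments
k = kv.vector_size

ANP = np.random.uniform(-0.01, 0.01, size=(len(arguments), k))
for idx, argument in enumerate(arguments):
    tokens = [t for t in argument.lower().split(' ') if t in kv]
    if tokens:
        # average the vectors of the tokens that have an embedding
        ANP[idx] = np.mean([kv[t] for t in tokens], axis=0)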
github akutuzov / webvectors / scripts / train_model.py
argument = sys.argv[1]
filename = argument.split('/')[-1]

args = filename.split('.')[0].split('__')
(urlhash,algo,vectorsize,windowsize) = args

if algo == "skipgram":
    skipgram = 1
else:
    skipgram = 0

data = gensim.models.word2vec.LineSentence(argument)


model = gensim.models.Word2Vec(data, size=int(vectorsize), min_count=2, window=int(windowsize), sg=skipgram, workers=2, iter=5, cbow_mean=1)
model.init_sims(replace=True)
model.save_word2vec_format(root+'/trained/'+filename.split('.')[0].split('__')[0]+'.model', binary=True)
os.remove(root+'/tmp/'+filename.split('.')[0].split('__')[0])
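
This script also predates gensim 4.0: size= and iter= are now vector_size= and epochs=, init_sims() is deprecated (normalised vectors are computed on demand), and save_word2vec_format now lives on model.wv rather than on the model itself. A hedged sketch of the same train-and-export flow with the current API (file names and hyperparameters are illustrative):

import gensim

data = gensim.models.word2vec.LineSentence("corpus.txt")  # illustrative input file
model = gensim.models.Word2Vec(data, vector_size=300, min_count=2, window=5,
                               sg=1, workers=2, epochs=5, cbow_mean=1)

# export only the vectors, in the binary word2vec format
model.wv.save_word2vec_format("trained.model.bin", binary=True)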
github totalgood / nlpia / src / nlpia / book / examples / ch06_nessvectors.py
import os

from collections import OrderedDict

import pandas as pd
from nlpia.data.loaders import get_data, BIGDATA_PATH
from gensim.models import KeyedVectors


word_vectors = get_data('word2vec')  # not in book

wordvector_path = os.path.join(BIGDATA_PATH, 'GoogleNews-vectors-negative300.bin.gz')    # not in book, reader required to compose this path

if 'word_vectors' not in globals():  # not in book
    WV = word_vectors = get_data('word2vec')
    word_vectors = KeyedVectors.load_word2vec_format(wordvector_path, binary=True)


###################################################
# Still need to create a class derived from gensim's Word2vec model instead of relying on word_vectors globals

COMPONENT_WORDS = OrderedDict([
    ('placeness', ('geography Geography geographic geographical geographical_location location ' +
                   'locale locations proximity').split()),
    ('peopleness', 'human Humans homo_sapiens peole people individuals humankind people men women'.split()),
    ('animalness', 'animal mammal carnivore animals Animal animal_welfare dog pet cats ani_mal'.split()),
    ('conceptness', 'concept concepts idea'.split()),
    ('femaleness', 'female Female females femal woman girl lady'.split()),
])


def component_vector(words):
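
The excerpt is cut off at component_vector. Without guessing at the book's actual implementation, the gensim building block it relies on is simple: look up each component word in the loaded KeyedVectors and average the hits. A purely illustrative sketch, assuming word_vectors is the KeyedVectors loaded above:

import numpy as np

# Hypothetical helper (not the book's implementation): average the vectors of the
# component words that exist in the loaded KeyedVectors.
def mean_component_vector(words, kv):
    hits = [kv[w] for w in words if w in kv]
    return np.mean(hits, axis=0) if hits else np.zeros(kv.vector_size)

# e.g. mean_component_vector(COMPONENT_WORDS['animalness'], word_vectors)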