How to use the gensim.corpora.MmCorpus class in gensim

To help you get started, we’ve selected a few gensim examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github shilad / macademia / Macademia / scripts / algs / test_svd.py View on Github external
def make_index():
    """Build a similarity index over the LDA projection of the corpus.

    Loads the dictionary, the Matrix Market corpus and the saved LDA model
    from the ``svd/`` directory, weights the corpus with a freshly fitted
    TF-IDF model, projects it through the LDA model and saves the resulting
    similarity index to ``svd/lda_index.txt``.
    """
    logging.info('loading dictionary')
    # Loaded but not otherwise used in this function.
    _vocabulary = gensim.corpora.Dictionary.load_from_text('svd/dictionary.txt')

    logging.info('loading corpus')
    bow_docs = gensim.corpora.MmCorpus('svd/corpus.mm')
    weighting = gensim.models.TfidfModel(bow_docs)

    logging.info('loading model')
    lda = gensim.models.ldamodel.LdaModel.load('svd/lda.txt')

    logging.info('building lda docs')
    weighted_docs = weighting[bow_docs]
    topic_docs = lda[weighted_docs]

    logging.info('building index')
    similarity = gensim.similarities.docsim.Similarity(
        '/tmp/lda_index.txt',
        topic_docs,
        1000,
    )
    similarity.save('svd/lda_index.txt')
github kethort / TwitterLDATopicModeling / src / create_LDA_model.py View on Github external
def build_pyLDAvis_output(corp_loc, dict_loc, lda_loc):
    """Write a pyLDAvis HTML report for a saved LDA model.

    Args:
        corp_loc: Path of the Matrix Market corpus file.
        dict_loc: Path of the saved dictionary.
        lda_loc: Path of the saved LDA model; ``.model`` is appended when
            the path does not already contain it.

    The report is saved next to the model, using the part of ``lda_loc``
    before ``.model`` plus an ``.html`` extension.
    """
    model_suffix = '.model'
    if model_suffix not in lda_loc:
        lda_loc = lda_loc + model_suffix

    bow_corpus = MmCorpus(corp_loc)
    vocabulary = Dictionary.load(dict_loc)
    topic_model = models.LdaModel.load(lda_loc)

    prepared = gensim_vis.prepare(topic_model, bow_corpus, vocabulary, sort_topics=False)
    html_path = lda_loc.split(model_suffix)[0] + '.html'
    pyLDAvis.save_html(prepared, html_path)
github yolanda93 / information_retrieval_system / ir_system.py View on Github external
def create_documents_view(self,corpus, ir_mode):
        """Select the document model requested by ``ir_mode``.

        Builds a dictionary and bag-of-words vectors from ``corpus`` via
        ``self.create_dictionary`` and ``self.docs2bows``, reloads the corpus
        stored in ``vsm_docs.mm`` and then picks a model:

        1 - log-scaled term frequency computed directly from ``bow``
        2 - TF-IDF, 3 - LDA, 4 - LDA multicore, 5 - LSI,
        6 - random projections, 7 - log-entropy
        (modes 2-7 are fitted on the reloaded corpus).

        NOTE(review): the visible code ends after the mode dispatch, so what
        is done with ``model`` (and what happens for any other ``ir_mode``
        value) cannot be confirmed from here.
        """
        dictionary,pdocs = self.create_dictionary(corpus)
        bow = self.docs2bows(corpus, dictionary,pdocs)     
        # presumably 'vsm_docs.mm' was written by docs2bows above - TODO confirm
        loaded_corpus = corpora.MmCorpus('vsm_docs.mm') # Recover the corpus

        if ir_mode == 1:
             # Each entry w is indexed as (w[0], w[1]); the second item is
             # replaced by 1 + log2(w[1]).
             model = [[(w[0], 1 + np.log2(w[1])) for w in v] for v in bow] # TF model
        elif ir_mode == 2:
             model = models.TfidfModel(loaded_corpus) # TF IDF model
        elif ir_mode == 3:
             model = models.LdaModel(loaded_corpus) # LDA model
        elif ir_mode == 4:
             model = models.LdaMulticore(loaded_corpus) # LDA Multicore model
        elif ir_mode == 5:
             model = models.LsiModel(loaded_corpus) # LSI model
        elif ir_mode == 6:
             model = models.RpModel(loaded_corpus) # RP model
        elif ir_mode == 7:
             model = models.LogEntropyModel(loaded_corpus) # LogEntropyModel model
github RaRe-Technologies / gensim / src / gensim / corpora / wikiExternalParsingCorpus.py View on Github external
source = os.path.join(module_path, corpusname + '.bz2')
    # save the results to tmp
    output = os.path.join(tempfile.gettempdir(), corpusname)

    # build dictionary.
    logging.info("source: " + source)
    wiki = WikiExternParsingCorpus(source, keep_words=200000)

    # save dictionary and bag-of-words
    wiki.saveAsText(output)
    del wiki

    # initialize corpus reader and word->id mapping
    from gensim.corpora import MmCorpus
    id2token = WikiExternParsingCorpus.loadDictionary(output + '_wordids.txt')
    mm = MmCorpus(output + '_bow.mm')

    # build tfidf
    from gensim.models import TfidfModel
    tfidf = TfidfModel(mm, id2word=id2token, normalize=True)

    # save tfidf vectors in matrix market format
    MmCorpus.saveCorpus(output + '_tfidf.mm', tfidf[mm], progressCnt=10000)

    logging.info("finished running")
github nmslib / nmslib / data / data_conv / create_lsi.py View on Github external
#!/usr/bin/env python
"""Train an LSI model on the TF-IDF Wikipedia corpus and save it under LSI/.

Usage: create_lsi.py <num_topics>
"""
import bz2
import logging
import sys

import gensim

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Validate the command line first so a bad invocation fails immediately
# instead of after the dictionary and corpus have been opened.
if len(sys.argv) != 2:
  raise Exception("Usage: %s <num_topics>" % sys.argv[0])

ntop=int(sys.argv[1])

# load id->word mapping (the dictionary), one of the results of step 2 above
id2word = gensim.corpora.Dictionary.load_from_text('sparse_wiki_wordids.txt')
# load corpus iterator
mm = gensim.corpora.MmCorpus(bz2.BZ2File('sparse_wiki_tfidf.mm.bz2'))

# Single-argument print calls behave the same on Python 2 and Python 3.
print(mm)
print("Using " + str(ntop) + " topics ")

lsi = gensim.models.lsimodel.LsiModel(corpus=mm, id2word=id2word, num_topics=ntop, chunksize=10000)

lsi_file = 'LSI/lsi'+str(ntop)

lsi.save(lsi_file)

out_vect = 'LSI/wikipedia_lsi'+str(ntop)+'.txt'
github tesserae / tesserae / branches / semantics / gensim-whitaker / whitaker.group.py View on Github external
from gensim import corpora, models, similarities

# Progress messages use the call form of print with a single argument, which
# produces the same output on Python 2 and Python 3.

# define the data directory

path_data = "/Volumes/CWFDATA/semantics"

# load the data from whitaker.models.py

print("loading saved dictionary")

dictionary = corpora.Dictionary.load(path_data + "/whitaker.dict")

print("loading saved corpus")

corpus = corpora.MmCorpus(path_data + "/whitaker.mm")

print("loading saved tfidf model")

tfidf = models.TfidfModel.load(path_data + "/model.whitaker.tfidf")

print("creating tfidf wrapper for corpus")

# Indexing a model with a corpus wraps the corpus; documents are transformed
# as the wrapper is iterated.
corpus_tfidf = tfidf[corpus]

print("loading saved lsi model")

lsi = models.LsiModel.load(path_data + "/model.whitaker.lsi")

print("creating lsi wrapper for corpus")

# The LSI model is applied on top of the TF-IDF weighted corpus.
corpus_lsi = lsi[corpus_tfidf]
github nasa-jpl-memex / topic_space / topic_space / topic_modeling.py View on Github external
from __future__ import absolute_import

import itertools
import logging

import gensim
import numpy as np
import pandas as pd

from .topic_space.dictionaries import iter_corpus

# Log at INFO level with a timestamped format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


# Load the previously serialized Matrix Market corpus and its matching
# dictionary from the material_science output directory.
corpus_simple_mm = gensim.corpora.MmCorpus('material_science/output/simple.mm')
corpus_simple_dict = gensim.corpora.Dictionary.load('material_science/output/simple.dict')
print(corpus_simple_mm)

# Possible models:
# - LDA
# - LSI
# - TF-IDF

# Only the LDA option is built here: 10 topics, 4 passes over the corpus,
# with the dictionary supplying the id -> word mapping.
lda_model = gensim.models.LdaModel(corpus_simple_mm, num_topics=10, id2word=corpus_simple_dict, passes=4)


# Transforming

# transform text into the bag-of-words space
#bow_vector = id2word_wiki.doc2bow(tokenize(text))
#print([(id2word_wiki[id], count) for id, count in bow_vector])
github sujitpal / nltk-examples / src / topicmodel / lda_model.py View on Github external
import logging
import os
import gensim

# Directory holding the saved dictionary/corpus and receiving the CSV output.
MODELS_DIR = "models"
# Number of LDA topics to fit and to report.
NUM_TOPICS = 4

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

dictionary = gensim.corpora.Dictionary.load(os.path.join(MODELS_DIR, "bok.dict"))
corpus = gensim.corpora.MmCorpus(os.path.join(MODELS_DIR, "bok.mm"))

# Project to LDA space
lda = gensim.models.LdaModel(corpus, id2word=dictionary,
                             iterations=300,
                             num_topics=NUM_TOPICS)

# Write the top 50 terms of every topic as tab-separated rows:
# topic id, term (underscores shown as spaces), probability.
# The context manager closes the file even if a write fails part-way.
# NOTE(review): unpacking (prob, term) matches older gensim releases; current
# releases document show_topic as returning (word, probability) pairs. Writing
# str to a file opened in 'wb' mode also only works on Python 2. Confirm the
# target gensim/Python versions before changing either.
with open(os.path.join(MODELS_DIR, "topic_terms.csv"), 'wb') as ftt:
    for topic_id in range(NUM_TOPICS):
        term_probs = lda.show_topic(topic_id, topn=50)
        for prob, term in term_probs:
            ftt.write("%d\t%s\t%.3f\n" % (topic_id, term.replace("_", " "), prob))

fdt = open(os.path.join(MODELS_DIR, "doc_topics.csv"), 'wb')
for doc_id in range(len(corpus)):
    docbok = corpus[doc_id]
github nmslib / nmslib / data / data_conv / create_lda.py View on Github external
#!/usr/bin/env python
"""Train a 128-topic LDA model on the TF-IDF Wikipedia corpus.

The model is saved under LDA/ and the unit-normalized topic vectors of every
document are serialized in Matrix Market format next to it.
"""
import bz2
import logging

import gensim

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# load id->word mapping (the dictionary), one of the results of step 2 above
id2word = gensim.corpora.Dictionary.load_from_text('sparse_wiki_wordids.txt')
# load corpus iterator
mm = gensim.corpora.MmCorpus(bz2.BZ2File('sparse_wiki_tfidf.mm.bz2'))

# Single-argument print call behaves the same on Python 2 and Python 3.
print(mm)

ntop=128

# update_every=0 selects batch training; the corpus is traversed 20 times.
lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=id2word, num_topics=ntop, update_every=0, passes=20)

lda_file = 'LDA/lda'+str(ntop)

lda.save(lda_file)

out_vect = 'LDA/wikipedia_lda'+str(ntop)+'.txt'
# Stream the documents through the model, scaling each topic vector to unit
# length, and write them out without holding the whole result in memory.
gensim.corpora.MmCorpus.serialize(out_vect, (gensim.matutils.unitvec(vec) for vec in lda[mm]))
github nouhadziri / THRED / thred / topic_model / lda.py View on Github external
if not exists(mm_corpus_file):
        print("corpus not found. Starting to build it...")

        class CorpusWrapper:
            """Re-iterable view of ``corpus`` as bag-of-words vectors.

            Every iteration walks the ``corpus`` name from the surrounding
            scope and converts one token sequence at a time, so the converted
            documents are never all held in memory at once.
            """

            def __init__(self, dictionary):
                # Object whose ``doc2bow`` method converts tokens to a vector.
                self._dictionary = dictionary

            def __iter__(self):
                convert = self._dictionary.doc2bow
                for document in corpus:
                    yield convert(document)

        gensim.corpora.MmCorpus.serialize(mm_corpus_file, CorpusWrapper(dictionary))

    mm_corpus = gensim.corpora.MmCorpus(mm_corpus_file)

    # generate LDA model
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    ldamodel = gensim.models.LdaMulticore(mm_corpus,
                                          id2word=dictionary,
                                          alpha='asymmetric', eta='auto',
                                          num_topics=args.num_topics,
                                          passes=args.passes,
                                          eval_every=args.eval_every,
                                          batch=True,
                                          chunksize=args.chunksize,
                                          iterations=args.iterations)
    print("Saving LDA model...")
    ldamodel.save(join(model_dir, 'LDA.model'))

    print("Saving words for topics...")