How to use the gensim.matutils module in gensim

To help you get started, we’ve selected a few gensim examples, based on popular ways it is used in public projects.

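Most of the snippets below lean on a handful of matutils helpers: corpus2csc turns a streamed bag-of-words corpus into a scipy.sparse CSC matrix, argsort picks the indices of the largest entries of a vector, unitvec scales a vector to unit length for cosine comparisons, and zeros_aligned allocates aligned per-thread work buffers. As a minimal sketch before the real-world code (the three toy documents below are made up for illustration):

import numpy as np
from gensim import corpora, matutils

# toy corpus: tokenised documents -> Dictionary -> bag-of-words
texts = [["human", "interface", "computer"],
         ["survey", "user", "computer", "system"],
         ["graph", "minors", "survey"]]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]

# corpus2csc builds a sparse term-document matrix of shape (num_terms, num_docs)
term_doc = matutils.corpus2csc(bow_corpus, num_terms=len(dictionary))
print(term_doc.shape)

# argsort returns the indices of the largest entries, here the two most
# frequent terms of the first document
col = np.asarray(term_doc.getcol(0).todense()).ravel()
top = matutils.argsort(col, topn=2, reverse=True)
print([(dictionary[int(i)], col[i]) for i in top])

# unitvec length-normalises a vector (the building block of cosine similarity)
print(matutils.unitvec(np.array([3.0, 4.0])))  # -> [0.6 0.8]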

github ghmagazine / awesomebook / preprocess / 011_character / 02 / python_awesome.py (View on GitHub)
  if not (n.is_bos() or n.is_eos()):
    part, word = n.feature.split(',', 1)
    if part == "名詞" or part == "動詞":  # keep only nouns (名詞) and verbs (動詞)
      word_list.append(n.surface)

  # append the word list for this text file
  txt_word_list.append(word_list)

# to build the bag of words, collect every distinct word and create a dictionary that assigns each word an ID
corpus_dic = corpora.Dictionary(txt_word_list)

# convert each document's word list into a corpus (pairs of dictionary word ID and word count)
corpus_list = [corpus_dic.doc2bow(word_in_text) for word_in_text in txt_word_list]

# convert the corpus list into a sparse matrix (CSC format)
word_matrix = matutils.corpus2csc(corpus_list)
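Note that matutils.corpus2csc returns the matrix with one row per term and one column per document. If you later need a streamed gensim corpus again (for example to feed another model), matutils.Sparse2Corpus does the reverse conversion; a short sketch reusing the word_matrix built above:

from gensim import matutils

print(word_matrix.shape)  # (num_terms, num_docs): one column per document

# convert the sparse matrix back into a streamed gensim corpus;
# documents_columns=True because documents are the columns here
roundtrip = matutils.Sparse2Corpus(word_matrix, documents_columns=True)
first_doc = next(iter(roundtrip))  # list of (word_id, count) pairs
print(first_doc[:5])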
github HKUST-KnowComp / MNE / MNE.py (View on GitHub)
        def worker_loop():
            """Train the model, lifting lists of sentences from the job_queue."""
            work = matutils.zeros_aligned(self.layer1_size, dtype=REAL)  # per-thread private work memory
            neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
            jobs_processed = 0
            while True:
                job = job_queue.get()
                if job is None:
                    progress_queue.put(None)
                    break  # no more jobs => quit this worker
                sentences, alpha = job
                tally, raw_tally = self._do_train_job(sentences, alpha, (work, neu1))
                progress_queue.put((len(sentences), tally, raw_tally))  # report back progress
                jobs_processed += 1
            logger.debug("worker exiting, processed %i jobs", jobs_processed)
github KBNLresearch / keyword-generator / keywords_lda.py (View on GitHub)
        lda = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus,
                id2word=dictionary, num_topics=num_topics)
        topics = lda.show_topics(num_topics=num_topics, num_words=num_words,
                formatted=False)
        distributions = [dist for dist in lda.load_document_topics()]
    else:
        print('Generating model with Gensim LDA ...')
        lda = gensim.models.LdaModel(corpus, id2word=dictionary,
                num_topics=num_topics, alpha='auto', chunksize=1, eval_every=1)
        gensim_topics = [t[1] for t in lda.show_topics(num_topics=num_topics,
                num_words=num_words, formatted=False)]
        topics = [[(i[1], i[0]) for i in t] for t in gensim_topics]
        distributions = []
        matrix = gensim.matutils.corpus2csc(corpus)
        for i in range(matrix.get_shape()[1]):
            bow = gensim.matutils.scipy2sparse(matrix.getcol(i).transpose())
            distributions.append(lda.get_document_topics(bow, 0))

    topics = exclude_topics(topics)
    keywords = generate_keywords(corpus, dictionary, topics, num_keywords)

    print_keywords(keywords)
    save_keywords(keywords)
    save_topics(topics)
    save_distributions(distributions)
github largelymfs / topical_word_embeddings / TWE-3 / gensim / models / word2vec.py (View on GitHub)
        def worker_train():
            """Train the model, lifting lists of sentences from the jobs queue."""
            work = zeros(self.layer2_size, dtype=REAL)  # each thread must have its own work memory
            neu1 = matutils.zeros_aligned(self.layer2_size, dtype=REAL)

            while True:
                job = jobs.get()
                if job is None:  # data finished, exit
                    break
                # update the learning rate before every job
                alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words))
                # how many words did we train on? out-of-vocabulary (unknown) words do not count
                if self.sg:
                    job_words = sum(train_sentence_sg(self, sentence, alpha, work) for sentence in job)
                else:
                    job_words = sum(train_sentence_cbow(self, sentence, alpha, work, neu1) for sentence in job)
                with lock:
                    word_count[0] += job_words
                    elapsed = time.time() - start
                    if elapsed >= next_report[0]:
github desmarais-lab / text_reuse / etl / ncsl_cosim / ncsl_cosim.py (View on GitHub)
    # Reduce vocabulary
    dictionary.filter_extremes(no_below=2, no_above=1.0, keep_n=None)

    print('Generate term document matrix...')
    corpus = []
    for i,bill in enumerate(bills):
        text = get_bill_text(bill)
        if text is None:
            continue
        tokens = cleaner.clean(text)
        corpus.append(dictionary.doc2bow(tokens))

        if i % 100 == 0:
            print(i)
    
    dtm = matutils.corpus2csc(corpus).transpose()

    print('Calculating similarities')
    csims = cosine_similarity(dtm)

    # Store output (store the complete matrix w/o diagonal, in case
    # left and right bills differ in the other datasets)
    outline = '{},{},{}\n'
    with open('../../data/ncsl/cosine_similarities.csv', 'w') as outfile:
        outfile.write(outline.format('left_doc_id',
                                     'right_doc_id',
                                     'cosine_similarity'))

        for i in range(len(ids_with_text)):
            for j in range(len(ids_with_text)):
                if i == j:
                    continue
github RaRe-Technologies / gensim / gensim / models / ldamodel.py (View on GitHub)
        Parameters
        ----------
        topicid : int
            The ID of the topic to be returned
        topn : int, optional
            Number of the most significant words that are associated with the topic.

        Returns
        -------
        list of (int, float)
            Word ID - probability pairs for the most relevant words generated by the topic.

        """
        topic = self.get_topics()[topicid]
        topic = topic / topic.sum()  # normalize to probability distribution
        bestn = matutils.argsort(topic, topn, reverse=True)
        return [(idx, topic[idx]) for idx in bestn]
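A minimal usage sketch for the method documented above (get_topic_terms), assuming a trained LdaModel named lda and its Dictionary named dictionary, both hypothetical names:

# top 5 (word_id, probability) pairs for topic 0, ranked via matutils.argsort
for word_id, prob in lda.get_topic_terms(topicid=0, topn=5):
    print(dictionary[word_id], round(float(prob), 4))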
github kethort / TwitterLDATopicModeling / lda_tuna.py (View on GitHub)
def extract_data(topic_model, corpus, dictionary, doc_topic_dists=None):
    
    if not matutils.ismatrix(corpus):
        corpus_csc = matutils.corpus2csc(corpus, num_terms=len(dictionary))
    else:
        corpus_csc = corpus
        # Need corpus to be a streaming gensim list corpus for len and inference functions below:
        corpus = matutils.Sparse2Corpus(corpus_csc)

    # TODO: add the hyperparam to smooth it out? no beta in online LDA impl.. hmm..
    # for now, I'll just make sure we don't ever get zeros...
    fnames_argsort = np.asarray(list(dictionary.token2id.values()), dtype=np.int_)
    doc_lengths = corpus_csc.sum(axis=0).A.ravel()

    assert doc_lengths.shape[0] == len(corpus), 'Document lengths and corpus have different sizes {} != {}'.format(doc_lengths.shape[0], len(corpus))

    if hasattr(topic_model, 'lda_alpha'):
        num_topics = len(topic_model.lda_alpha)
    else:
github JayveeHe / MusicTaster / song2vec / song2vec_operator.py (View on GitHub)
                    elif word in self.artist2vec_model.vocab:
                        mean.append(weight * self.artist2vec_model.syn0norm[self.artist2vec_model.vocab[artist].index])
                        all_words.add(self.artist2vec_model.vocab[artist].index)
                    else:
                        raise KeyError("artist '%s' not in vocabulary" % artist)
            if not mean:
                raise ValueError("cannot compute similarity with no input")
            mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)
            limited = self.song2vec_model.syn0norm if restrict_vocab is None \
                else self.song2vec_model.syn0norm[:restrict_vocab]
            # limited += self.artist2vec_model.syn0norm if restrict_vocab is None \
            #     else self.artist2vec_model.syn0norm[:restrict_vocab]
            dists = dot(limited, mean)
            if not topn:
                return dists
            best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
            # ignore (don't return) words from the input
            result = [(self.song2vec_model.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
            return result[:topn]
        except Exception, e:
            print 'error = %s' % e
            raise e
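Stripped of the model-specific bookkeeping, the core of this most_similar-style lookup is just two matutils calls: unitvec to length-normalise the averaged query vector and argsort to rank the dot products. A self-contained sketch with made-up random vectors:

import numpy as np
from gensim import matutils

rng = np.random.default_rng(0)
vectors = rng.standard_normal((100, 20)).astype(np.float32)  # stand-in embedding matrix
vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)     # unit-length rows

query = matutils.unitvec(vectors[:3].mean(axis=0))            # averaged, normalised query
dists = vectors.dot(query)                                    # cosine similarities
best = matutils.argsort(dists, topn=5, reverse=True)          # indices of the closest rows
print([(int(i), float(dists[i])) for i in best])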
github RaRe-Technologies / gensim / gensim / models / word2vec.py (View on GitHub)
        word_vocabs = [self.wv.vocab[w] for w in context_words_list if w in self.wv.vocab]
        if not word_vocabs:
            warnings.warn("All the input context words are out-of-vocabulary for the current model.")
            return None

        word2_indices = [word.index for word in word_vocabs]

        l1 = np_sum(self.wv.vectors[word2_indices], axis=0)
        if word2_indices and self.cbow_mean:
            l1 /= len(word2_indices)

        # propagate hidden -> output and take softmax to get probabilities
        prob_values = exp(dot(l1, self.trainables.syn1neg.T))
        prob_values /= sum(prob_values)
        top_indices = matutils.argsort(prob_values, topn=topn, reverse=True)
        # returning the most probable output words with their probabilities
        return [(self.wv.index2word[index1], prob_values[index1]) for index1 in top_indices]
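The snippet above comes from Word2Vec.predict_output_word in the gensim 3.x API; calling it only requires a model trained with negative sampling. A small example with a made-up toy corpus:

from gensim.models import Word2Vec

sentences = [["the", "quick", "brown", "fox"],
             ["the", "lazy", "dog", "sleeps"],
             ["the", "quick", "dog", "runs"]] * 50  # tiny toy corpus

# negative sampling (negative > 0, the default) is required for predict_output_word
model = Word2Vec(sentences, size=10, min_count=1, negative=5, sg=0)

# rank the vocabulary by predicted probability given the context words;
# internally this is the matutils.argsort call shown above
print(model.predict_output_word(["the", "quick"], topn=3))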