How to use the gensim.corpora.Dictionary function in gensim

To help you get started, we’ve selected a few gensim.corpora.Dictionary examples, based on popular ways it is used in public projects.

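Before diving into the project snippets below, here is a minimal, self-contained sketch of the typical workflow (the toy documents are invented for illustration): build a Dictionary from tokenized texts, inspect its token2id mapping, and convert documents into sparse bag-of-words vectors with doc2bow.

from gensim import corpora

# Toy corpus: each document is a list of tokens (hypothetical data).
texts = [["human", "computer", "interaction"],
         ["computer", "graphics"],
         ["human", "graphics", "survey"]]

# Dictionary assigns every unique token an integer id.
dictionary = corpora.Dictionary(texts)
print(dictionary.token2id)  # e.g. {'computer': 0, 'human': 1, 'interaction': 2, ...}

# doc2bow turns a tokenized document into a sparse list of (token_id, count) pairs.
print(dictionary.doc2bow(["human", "computer", "computer"]))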

github fastnlp / fastNLP / reproduction / CNN-sentence_classification / dataset.py View on Github external
def __init__(self):

        # load positive and negative sentences from files
        with codecs.open("./rt-polaritydata/rt-polarity.pos",encoding ='ISO-8859-1') as f:
            positive_examples = list(f.readlines())
        with codecs.open("./rt-polaritydata/rt-polarity.neg",encoding ='ISO-8859-1') as f:
            negative_examples = list(f.readlines())
        # s.strip(): strip "\n"; then clean_str and pad_sentences
        positive_examples = [pad_sentences(clean_str(s.strip())) for s in positive_examples]
        negative_examples = [pad_sentences(clean_str(s.strip())) for s in negative_examples]
        self.examples = positive_examples + negative_examples
        self.sentences_texts = [sample.split() for sample in self.examples]

        #word dictionary
        dictionary = corpora.Dictionary(self.sentences_texts) 
        self.word2id_dict = dictionary.token2id  # transform to dict, like {"human":0, "a":1,...}

        # set labels: positive is 1; negative is 0
        positive_labels = [1 for _ in positive_examples]
        negative_labels = [0 for _ in negative_examples]
        self.lables = positive_labels + negative_labels
        examples_lables = list(zip(self.examples,self.lables))
        random.shuffle(examples_lables)
        self.MRDataset_frame = examples_lables

        # transform words to ids
        self.MRDataset_wordid = \
            [(
                np.array([self.word2id_dict[word] for word in sent[0].split()], dtype=np.int64), 
                sent[1]
            ) for sent in self.MRDataset_frame]
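The manual word-to-id lookup at the end of this snippet can also be written with Dictionary.doc2idx, which maps every token of a document to its integer id in one call (unknown tokens default to -1). A minimal sketch with invented data:

from gensim import corpora

# Hypothetical toy corpus; the exact ids depend on the dictionary's assignment order.
dictionary = corpora.Dictionary([["a", "human", "survey"], ["a", "computer"]])
print(dictionary.doc2idx(["a", "computer", "unseen_word"]))  # e.g. [0, 3, -1]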
github SmartDataAnalytics / horus-ner / src / horus / experiments / text_classification / topic_modeling.py View on Github external
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in doc_complete]

get_histogram(doc_clean)
exit(0)

# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and training the LDA model on the document-term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)
print(ldamodel.print_topics(num_topics=3, num_words=20))
github dipanjanS / text-analytics-with-python / Old-First-Edition / source_code / Ch05_Text_Summarization / keyphrase_extraction.py View on Github external
def get_tfidf_weighted_keyphrases(sentences, 
                                  grammar=r'NP: {<DT>? <JJ>* <NN.*>+}',
                                  top_n=10):
    
    valid_chunks = get_chunks(sentences, grammar=grammar)
                                     
    dictionary = corpora.Dictionary(valid_chunks)
    corpus = [dictionary.doc2bow(chunk) for chunk in valid_chunks]
    
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    
    weighted_phrases = {dictionary.get(id): round(value,3) 
                        for doc in corpus_tfidf 
                        for id, value in doc}
                            
    weighted_phrases = sorted(weighted_phrases.items(), 
                              key=itemgetter(1), reverse=True)
    
    return weighted_phrases[:top_n]
github motazsaad / comparable-text-miner / textpro.py View on Github external
	if not output_path.endswith('/'): output_path = output_path + '/'
	check_dir(output_path) # if directory does not exist, then create
	
	logging.info( 'building gensim corpus and dictionary for %s corpus', corpus_name )
	logging.info( 'loading corpus' )
	texts = [[word for word in process_text(document, removePunct=True, removeSW=True, removeNum=True)] for document in corpus]
	logging.info( 'tokenizing' )
	all_tokens = [item for sublist in texts for item in sublist]
	logging.info( 'mark tokens which have frequency less than %d', min_freq )
	tokens_once = set([k for k, v in collections.Counter(all_tokens).iteritems() if v < min_freq ])
	logging.info( '|D|=%d' , len(texts) )
	logging.info( 'filter low frequency tokens' )
	texts = [[word for word in text if word not in tokens_once] for text in texts]
	logging.info( '|D|=%d' , len(texts) )
	logging.info( 'building dictionary' )
	dictionary = corpora.Dictionary(texts)
	logging.info( 'saving dictionary' )
	dictFile = output_path + corpus_name + '.dict'
	dictionary.save(dictFile) 
	logging.info( 'building corpus in  mm format' )
	corpus = [dictionary.doc2bow(text) for text in texts]
	logging.info( 'saving corpus' )
	gensim_corpus_file = output_path + corpus_name + '.mm'
	corpora.MmCorpus.serialize(gensim_corpus_file, corpus)
	logging.info( 'computing tfidf' )
	tfidf = models.TfidfModel(corpus) # tfidf model 
	corpus_tfidf = tfidf[corpus] # tfidf corpus 
	logging.info( 'saving tfidf corpus' )
	corpus_tfidf_file = output_path + corpus_name + '.tfidf.mm'
	corpora.MmCorpus.serialize(corpus_tfidf_file, corpus_tfidf)
	logging.info( 'gensim corpus is ready' )
##################################################################################
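The function above drops low-frequency tokens by hand before building the dictionary. A similar effect is available through Dictionary.filter_extremes, with the caveat that its no_below argument is a document-frequency threshold (how many documents contain the token) rather than the raw corpus count used above. A hedged sketch with invented data:

from gensim import corpora

# Hypothetical tokenized corpus standing in for `texts` in the snippet above.
texts = [["apple", "banana", "apple"], ["banana", "cherry"], ["cherry", "durian"]]

dictionary = corpora.Dictionary(texts)
# Keep only tokens that appear in at least 2 documents; no cap on vocabulary size.
dictionary.filter_extremes(no_below=2, no_above=1.0, keep_n=None)
corpus = [dictionary.doc2bow(text) for text in texts]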
github daviddwlee84 / MachineLearningPractice / Algorithm / VSM / VSM_Document_Similarity / VSM_Document_Similarity_Gensim.py View on Github external
def VSM(articleMatrix):
    dictionary = corpora.Dictionary(articleMatrix) # Build the dictionary from the tokenized articles
    corpus = [dictionary.doc2bow(article) for article in articleMatrix]   # For each article create a bag-of-words

    # Use TF-IDF Model
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # Similarity Matrix
    tmp_file = get_tmpfile("vsm_similarity")
    similarity = similarities.Similarity(tmp_file, corpus_tfidf, num_features=len(dictionary))

    # Calculate similarity
    similarityMat = similarity[corpus_tfidf]
    return similarityMat
github burness / tensorflow-101 / nlp / text_classifier / scripts / cnn_text_classifier_v2.py View on Github external
def __init__(self, data_path):
        self.data_path = data_path
        self.dictionary = corpora.Dictionary()
        self.corpus = []
        self.labels = []
        self.cut_doc_obj = cutDoc()
        self.w2v_file = W2V_FILE
        self.class_num = CLASS_NUM
        self.filter_sizes = (3, 8)
        self.num_filters = 10
        self.hidden_dims = 64
github wanZzz6 / smart_robot / Lsi_gensim.py View on Github external
    global dictionary, lsi, raw_docs

    print('正在查询问题。。。')
    raw_docs = read_quesion()
    print(raw_docs)

    # If the model has already been trained and the custom questions have not changed, the saved index can be loaded and returned directly
    # if 'Lsi_matrix.index' in os.listdir('.'):
    #     index = similarities.SparseMatrixSimilarity.load('Lsi_matrix.index')
    #     return index

    # No saved model, or the custom questions have changed: retrain.
    all_doc_list = [list(jieba.cut(doc)) for doc in raw_docs]
    # Build the bag-of-words dictionary
    dictionary = corpora.Dictionary(all_doc_list)
    # Corpus:
    corpus = [dictionary.doc2bow(doc) for doc in all_doc_list]
    # Train an LSI model on the (initial) corpus
    lsi = models.LsiModel(corpus)
    # Text similarity
    # Sparse similarity matrix
    index = similarities.SparseMatrixSimilarity(
        lsi[corpus], num_features=len(dictionary.keys()))
    # Save the similarity index
    index.save('Lsi_matrix.index')
    return index
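Once the index is built, a new question can be matched against the stored ones by pushing it through the same dictionary and LSI model. A rough usage sketch, assuming the dictionary, lsi, index, and raw_docs objects created above (the query string is invented):

import jieba

query = "如何重置密码"  # hypothetical query: "how do I reset my password"
query_bow = dictionary.doc2bow(list(jieba.cut(query)))
sims = index[lsi[query_bow]]  # similarity of the query to every stored question
best = sims.argmax()
print(raw_docs[best], sims[best])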
github desmarais-lab / text_reuse / etl / alignments / process_alignment_results.py View on Github external
def __init__(self, infile, stemmer, remove_same_state, type="all"):
        self.infile = infile
        self.exclude = set(['', ' '])
        self.size = 0
        self.dictionary = corpora.Dictionary()
        self.stemmer = stemmer
        self.remove_same_state = remove_same_state
        self.schar = re.compile('[^A-Za-z]')
        self.type = type
        self.no_align = 0
github ContinuumIO / topik / topik / vectorizers.py View on Github external
def __init__(self, corpus):
        self.corpus = corpus
        self.iter_1, self.iter_2 = itertools.tee(self.corpus, 2)
        self.tokens = [tokens for tokens in iter_corpus(self.iter_1)]
        # Create dictionary out of input corpus tokens
        self.dict = gensim.corpora.Dictionary(self.tokens)
        self.filename = None