How to use the gensim.corpora.Dictionary.load function in gensim

To help you get started, we’ve selected a few gensim examples that show how corpora.Dictionary.load is used in popular public projects.

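Before looking at the project snippets below, here is a minimal, self-contained sketch of the usual save/load round trip. The toy documents and the example.dict filename are made up purely for illustration; Dictionary.load simply restores a dictionary that was previously persisted with save.

from gensim import corpora

# toy corpus: each document is a pre-tokenized list of words (illustrative data only)
documents = [
    ["human", "machine", "interface"],
    ["graph", "of", "trees"],
]

# build the id <-> term mapping and persist it to disk
dictionary = corpora.Dictionary(documents)
dictionary.save("example.dict")

# later, e.g. in another process, restore the mapping with Dictionary.load
loaded = corpora.Dictionary.load("example.dict")
print(loaded.token2id)                      # e.g. {'graph': 3, 'human': 0, ...}
bow = loaded.doc2bow(["graph", "machine"])  # reuse the loaded dictionary on new text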

github infolab-usc / bdr-tweet / word2vec_tweet_filter.py
                    my_columns = ["choose_one", "text", "none"]

                    df = pd.DataFrame(arr, columns=my_columns)
                    df['choose_one:confidence'] = df['choose_one'].map(
                        lambda x: 1 if x == "Not Relevant" or x == "Relevant" else 0.5)

                elif disaster_type[ij] == "fire":
                    dimensions = 350

                    stem_map_high = json.load(open('./data/disasters/classify/fire_stem_map_high.json'))
                    stem_map_low = json.load(open('./data/disasters/classify/fire_stem_map_low.json'))
                    low_2_high_map = json.load(open('./data/disasters/classify/fire_low_2_high_map.json'))

                    word2vec_flag = 1

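                    # load the pre-built gensim dictionary, TF-IDF and LSI models saved for the fire classifier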
                    dictionary = corpora.Dictionary.load('./data/disasters/classify/fire_model.dict')
                    tfidf = models.TfidfModel.load('./data/disasters/classify/fire_model.tfidf')
                    lsi = models.LsiModel.load('./data/disasters/classify/fire_model.lsi')

                    model_flag = 1

                    input_file = "./data/disasters/classify/fire.csv"

                    arr = []
                    with open(input_file) as f:
                        for line in f:
                            a = [x.strip() for x in line.split(',')]
                            arr.append(a)

                    # df = np.array(arr)
                    # df = pd.read_csv(input_file, encoding="ISO-8859-1", delimiter=",")
                    my_columns = ["choose_one", "text", "none"]
github kethort / TwitterLDATopicModeling / src / tweets_on_LDA.py
    parser.add_argument('-t', '--topology_file', required=True, action='store', dest='top_file', help='Location of topology file')
    parser.add_argument('-p', '--dir_prefix', choices=['clique', 'community'], required=True, action='store', dest='dir_prefix', help='Select whether the topology contains cliques or communities')
    parser.add_argument('-w', '--working_dir', required=True, action='store', dest='working_dir', help='Name of the directory you want to direct output to')
    parser.add_argument('-l', '--lda_loc', required=True, action='store', dest='lda_loc', help='Location of the saved LDA model')
    parser.add_argument('-d', '--dict_loc', required=True, action='store', dest='dict_loc', help='Location of dictionary for the model')
    parser.add_argument('-u', '--unseen_docs', required=True, action='store', dest='unseen_docs', help='Directory containing unseen documents')
    parser.add_argument('-m', '--lemma', action='store_true', dest='lemma', help='Use this option to lemmatize words')
    argcomplete.autocomplete(parser)
    args = parser.parse_args()

    output_dir = os.path.join(args.working_dir, '')
    if not os.path.exists(os.path.dirname(output_dir)):
        os.makedirs(os.path.dirname(output_dir), 0o755)

    # load dictionary
    model_dict = corpora.Dictionary.load(args.dict_loc)
    # load trained model from file
    lda = models.LdaModel.load(args.lda_loc)
    write_topn_words(output_dir, lda)

    # create a set of all users from topology file
    with open(args.top_file, 'r') as inp_file:
        users = set(str(user) for community in inp_file for user in ast.literal_eval(community))

    # resume a 'job in progress' if this program was run before and stopped partway
    try:
        with open(output_dir + 'document_vectors.json', 'r') as all_community_file:
            document_vectors = json.load(all_community_file)
    except (IOError, ValueError):  # no saved progress yet, or the file could not be parsed
        document_vectors = {}

    # use multiprocessing to query document vectors
github KeithYue / Zhihu_Spider / zhihu / zhihu_config.py
# this is for corpus data
ZHIHU_ITEM_PATH = './zhihu_dat/item.dat'
ZHIHU_USER_PATH = './zhihu_dat/users.dat'

# this is for adj data
ZHIHU_ITEM_ADJ = './zhihu_dat/item_adj.dat'
ZHIHU_USER_ADJ = './zhihu_dat/user_adj.dat'

# this is for ground truth data
ZHIHU_TRUTH_ADJ = './zhihu_dat/truth.dat'

# this is for user profile data
ZHIHU_USER_Q_NUMBER = './zhihu_dat/user_q_num.dat'
ZHIHU_USER_Q_SCORE = './zhihu_dat/user_q_score.dat'

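# gensim dictionary, loaded once when this config module is imported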
dictionary = corpora.Dictionary.load(ZHIHU_DICT_PATH)

# formatted user topic data
ZHIHU_USER_TOPIC_PATH = './zhihu_dat/zhihu_user_topic.dat'
github carpedm20 / attentive-reader-tensorflow / data_utils.py
    cat
  will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
  also return the reversed-vocabulary ["dog", "cat"].

  Args:
    vocabulary_path: path to the file containing the vocabulary.

  Returns:
    a pair: the vocabulary (a dictionary mapping string to integers), and
    the reversed vocabulary (a list, which reverses the vocabulary mapping).

  Raises:
    ValueError: if the provided vocabulary_path does not exist.
  """
  if gfile.Exists(vocabulary_path):
    vocab = corpora.Dictionary.load(vocabulary_path)
    return vocab.token2id, [vocab[i] for i in range(len(vocab))]  # token-to-id map and id-ordered token list
  else:
    raise ValueError("Vocabulary file %s not found." % vocabulary_path)
github NLeSC / cptm / cptm / CPTCorpus.py
def load_dictionaries(self, topicDict=None, opinionDict=None):
        if topicDict:
            self.topicDictionary = corpora.Dictionary.load(topicDict)
            logger.info('topic dictionary {}'.format(self.topicDictionary))
        if opinionDict:
            self.opinionDictionary = corpora.Dictionary.load(opinionDict)
            logger.info('opinion dictionary {}'.format(self.opinionDictionary))
github danrobinson / taggernews / articles / management / commands / tag_articles.py
def init_from_files(cls, topic_model_fname, gensim_dict_fname, lr_dict_fname,
                      *args, **kwargs):
    topic_modeler = models.ldamodel.LdaModel.load(topic_model_fname)
    gensim_dict = corpora.Dictionary.load(gensim_dict_fname)
    lr_dict = joblib.load(lr_dict_fname)
    return cls(topic_modeler, gensim_dict, lr_dict, *args, **kwargs)
github SeanTater / uncc2014watsonsim / scripts / gensim / intro.py
#!/usr/bin/env python
import logging
import os
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim import corpora, models, similarities, matutils
from stoplist import stoplist
from vstore import VStore
import digestion


SOURCE = "wiki3"

### Create the corpus out of the documents
if os.path.exists(SOURCE+'.corpus.mm'):
    # Query mode
    unidict = corpora.Dictionary.load(SOURCE + '.dict')
    unilsi = models.LsiModel.load(SOURCE + '.unilsimodel')
    uniindex = similarities.MatrixSimilarity.load(SOURCE + ".matsim")
else:
    # Index mode
    # collect statistics about all tokens
    unidict = digestion.line_dict(SOURCE);
    filter_ids = set(unidict.token2id[stopword] for stopword in stoplist
        if stopword in unidict.token2id) # stopwords
    filter_ids.update(set([unidict.token2id[fragment] for fragment in unidict.token2id
        if len(fragment) == 1])) # short words
    filter_ids.update(set([tokenid for tokenid, docfreq in unidict.dfs.iteritems()
        if docfreq == 1])) # hapax legomena
    unidict.filter_tokens(filter_ids) # remove stop words and words that appear only once
    unidict.compactify() # remove gaps in id sequence after words that were removed
    unidict.save(SOURCE + '.dict')
    print(unidict)
github dataroot / Kaggle-CV / proj / processors.py
def __init__(self, oblige_fit, path):
        super().__init__(oblige_fit, path)

        with open(path + 'tags_embs.pkl', 'rb') as file:
            self.embs = pickle.load(file)

        self.tp = TextProcessor(path)
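        # load the pre-trained gensim artifacts: dictionary, TF-IDF model, LDA model and Doc2Vec model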
        self.lda_dic = Dictionary.load(path + 'questions.lda_dic')
        self.lda_tfidf = TfidfModel.load(path + 'questions.lda_tfidf')
        self.lda_model = LdaMulticore.load(path + 'questions.lda_model')
        self.d2v = Doc2Vec.load(path + 'questions.d2v')

        self.features = {
            'categorical': [],
            'numerical': {
                'zero': ['questions_body_length', 'questions_tag_count'],
                'mean': []
            },
            'date': ['questions_date_added']
        }

        self._unroll_features()
github lluisgomez / TextTopicNet / LDA / learn_LDA_model.py
        # add tokens to corpus list
        texts.append(preprocess(raw))
        sys.stdout.write('\rCreating a list of tokenized documents: %d/%d documents processed...' % (len(texts),len(train_dict.values())))
        sys.stdout.flush()
    sys.stdout.write(' Done!\n')

# turn our tokenized documents into an id <-> term dictionary
if not os.path.isfile('./dictionary.dict'):
    print 'Turn our tokenized documents into an id <-> term dictionary ...',
    sys.stdout.flush()
    dictionary = corpora.Dictionary(texts)
    dictionary.save('./dictionary.dict')
else:
    print 'Loading id <-> term dictionary from ./dictionary.dict ...',
    sys.stdout.flush()
    dictionary = corpora.Dictionary.load('./dictionary.dict')
print ' Done!'

# ignore words that appear in less than 20 documents or more than 50% documents
dictionary.filter_extremes(no_below=20, no_above=0.5)
    
# convert tokenized documents into a document-term matrix
if not os.path.isfile('./bow.mm'):
    print 'Convert tokenized documents into a document-term matrix ...',
    sys.stdout.flush()
    corpus = [dictionary.doc2bow(text) for text in texts]
    gensim.corpora.MmCorpus.serialize('./bow.mm', corpus)
else:
    print 'Loading document-term matrix from ./bow.mm ...',
    sys.stdout.flush()
    corpus = gensim.corpora.MmCorpus('./bow.mm')
print ' Done!'