How to use the scattertext.indexstore.IndexStore.IndexStore class in scattertext

To help you get started, we’ve selected a few scattertext examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github JasonKessler / scattertext / scattertext / CorpusFromFeatureDict.py View on Github external
name of column in convention_df with a feature dictionary
		metadata_col : str, optional
				name of column in convention_df with a meatadata dictionary
		parsed_col : str, optional
				name of column in convention_df with parsed strings
		'''
		self._df = df.reset_index()
		self._category_col = category_col
		self._text_col = text_col
		self._feature_col = feature_col
		self._parsed_col = parsed_col
		self._metadata_col = metadata_col
		self._category_idx_store = IndexStore()
		self._X_factory = CSRMatrixFactory()
		self._mX_factory = CSRMatrixFactory()
		self._term_idx_store = IndexStore()
		self._metadata_idx_store = IndexStore()
github JasonKessler / scattertext / scattertext / TermDocMatrixWithoutCategories.py View on Github external
def add_doc_names_as_metadata(self, doc_names):
        '''
        Register one metadata entry per document, holding that document's name.

        :param doc_names: array-like[str], the name of each document, one per document
        :return: whatever ``self.add_metadata`` returns for the new metadata matrix and
        index store. A name shared by several documents gets a running number
        appended, e.g. ``name (1)``, ``name (2)``.
        :raises Exception: if the number of names differs from the number of documents
        '''
        num_docs = self.get_num_docs()
        if len(doc_names) != num_docs:
            raise Exception("The parameter doc_names contains %s elements. "
                            "It should have %s elements, one per document." % (len(doc_names), num_docs))

        # How often each name occurs overall, and how often it has been seen so far.
        name_totals = collections.Counter(np.array(doc_names))
        name_seen = collections.Counter()
        meta_idx_store = IndexStore()
        meta_factory = CSRMatrixFactory()
        for doc_i in range(num_docs):
            name = doc_names[doc_i]
            if name_totals[name] > 1:
                # Repeated name: number the occurrences to tell them apart.
                name_seen[name] += 1
                label = '%s (%s)' % (name, name_seen[name])
            else:
                label = name
            meta_factory[doc_i, doc_i] = meta_idx_store.getidx(label)
        return self.add_metadata(meta_factory.get_csr_matrix(), meta_idx_store)
github JasonKessler / scattertext / scattertext / TermDocMatrixFactory.py View on Github external
def _build_from_category_spacy_doc_iter(self, category_doc_iter):
        '''
        Assemble a TermDocMatrix from (category, parsed document) pairs.

        Three fresh index stores (terms, categories, metadata) are created, passed to
        ``_get_features_and_labels_from_documents_and_indexes`` along with the
        iterator, and then attached to the returned matrix.

        Parameters
        ----------
        category_doc_iter : iterator of (string category name, spacy.tokens.doc.Doc) pairs

        Returns
        -------
        t : TermDocMatrix
        '''
        idx_stores = {
            'term_idx_store': IndexStore(),
            'category_idx_store': IndexStore(),
            'metadata_idx_store': IndexStore(),
        }
        # The helper takes the stores positionally: category, term, metadata.
        X, mX, y = self._get_features_and_labels_from_documents_and_indexes(
            category_doc_iter,
            idx_stores['category_idx_store'],
            idx_stores['term_idx_store'],
            idx_stores['metadata_idx_store'],
        )
        return TermDocMatrix(X, mX, y, **idx_stores)
github JasonKessler / scattertext / scattertext / indexstore / IndexStore.py View on Github external
def batch_delete_idx(self, idx_list):
		'''
		Build a new IndexStore without the values stored at the given indices.

		The surviving values keep their relative order and are renumbered
		consecutively from zero. This store's ``_i2val`` is only read, never
		written.

		Parameters
		----------
		idx_list : iterable of int
			Indices of the values to drop. Repeated indices are tolerated.

		Returns
		-------
		IndexStore
			A new store holding the remaining values.

		Raises
		------
		ValueError
			If an index is negative, or is not smaller than the number of
			values in this store.
		'''
		new_idxstore = IndexStore()
		last_idx_to_delete = -1
		number_of_values = self.getnumvals()
		for idx_to_delete in sorted(idx_list):
			# A negative index must be rejected explicitly: as a slice bound it
			# would wrap around and copy values into the new store twice.
			if idx_to_delete < 0 or idx_to_delete >= number_of_values:
				raise ValueError('index ' + str(idx_to_delete) + ' not found')
			# Copy the run of kept values between the previous deletion and this one.
			new_idxstore._i2val += self._i2val[last_idx_to_delete + 1:idx_to_delete]
			last_idx_to_delete = idx_to_delete
		# Copy whatever follows the last deleted index.
		new_idxstore._i2val += self._i2val[last_idx_to_delete + 1:]
		new_idxstore._val2i = {val: i for i, val in enumerate(new_idxstore._i2val)}
		new_idxstore._next_i = len(new_idxstore._val2i)

		return new_idxstore
github JasonKessler / scattertext / scattertext / ParsedCorpus.py View on Github external
def term_group_freq_df(self, group_col):
        # type: (str) -> pd.DataFrame
        '''
        Returns a dataframe indexed on the number of groups a term occurred in.

        Parameters
        ----------
        group_col : str
            Forwarded to ``_get_group_docids_and_index_store``; presumably the name
            of the data frame column that identifies each document's group
            (confirm against that helper).

        Returns
        -------
        pd.DataFrame
        '''
        group_idx_store = IndexStore()
        X = self._X
        group_idx_to_cat_idx, row_group_cat \
            = self._get_group_docids_and_index_store(X, group_col, group_idx_store)
        newX = self._change_document_type_in_matrix(X, row_group_cat)
        newX = self._make_all_positive_data_ones(newX)
        # Map group indices to category indices by comparing against an untouched
        # copy of the row array. Testing the array that is being rewritten would
        # let a category index written in one iteration match a later group index
        # and be remapped a second time.
        group_row = newX.tocoo().row
        category_row = group_row.copy()
        for group_idx, cat_idx in group_idx_to_cat_idx.items():
            category_row[group_row == group_idx] = cat_idx
        catX = self._change_document_type_in_matrix(newX, category_row)
        return self._term_freq_df_from_matrix(catX)
github JasonKessler / scattertext / scattertext / TermDocMatrixFactory.py View on Github external
def _build_from_category_spacy_doc_iter(self, category_doc_iter):
        '''
        Assemble a TermDocMatrix from (category, parsed document) pairs.

        Three fresh index stores (terms, categories, metadata) are created, passed to
        ``_get_features_and_labels_from_documents_and_indexes`` along with the
        iterator, and then attached to the returned matrix.

        Parameters
        ----------
        category_doc_iter : iterator of (string category name, spacy.tokens.doc.Doc) pairs

        Returns
        -------
        t : TermDocMatrix
        '''
        idx_stores = {
            'term_idx_store': IndexStore(),
            'category_idx_store': IndexStore(),
            'metadata_idx_store': IndexStore(),
        }
        # The helper takes the stores positionally: category, term, metadata.
        X, mX, y = self._get_features_and_labels_from_documents_and_indexes(
            category_doc_iter,
            idx_stores['category_idx_store'],
            idx_stores['term_idx_store'],
            idx_stores['metadata_idx_store'],
        )
        return TermDocMatrix(X, mX, y, **idx_stores)
github JasonKessler / scattertext / scattertext / TermDocMatrixFromPandas.py View on Github external
def build(self):
        '''Build the term doc matrix through the category-free parse pipeline.

        Fresh matrix factories and index stores are created and handed to a
        ``ParsePipelineFactoryWithoutCategories``. The pipeline, the same
        factories and stores, and the cleaned data frame are then passed to
        ``_apply_pipeline_and_get_build_instance``, whose result is returned.

        Returns
        -------
        TermDocMatrix
        '''
        term_factory, meta_factory = CSRMatrixFactory(), CSRMatrixFactory()
        term_store, meta_store = IndexStore(), IndexStore()

        pipeline = ParsePipelineFactoryWithoutCategories(
            self.get_nlp(), term_factory, meta_factory, term_store, meta_store, self
        )
        cleaned_df = self._clean_and_filter_nulls_and_empties_from_dataframe()
        return self._apply_pipeline_and_get_build_instance(
            term_factory, meta_factory, cleaned_df, pipeline, term_store, meta_store
        )
github JasonKessler / scattertext / scattertext / TermDocMatrixFromPandas.py View on Github external
def build(self):
        '''Build the term doc matrix through the category-free parse pipeline.

        Fresh matrix factories and index stores are created and handed to a
        ``ParsePipelineFactoryWithoutCategories``. The pipeline, the same
        factories and stores, and the cleaned data frame are then passed to
        ``_apply_pipeline_and_get_build_instance``, whose result is returned.

        Returns
        -------
        TermDocMatrix
        '''
        term_factory, meta_factory = CSRMatrixFactory(), CSRMatrixFactory()
        term_store, meta_store = IndexStore(), IndexStore()

        pipeline = ParsePipelineFactoryWithoutCategories(
            self.get_nlp(), term_factory, meta_factory, term_store, meta_store, self
        )
        cleaned_df = self._clean_and_filter_nulls_and_empties_from_dataframe()
        return self._apply_pipeline_and_get_build_instance(
            term_factory, meta_factory, cleaned_df, pipeline, term_store, meta_store
        )
github JasonKessler / scattertext / scattertext / CorpusFromParsedDocuments.py View on Github external
----------
		df : pd.DataFrame
		 contains category_col, and parse_col, were parsed col is entirely spacy docs
		category_col : str
			name of category column in convention_df
		parsed_col : str
			name of spacy parsed column in convention_df
		feats_from_spacy_doc : FeatsFromSpacyDoc
		'''
		self._df = df.reset_index()
		self._category_col = category_col
		self._parsed_col = parsed_col
		self._category_idx_store = IndexStore()
		self._X_factory = CSRMatrixFactory()
		self._mX_factory = CSRMatrixFactory()
		self._term_idx_store = IndexStore()
		self._metadata_idx_store = IndexStore()
		self._feats_from_spacy_doc = feats_from_spacy_doc
github JasonKessler / scattertext / scattertext / TermDocMatrixFactory.py View on Github external
def _build_from_category_spacy_doc_iter(self, category_doc_iter):
        '''
        Assemble a TermDocMatrix from (category, parsed document) pairs.

        Three fresh index stores (terms, categories, metadata) are created, passed to
        ``_get_features_and_labels_from_documents_and_indexes`` along with the
        iterator, and then attached to the returned matrix.

        Parameters
        ----------
        category_doc_iter : iterator of (string category name, spacy.tokens.doc.Doc) pairs

        Returns
        -------
        t : TermDocMatrix
        '''
        idx_stores = {
            'term_idx_store': IndexStore(),
            'category_idx_store': IndexStore(),
            'metadata_idx_store': IndexStore(),
        }
        # The helper takes the stores positionally: category, term, metadata.
        X, mX, y = self._get_features_and_labels_from_documents_and_indexes(
            category_doc_iter,
            idx_stores['category_idx_store'],
            idx_stores['term_idx_store'],
            idx_stores['metadata_idx_store'],
        )
        return TermDocMatrix(X, mX, y, **idx_stores)