How to use the scattertext.CSRMatrixTools.CSRMatrixFactory class in scattertext

To help you get started, we’ve selected a few scattertext examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github JasonKessler / scattertext / scattertext / termcompaction / CompactTerms.py View on Github external
def _get_term_indices_to_compact_from_term_freqs(self, term_freqs, term_doc_matrix, non_text):
		"""Collect indices of infrequent terms and of terms judged redundant.

		Terms whose row sum in ``term_freqs`` is below
		``self.minimum_term_count`` are treated as infrequent.  The remaining
		terms are compared by their whitespace-separated tokens, and the
		resulting candidate pairs are narrowed by three helper filters on
		``self`` to select redundant terms.

		:param term_freqs: table of term frequencies; must expose ``.values``,
			``.index`` and boolean-mask row selection (presumably a pandas
			DataFrame indexed by term -- TODO confirm).
		:param term_doc_matrix: object providing ``_get_relevant_idx_store``.
		:param non_text: forwarded to ``term_doc_matrix._get_relevant_idx_store``.

		NOTE(review): this excerpt ends once ``terms_to_remove`` has been
		built; no return statement is visible here.
		"""
		# Local store mapping each token to a column index of the matrix below.
		idx = IndexStore()
		tdf_vals = term_freqs.values
		# Keep only terms whose summed counts reach the configured minimum.
		valid_terms_mask = tdf_vals.sum(axis=1) >= self.minimum_term_count
		tdf_vals = term_freqs[valid_terms_mask].values
		terms = np.array(term_freqs.index)[valid_terms_mask]

		# Build a term-by-token matrix: cell (i, token index) is set to 1 for
		# every token of term i; ``lengths`` records each term's token count.
		lengths = []
		fact = CSRMatrixFactory()
		for i, t in enumerate(terms):
			for tok in t.split():
				fact[i, idx.getidx(tok)] = 1
			lengths.append(len(t.split()))
		lengths = np.array(lengths)
		mat = fact.get_csr_matrix()

		# Presumably ``mat * mat.T`` is a sparse matrix product counting the
		# tokens shared by each pair of terms, so a zero in ``coocs`` marks a
		# pair where one term's tokens are all contained in the other's
		# -- TODO confirm against CSRMatrixFactory's output type.
		coocs = lengths - (mat * mat.T)
		pairs = np.argwhere(coocs == 0).T
		# Successively narrow the candidate pairs with the helper filters.
		pairs = self._limit_to_non_identical_terms(pairs)
		pairs = self._limit_to_pairs_of_bigrams_and_a_constituent_unigram(pairs, terms)
		pairs = self._limit_to_redundant_unigrams(pairs, tdf_vals)
		idx_store = term_doc_matrix._get_relevant_idx_store(non_text)
		# Look up the index-store positions of the terms named in column 1 of
		# the surviving pairs.
		redundant_terms = idx_store.getidxstrictbatch(terms[np.unique(pairs[:, 1])])
		# Positions (rows of ``term_freqs``) that failed the frequency cutoff.
		# NOTE(review): these row positions are mixed with index-store indices
		# below; presumably both orderings agree -- verify against callers.
		infrequent_terms = np.argwhere(~valid_terms_mask).T[0]
		terms_to_remove = np.concatenate([redundant_terms, infrequent_terms])
github JasonKessler / scattertext / scattertext / TermDocMatrixFactory.py View on Github external
def _get_features_and_labels_from_documents_and_indexes(self,
                                                            category_doc_iter,
                                                            category_idx_store,
                                                            term_idx_store,
                                                            metadata_idx_store):
        '''Build the term matrix, metadata matrix and label array from a document stream.

        Every ``(category, parsed_text)`` pair yielded by ``category_doc_iter``
        is passed, along with its running document index, to
        ``self._register_doc_and_category`` together with both matrix
        factories, the three index stores and the label list.

        Parameters
        ----------
        category_doc_iter : iterable
            Yields ``(category, parsed_text)`` pairs.
        category_idx_store, term_idx_store, metadata_idx_store
            Forwarded unchanged to ``self._register_doc_and_category``.

        Returns
        -------
        tuple
            ``(X, mX, y)``: the CSR matrices produced by the two factories and
            the label list converted with ``np.array``.
        '''
        labels = []
        term_factory, meta_factory = CSRMatrixFactory(), CSRMatrixFactory()
        for doc_no, pair in enumerate(category_doc_iter):
            doc_category, doc_parse = pair
            self._register_doc_and_category(
                term_factory, meta_factory, doc_category, category_idx_store,
                doc_no, doc_parse, term_idx_store, metadata_idx_store, labels)
        return (term_factory.get_csr_matrix(),
                meta_factory.get_csr_matrix(),
                np.array(labels))
github JasonKessler / scattertext / scattertext / TermDocMatrix.py View on Github external
for term in terms_in_corpus[label_term_mask]]
            if new_meta_X is None:
                new_meta_X = label_X
            else:
                label_X_pad = (CSRMatrixFactory()
                               .set_last_col_idx(cols_to_pad - 1)
                               .set_last_row_idx(sum(label_doc_mask) - 1)
                               .get_csr_matrix())
                padded_label_X = scipy.sparse.hstack([label_X_pad, label_X])
                new_meta_X.resize(new_meta_X.shape[0], padded_label_X.shape[1])
                new_meta_X = scipy.sparse.vstack([new_meta_X,
                                                  padded_label_X])

        new_metadata_idx_store = IndexStoreFromList.build(new_metadata_list)
        new_meta_X = new_meta_X.tocsr()
        new_mX = (CSRMatrixFactory()
                  .set_last_col_idx(new_meta_X.shape[1] - 1)
                  .set_last_row_idx(new_meta_X.shape[0] - 1)
                  .get_csr_matrix().tolil())
        start_row = 0
        for doc_label in ordered_doc_labels:
            label_doc_mask = doc_labels == doc_label
            num_rows = sum(label_doc_mask)
            new_mX[label_doc_mask, :] = new_meta_X[start_row:start_row + num_rows, :]
            start_row += num_rows

        new_mX = new_mX.tocsr()
        new_tdm = self._make_new_term_doc_matrix(self._X,
                                                 new_mX,
                                                 self._y,
                                                 self._term_idx_store,
                                                 self._category_idx_store,
github JasonKessler / scattertext / scattertext / TermDocMatrixFromPandas.py View on Github external
def build(self):
        '''Constructs the term doc matrix.

        Returns
        -------
        TermDocMatrix
        '''

        X_factory = CSRMatrixFactory()
        mX_factory = CSRMatrixFactory()
        term_idx_store = IndexStore()
        metadata_idx_store = IndexStore()

        parse_pipeline = ParsePipelineFactoryWithoutCategories(self.get_nlp(),
                                              X_factory,
                                              mX_factory,
                                              term_idx_store,
                                              metadata_idx_store,
                                              self)
        df = self._clean_and_filter_nulls_and_empties_from_dataframe()
        tdm = self._apply_pipeline_and_get_build_instance(X_factory,
                                                          mX_factory,
                                                          df,
                                                          parse_pipeline,
                                                          term_idx_store,
github JasonKessler / scattertext / scattertext / TermDocMatrix.py View on Github external
def use_categories_as_metadata(self):
        '''
        Returns a TermDocMatrix like self, except that its metadata matrix is rebuilt from the categories:
         row i holds a single 1 in the column given by the i-th value of ``self.get_category_ids()``.

        :return: TermDocMatrix
        '''
        category_indicator = CSRMatrixFactory()
        for doc_row, cat_col in enumerate(self.get_category_ids()):
            category_indicator[doc_row, cat_col] = 1
        category_metadata = category_indicator.get_csr_matrix()
        # A copy of the category index store is passed right after the
        # original (presumably as the metadata index store -- TODO confirm).
        # The last argument is an elementwise self-comparison of the labels.
        return self._make_new_term_doc_matrix(
            self._X,
            category_metadata,
            self._y,
            self._term_idx_store,
            self._category_idx_store,
            copy(self._category_idx_store),
            self._y == self._y,
        )
github JasonKessler / scattertext / scattertext / TermDocMatrixFromPandas.py View on Github external
def init_term_doc_matrix_variables():
        '''Create the fresh, empty containers returned to the caller.

        Returns
        -------
        tuple
            ``(X_factory, mX_factory, category_idx_store, term_idx_store,
            metadata_idx_store, y)``: two new ``CSRMatrixFactory`` objects,
            three new ``IndexStore`` objects and an empty list.
        '''
        labels = []
        term_factory, meta_factory = CSRMatrixFactory(), CSRMatrixFactory()
        category_store, term_store, metadata_store = (IndexStore(),
                                                      IndexStore(),
                                                      IndexStore())
        return (term_factory, meta_factory, category_store,
                term_store, metadata_store, labels)
github JasonKessler / scattertext / scattertext / TermDocMatrixFactory.py View on Github external
def _get_features_and_labels_from_documents_and_indexes(self,
                                                            category_doc_iter,
                                                            category_idx_store,
                                                            term_idx_store,
                                                            metadata_idx_store):
        '''Build the term matrix, metadata matrix and label array from a document stream.

        Every ``(category, parsed_text)`` pair yielded by ``category_doc_iter``
        is passed, along with its running document index, to
        ``self._register_doc_and_category`` together with both matrix
        factories, the three index stores and the label list.

        Parameters
        ----------
        category_doc_iter : iterable
            Yields ``(category, parsed_text)`` pairs.
        category_idx_store, term_idx_store, metadata_idx_store
            Forwarded unchanged to ``self._register_doc_and_category``.

        Returns
        -------
        tuple
            ``(X, mX, y)``: the CSR matrices produced by the two factories and
            the label list converted with ``np.array``.
        '''
        labels = []
        term_factory, meta_factory = CSRMatrixFactory(), CSRMatrixFactory()
        for doc_no, pair in enumerate(category_doc_iter):
            doc_category, doc_parse = pair
            self._register_doc_and_category(
                term_factory, meta_factory, doc_category, category_idx_store,
                doc_no, doc_parse, term_idx_store, metadata_idx_store, labels)
        return (term_factory.get_csr_matrix(),
                meta_factory.get_csr_matrix(),
                np.array(labels))