How to use the scattertext.TermDocMatrix.TermDocMatrix class in scattertext

To help you get started, we’ve selected a few scattertext examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github JasonKessler / scattertext / scattertext / TermDocMatrixFactory.py View on Github external
category_idx_store = IndexStore()
    mX_factory = CSRMatrixFactory()
    for doci, (category, text) in enumerate(category_text_iter):
        y.append(category_idx_store.getidx(category))
        term_freq = Counter()
        for sent in text.strip(string.punctuation).lower().split('\n'):
            unigrams = []
            for tok in sent.strip().split():
                unigrams.append(tok)
            bigrams = list(map(' '.join, zip(unigrams[:-1], unigrams[1:])))
            for term in unigrams + bigrams:
                term_freq[term_idx_store.getidx(term)] += 1
        for word_idx, freq in term_freq.items():
            X_factory[doci, word_idx] = freq
    metadata_idx_store = IndexStore()
    return TermDocMatrix(X=X_factory.get_csr_matrix(),
                         mX=mX_factory.get_csr_matrix(),
                         y=np.array(y),
                         term_idx_store=term_idx_store,
                         metadata_idx_store=metadata_idx_store,
                         category_idx_store=category_idx_store)
github JasonKessler / scattertext / scattertext / TermDocMatrix.py View on Github external
def _make_new_term_doc_matrix(self,
                              new_X=None,
                              new_mX=None,
                              new_y=None,
                              new_term_idx_store=None,
                              new_category_idx_store=None,
                              new_metadata_idx_store=None,
                              new_y_mask=None):
        '''
        Build a TermDocMatrix in which every component that was supplied
        replaces the corresponding component held by this instance.

        Parameters
        ----------
        new_X : optional
            Replacement for self._X; self._X is used when None.
        new_mX : optional
            Replacement for self._mX; self._mX is used when None.
        new_y : optional
            Replacement for self._y; self._y is used when None.
        new_term_idx_store : optional
            Replacement for self._term_idx_store; used when not None.
        new_category_idx_store : optional
            Replacement for self._category_idx_store; used when not None.
        new_metadata_idx_store : optional
            Replacement for self._metadata_idx_store; used when not None.
        new_y_mask : optional
            Accepted for signature compatibility; not read by this
            implementation.

        Returns
        ----------
        TermDocMatrix
        '''
        # Only None triggers the fallback: the checks are identity tests, so
        # empty or otherwise falsy replacements are still honoured.
        X = self._X if new_X is None else new_X
        mX = self._mX if new_mX is None else new_mX
        y = self._y if new_y is None else new_y

        if new_term_idx_store is None:
            term_idx_store = self._term_idx_store
        else:
            term_idx_store = new_term_idx_store

        if new_category_idx_store is None:
            category_idx_store = self._category_idx_store
        else:
            category_idx_store = new_category_idx_store

        if new_metadata_idx_store is None:
            metadata_idx_store = self._metadata_idx_store
        else:
            metadata_idx_store = new_metadata_idx_store

        # The unigram frequency path has no override; it is always carried
        # over from this instance.
        return TermDocMatrix(
            X=X,
            mX=mX,
            y=y,
            term_idx_store=term_idx_store,
            category_idx_store=category_idx_store,
            metadata_idx_store=metadata_idx_store,
            unigram_frequency_path=self._unigram_frequency_path,
        )
github JasonKessler / scattertext / scattertext / TermDocMatrixFactory.py View on Github external
----------
        category_doc_iter : iterator of (string category name, spacy.tokens.doc.Doc) pairs

        Returns
        ----------
        t : TermDocMatrix
        '''
        term_idx_store = IndexStore()
        category_idx_store = IndexStore()
        metadata_idx_store = IndexStore()
        X, mX, y = self._get_features_and_labels_from_documents_and_indexes \
            (category_doc_iter,
             category_idx_store,
             term_idx_store,
             metadata_idx_store)
        return TermDocMatrix(X,
                             mX,
                             y,
                             term_idx_store=term_idx_store,
                             category_idx_store=category_idx_store,
                             metadata_idx_store=metadata_idx_store)
github JasonKessler / scattertext / scattertext / TermDocMatrixFromPandas.py View on Github external
def _apply_pipeline_and_get_build_instance(self,
                                           X_factory,
                                           mX_factory,
                                           category_idx_store,
                                           df,
                                           parse_pipeline,
                                           term_idx_store,
                                           metadata_idx_store,
                                           y):
        '''
        Run the parse pipeline over every row of df, then assemble a
        TermDocMatrix from the resulting matrices, labels and index stores.

        Parameters
        ----------
        X_factory
            Handed to self._build_sparse_matrices to produce X.
        mX_factory
            Handed to self._build_sparse_matrices to produce mX.
        category_idx_store
            Passed through to the TermDocMatrix constructor.
        df
            Object whose apply method is called with parse_pipeline.parse
            and axis=1.
        parse_pipeline
            Object exposing a parse callable, invoked once per row.
        term_idx_store
            Passed through to the TermDocMatrix constructor.
        metadata_idx_store
            Passed through to the TermDocMatrix constructor.
        y
            Label sequence; converted with np.array before use.

        Returns
        ----------
        TermDocMatrix
        '''
        # The result of apply is discarded, so parse is run purely for its
        # side effects.
        # NOTE(review): presumably parse fills the factories, index stores and
        # y as it goes -- confirm against the pipeline class.
        df.apply(parse_pipeline.parse, axis=1)

        # Conversion happens after the pipeline has run, matching the
        # original statement order.
        labels = np.array(y)
        X, mX = self._build_sparse_matrices(labels, X_factory, mX_factory)
        return TermDocMatrix(X,
                             mX,
                             labels,
                             term_idx_store=term_idx_store,
                             category_idx_store=category_idx_store,
                             metadata_idx_store=metadata_idx_store)
github JasonKessler / scattertext / scattertext / Corpus.py View on Github external
import numpy as np
import pandas as pd
from numpy import nonzero

from scattertext.TermDocMatrix import TermDocMatrix


class Corpus(TermDocMatrix):
	def __init__(self,
	             X,
	             mX,
	             y,
	             term_idx_store,
	             category_idx_store,
	             metadata_idx_store,
	             raw_texts,
	             unigram_frequency_path=None):
		'''
		Parameters
		----------
		X : csr_matrix
			term document matrix
		mX : csr_matrix
			metadata-document matrix
github JasonKessler / scattertext / scattertext / Corpus.py View on Github external
mX : csr_matrix
			metadata-document matrix
		y : np.array
			category index array
		term_idx_store : IndexStore
			Term indices
		category_idx_store : IndexStore
			Catgory indices
		metadata_idx_store : IndexStore
		  Document metadata indices
		raw_texts : np.array or pd.Series
			Raw texts
		unigram_frequency_path : str or None
			Path to term frequency file.
		'''
		TermDocMatrix.__init__(self, X, mX, y,
		                       term_idx_store,
		                       category_idx_store,
		                       metadata_idx_store,
		                       unigram_frequency_path)
		self._raw_texts = raw_texts