How to use the scattertext.ParsedCorpus.ParsedCorpus class in scattertext

To help you get started, we’ve selected a few scattertext examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github JasonKessler / scattertext / scattertext / topicmodel / SentencesForTopicModeling.py View on Github external
def __init__(self, corpus):
		'''

		Parameters
		----------
		corpus
		'''
		assert isinstance(corpus, ParsedCorpus)
		self.corpus = corpus
		self.termidxstore = corpus._term_idx_store
		matfact = CSRMatrixFactory()
		self.doclabs = []
		self.sentlabs = []
		self.sentdocs = []
		senti = 0
		for doci, doc in enumerate(corpus.get_parsed_docs()):
			for sent in doc.sents:
				validsent = False
				for t in sent:
					try:
						termi = self.termidxstore.getidxstrict(t.lower_)
					except:
						continue
					if validsent is False:
github JasonKessler / scattertext / scattertext / ParsedCorpus.py View on Github external
def _make_new_term_doc_matrix(self,
                                  new_X=None,
                                  new_mX=None,
                                  new_y=None,
                                  new_term_idx_store=None,
                                  new_category_idx_store=None,
                                  new_metadata_idx_store=None,
                                  new_y_mask=None):
        """Return a new ParsedCorpus derived from this one.

        Every ``new_*`` argument left as None falls back to the matching
        attribute on ``self``.  ``new_y_mask``, when given, is used to index
        ``self._df``; otherwise the data frame is passed through unchanged.
        The parsed/category column names and the unigram frequency path are
        always carried over from ``self``.
        """
        # Settings that are never overridden by the caller.
        kwargs = dict(
            parsed_col=self._parsed_col,
            category_col=self._category_col,
            unigram_frequency_path=self._unigram_frequency_path,
        )

        # Overridable pieces: prefer the supplied value, else keep our own.
        kwargs['X'] = self._X if new_X is None else new_X
        kwargs['mX'] = self._mX if new_mX is None else new_mX
        kwargs['y'] = self._y if new_y is None else new_y
        kwargs['term_idx_store'] = (
            self._term_idx_store if new_term_idx_store is None else new_term_idx_store
        )
        kwargs['category_idx_store'] = (
            self._category_idx_store if new_category_idx_store is None else new_category_idx_store
        )
        kwargs['metadata_idx_store'] = (
            self._metadata_idx_store if new_metadata_idx_store is None else new_metadata_idx_store
        )

        # Only index the data frame when a mask was actually provided.
        if new_y_mask is None:
            kwargs['df'] = self._df
        else:
            kwargs['df'] = self._df[new_y_mask]

        return ParsedCorpus(**kwargs)
github JasonKessler / scattertext / scattertext / CorpusFromFeatureDict.py View on Github external
def build(self):
		'''Constructs the term doc matrix.

		Returns
		-------
		scattertext.ParsedCorpus.ParsedCorpus
		'''
		self._y = self._get_y_and_populate_category_idx_store()
		self._df.apply(self._add_to_x_factory, axis=1)
		self._X = self._X_factory.set_last_row_idx(len(self._y) - 1).get_csr_matrix()
		self._mX = self._mX_factory.set_last_row_idx(len(self._y) - 1).get_csr_matrix()
		if self._parsed_col is not None and self._parsed_col in self._df:
			return ParsedCorpus(self._df,
			                    self._X,
			                    self._mX,
			                    self._y,
			                    self._term_idx_store,
			                    self._category_idx_store,
			                    self._metadata_idx_store,
			                    self._parsed_col,
			                    self._category_col)
		else:
			return CorpusDF(self._df,
			                self._X,
			                self._mX,
			                self._y,
			                self._text_col,
			                self._term_idx_store,
			                self._category_idx_store,
github JasonKessler / scattertext / scattertext / CorpusFromFeatureDict.py View on Github external
def _make_new_term_doc_matrix(self,
	                              new_X,
	                              new_mX,
	                              new_y,
	                              new_term_idx_store,
	                              new_category_idx_store,
	                              new_metadata_idx_store,
	                              new_y_mask):
		if self._parsed_col is not None and self._parsed_col in self._df:
			return ParsedCorpus(self._df[new_y_mask],
			                    new_X,
			                    new_mX,
			                    new_y,
			                    new_term_idx_store,
			                    new_category_idx_store,
			                    new_metadata_idx_store,
			                    self._parsed_col,
			                    self._category_col)
		else:
			return CorpusDF(self._df[new_y_mask],
			                new_X,
			                new_mX,
			                new_y,
			                self._text_col,
			                new_term_idx_store,
			                new_category_idx_store,
github JasonKessler / scattertext / scattertext / DocsAndLabelsFromCorpus.py View on Github external
def __init__(self, corpus, alternative_text_field=None):
		'''
		Parameters
		----------
		corpus, Corpus: Corpus to extract documents and labels from
		alternative_text_field, str or None: name of the field whose contents are
			fetched via corpus.get_field and stored for display; when given,
			corpus must be a ParsedCorpus

		Raises
		------
		CorpusShouldBeParsedCorpusException
			if alternative_text_field is given and corpus is not a ParsedCorpus
		'''
		self._corpus = corpus
		self._use_non_text_features = False
		# Stays None unless an alternative text field is requested below.
		self._texts_to_display = None
		if alternative_text_field is not None:
			if isinstance(corpus, ParsedCorpus):
				self._texts_to_display = corpus.get_field(alternative_text_field)
			else:
				raise CorpusShouldBeParsedCorpusException(
					'Corpus type needs to be ParsedCorpus to use the alternative text field.')
github JasonKessler / scattertext / scattertext / representations / Word2VecFromParsedCorpus.py View on Github external
def __init__(self, corpus, word2vec_model=None):
		'''
		Parameters
		----------
		corpus: ParsedCorpus
		  from which to build word2vec model
		word2vec_model: word2vec.Word2Vec
			Gensim instance to be used to train word2vec model

		Raises
		------
		AssertionError
			if corpus is not a ParsedCorpus

		Notes
		-----
		The type check on word2vec_model is best-effort: if gensim cannot be
		imported, or the check fails, a warning is issued and the model is
		used as given.  self.model is set to the result of
		self._get_word2vec_model(word2vec_model).
		'''
		try:
			from gensim.models import word2vec
			assert word2vec_model is None or isinstance(word2vec_model, word2vec.Word2Vec)
		except Exception:
			# Deliberately lenient (a broken/missing gensim or a duck-typed model
			# only warns), but unlike a bare ``except:`` this does not swallow
			# KeyboardInterrupt or SystemExit raised during the import.
			warnings.warn("You should really install gensim, but we're going to duck-type your model and pray it works")
		assert isinstance(corpus, ParsedCorpus)
		self.corpus = corpus
		self.model = self._get_word2vec_model(word2vec_model)
github JasonKessler / scattertext / scattertext / CorpusFromParsedDocuments.py View on Github external
def build(self):
		'''Build the term and metadata matrices and wrap them in a corpus.

		Populates self._y, feeds every row of self._df through
		self._add_to_x_factory, then materialises self._X and self._mX from
		their factories.

		Returns
		-------
		scattertext.ParsedCorpus.ParsedCorpus
		'''
		self._y = self._get_y_and_populate_category_idx_store()
		self._df.apply(self._add_to_x_factory, axis=1)

		# Both factories are told the index of the final row, one per entry in y.
		final_row_idx = len(self._y) - 1
		x_factory = self._X_factory.set_last_row_idx(final_row_idx)
		self._X = x_factory.get_csr_matrix()
		mx_factory = self._mX_factory.set_last_row_idx(final_row_idx)
		self._mX = mx_factory.get_csr_matrix()

		corpus = ParsedCorpus(
			self._df,
			self._X,
			self._mX,
			self._y,
			self._term_idx_store,
			self._category_idx_store,
			self._metadata_idx_store,
			self._parsed_col,
			self._category_col,
		)
		return corpus
github JasonKessler / scattertext / scattertext / CorpusFromParsedDocuments.py View on Github external
def _make_new_term_doc_matrix(self,
	                              new_X=None,
	                              new_mX=None,
	                              new_y=None,
	                              new_term_idx_store=None,
	                              new_category_idx_store=None,
	                              new_metadata_idx_store=None,
	                              new_y_mask=None):
		'''Create a ParsedCorpus from this object's state, overriding selected parts.

		Parameters
		----------
		new_X, new_mX, new_y : optional
			Replacements for self._X, self._mX and self._y; None keeps the
			current value.
		new_term_idx_store, new_category_idx_store, new_metadata_idx_store : optional
			Replacements for the corresponding index stores; None keeps the
			current value.
		new_y_mask : optional
			Indexer applied to self._df (self._df[new_y_mask]); None keeps the
			data frame unchanged.

		Returns
		-------
		ParsedCorpus
		'''
		# Test against None explicitly.  Taking the truth value of a mask is
		# wrong: a multi-element numpy/pandas boolean mask raises ValueError,
		# and a single-element falsy mask would be silently ignored.
		df = self._df if new_y_mask is None else self._df[new_y_mask]
		return ParsedCorpus(df,
		                    self._X if new_X is None else new_X,
		                    self._mX if new_mX is None else new_mX,
		                    self._y if new_y is None else new_y,
		                    self._term_idx_store if new_term_idx_store is None else new_term_idx_store,
		                    self._category_idx_store if new_category_idx_store is None else new_category_idx_store,
		                    self._metadata_idx_store if new_metadata_idx_store is None else new_metadata_idx_store,
		                    self._parsed_col,
		                    self._category_col)