Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
category_idx_store = IndexStore()
mX_factory = CSRMatrixFactory()
for doci, (category, text) in enumerate(category_text_iter):
y.append(category_idx_store.getidx(category))
term_freq = Counter()
for sent in text.strip(string.punctuation).lower().split('\n'):
unigrams = []
for tok in sent.strip().split():
unigrams.append(tok)
bigrams = list(map(' '.join, zip(unigrams[:-1], unigrams[1:])))
for term in unigrams + bigrams:
term_freq[term_idx_store.getidx(term)] += 1
for word_idx, freq in term_freq.items():
X_factory[doci, word_idx] = freq
metadata_idx_store = IndexStore()
return TermDocMatrix(X=X_factory.get_csr_matrix(),
mX=mX_factory.get_csr_matrix(),
y=np.array(y),
term_idx_store=term_idx_store,
metadata_idx_store=metadata_idx_store,
category_idx_store=category_idx_store)
def _make_new_term_doc_matrix(self,
                              new_X=None,
                              new_mX=None,
                              new_y=None,
                              new_term_idx_store=None,
                              new_category_idx_store=None,
                              new_metadata_idx_store=None,
                              new_y_mask=None):
    '''
    Build a TermDocMatrix, replacing selected components of this object.

    Each argument that is left as None falls back to the matching
    attribute already held on self.

    Parameters
    ----------
    new_X
        Replacement for self._X.
    new_mX
        Replacement for self._mX.
    new_y
        Replacement for self._y.
    new_term_idx_store
        Replacement for self._term_idx_store.
    new_category_idx_store
        Replacement for self._category_idx_store.
    new_metadata_idx_store
        Replacement for self._metadata_idx_store.
    new_y_mask
        Accepted but not read by this implementation.

    Returns
    ----------
    TermDocMatrix
        Built from the chosen components and self._unigram_frequency_path.
    '''
    # The attribute on self is only touched when no replacement was supplied.
    X = self._X if new_X is None else new_X
    mX = self._mX if new_mX is None else new_mX
    y = self._y if new_y is None else new_y
    term_idx_store = (self._term_idx_store
                      if new_term_idx_store is None
                      else new_term_idx_store)
    category_idx_store = (self._category_idx_store
                          if new_category_idx_store is None
                          else new_category_idx_store)
    metadata_idx_store = (self._metadata_idx_store
                          if new_metadata_idx_store is None
                          else new_metadata_idx_store)
    return TermDocMatrix(X=X,
                         mX=mX,
                         y=y,
                         term_idx_store=term_idx_store,
                         category_idx_store=category_idx_store,
                         metadata_idx_store=metadata_idx_store,
                         unigram_frequency_path=self._unigram_frequency_path)
----------
category_doc_iter : iterator of (string category name, spacy.tokens.doc.Doc) pairs
Returns
----------
t : TermDocMatrix
'''
term_idx_store = IndexStore()
category_idx_store = IndexStore()
metadata_idx_store = IndexStore()
X, mX, y = self._get_features_and_labels_from_documents_and_indexes \
(category_doc_iter,
category_idx_store,
term_idx_store,
metadata_idx_store)
return TermDocMatrix(X,
mX,
y,
term_idx_store=term_idx_store,
category_idx_store=category_idx_store,
metadata_idx_store=metadata_idx_store)
def _apply_pipeline_and_get_build_instance(self,
                                           X_factory,
                                           mX_factory,
                                           category_idx_store,
                                           df,
                                           parse_pipeline,
                                           term_idx_store,
                                           metadata_idx_store,
                                           y):
    '''
    Run the parse pipeline over the rows of df and assemble a TermDocMatrix.

    Parameters
    ----------
    X_factory
        Passed to self._build_sparse_matrices to produce X.
    mX_factory
        Passed to self._build_sparse_matrices to produce mX.
    category_idx_store
        Handed to the TermDocMatrix constructor.
    df
        Object whose apply(func, axis=1) is called with parse_pipeline.parse.
    parse_pipeline
        Object exposing a parse callable, invoked once per row of df.
    term_idx_store
        Handed to the TermDocMatrix constructor.
    metadata_idx_store
        Handed to the TermDocMatrix constructor.
    y
        Category values; converted with np.array after the pipeline has run.

    Returns
    ----------
    TermDocMatrix
    '''
    # The result of apply is discarded; only the work done inside
    # parse_pipeline.parse matters here.
    # NOTE(review): presumably parse fills the factories and y -- confirm.
    df.apply(parse_pipeline.parse, axis=1)
    # Convert after the pipeline has run, not before.
    labels = np.array(y)
    term_matrix, metadata_matrix = self._build_sparse_matrices(labels,
                                                               X_factory,
                                                               mX_factory)
    return TermDocMatrix(term_matrix,
                         metadata_matrix,
                         labels,
                         term_idx_store,
                         category_idx_store,
                         metadata_idx_store)
import numpy as np
import pandas as pd
from numpy import nonzero
from scattertext.TermDocMatrix import TermDocMatrix
class Corpus(TermDocMatrix):
    '''
    TermDocMatrix subclass that additionally keeps the raw texts it was given.
    '''

    def __init__(self,
                 X,
                 mX,
                 y,
                 term_idx_store,
                 category_idx_store,
                 metadata_idx_store,
                 raw_texts,
                 unigram_frequency_path=None):
        '''
        Parameters
        ----------
        X : csr_matrix
            Term-document matrix
        mX : csr_matrix
            Metadata-document matrix
        y : np.array
            Category index array
        term_idx_store : IndexStore
            Term indices
        category_idx_store : IndexStore
            Category indices
        metadata_idx_store : IndexStore
            Document metadata indices
        raw_texts : np.array or pd.Series
            Raw texts, stored unchanged on self._raw_texts
        unigram_frequency_path : str or None
            Path to term frequency file, forwarded to TermDocMatrix.__init__
        '''
        # Everything except raw_texts is forwarded positionally to the parent.
        TermDocMatrix.__init__(
            self,
            X,
            mX,
            y,
            term_idx_store,
            category_idx_store,
            metadata_idx_store,
            unigram_frequency_path
        )
        self._raw_texts = raw_texts