How to use the scattertext.ClassPercentageCompactor function in scattertext

To help you get started, we’ve selected a few scattertext examples based on popular ways the library is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Example from JasonKessler/scattertext — demo_four_square.py (view on GitHub):
# Start a wall-clock timer so the progress messages below can report elapsed time.
t0 = time.time()
# Download the ICLR 2018 review dataset (bz2-compressed CSV) straight from GitHub.
reviews_df = pd.read_csv('https://github.com/JasonKessler/ICLR18ReviewVis/raw/master/iclr2018_reviews.csv.bz2')
# Tokenize each review with scattertext's lightweight whitespace tokenizer
# (sentence-aware; no spaCy model required).
reviews_df['parse'] = reviews_df['review'].apply(st.whitespace_nlp_with_sentences)
# Build a corpus from the pre-parsed documents, categorized by the 'category' column.
full_corpus = (st.CorpusFromParsedDocuments(reviews_df,
                                            category_col='category',
                                            parsed_col='parse',
                                            #feats_from_spacy_doc=st.PhraseMachinePhrases()
                                            ).build())

# Term ranker that counts each term at most once per document.
term_ranker = st.OncePerDocFrequencyRanker

# Keep only the four accept/reject x positive/negative categories, reduce the
# corpus to unigrams, and compact it with ClassPercentageCompactor
# (term_count=5 — presumably a per-class frequency floor; see scattertext docs).
corpus = (full_corpus
          .keep_only_these_categories(['Accept, Positive', 'Accept, Negative',
                                       'Reject, Positive', 'Reject, Negative'],
                                      False)
          .get_unigram_corpus()
          .select(st.ClassPercentageCompactor(term_count=5)))

print('finding priors', time.time() - t0, 's')
# Derive term priors from every category of the full (uncompacted) corpus;
# starting_count=0.01 smooths terms missing from a category.
priors = (st.PriorFactory(full_corpus, starting_count=0.01)
          .use_all_categories()
          .get_priors())
print('building four square', time.time() - t0, 's')

# Four-square plot: one axis contrasts category_a vs. not_category_a, the other
# category_b vs. not_category_b, scored by a log-odds-ratio with an informative
# Dirichlet prior (scale 500, 'word' features).
# NOTE(review): this excerpt is truncated — the FourSquare(...) call and its
# 'labels' dict continue beyond the last line shown here.
four_square = st.FourSquare(
	corpus,
	category_a_list=['Accept, Positive'],
	not_category_a_list=['Reject, Negative'],
	category_b_list=['Accept, Negative'],
	not_category_b_list=['Reject, Positive'],
	term_ranker=term_ranker,
	scorer=st.LogOddsRatioInformativeDirichletPrior(priors, 500, 'word'),
	labels={'a': 'Positive Reviews of Accepted Papers',
Example from norMNfan/Reddit-Bot-Classifier — classifier.py (view on GitHub):
data[:, 0] = Y_comments
	data[:, 1] = X_comments

	# Replace the numeric class label in column 0 with a readable string:
	# 0 -> 'normal', anything else -> 'bot'.
	for d in data:
		if d[0] == 0:
			d[0] = 'normal'
		else:
			d[0] = 'bot'

	# Two-column DataFrame: 'label' (normal/bot) and the raw comment text.
	df = pd.DataFrame({'label': data[:, 0], 'text':data[:, 1]})
	print(df)

	# Build a scattertext corpus keyed on the label column, reduce it to
	# unigrams, and compact away low-frequency terms (term_count=2) using
	# once-per-document frequencies.
	corpus = (st.CorpusFromPandas(df, category_col='label', text_col='text', nlp=st.whitespace_nlp_with_sentences)
		.build()
		.get_unigram_corpus()
		.compact(st.ClassPercentageCompactor(term_count=2, term_ranker=st.OncePerDocFrequencyRanker)))

	# Render an interactive characteristic-terms chart contrasting 'normal'
	# against 'Bot', then save it as UTF-8-encoded HTML.
	html = st.produce_characteristic_explorer(
		corpus,
		category='normal',
		category_name='Normal',
		not_category_name='Bot'
	)
	# NOTE(review): the file object returned by open() is never closed —
	# prefer `with open(...) as f: f.write(...)`.
	open('comment_text_chart.html', 'wb').write(html.encode('utf-8'))
Example from JasonKessler/scattertext — demo_characteristic_chart.py (view on GitHub):
# Load the Rotten Tomatoes sample corpus bundled with scattertext and map its
# raw category labels to human-readable names.
movie_df = st.SampleCorpora.RottenTomatoes.get_data()
movie_df.category = movie_df.category.apply(
	lambda x: {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}[x]
)
# Keep only review documents; 'Plot' summaries are dropped.
movie_df = movie_df[movie_df.category.isin(['Negative', 'Positive'])]

# Build a unigram corpus using scattertext's lightweight whitespace tokenizer
# (sentence-aware; no spaCy model required).
corpus = (st.CorpusFromPandas(movie_df,
                              category_col='category',
                              text_col='text',
                              nlp=st.whitespace_nlp_with_sentences)
          .build()
          .get_unigram_corpus())

# Remove relatively infrequent terms from both categories
corpus = corpus.select(st.ClassPercentageCompactor(term_count=2,
                                                   term_ranker=st.OncePerDocFrequencyRanker))
fn = 'demo_characteristic_chart.html'

# Build the interactive characteristic-terms chart contrasting Positive vs.
# Negative reviews, with movie names as per-document metadata.
html = st.produce_characteristic_explorer(
	corpus,
	category='Positive',
	not_category_name='Negative',
	metadata=corpus.get_df()['movie_name'],
	characteristic_scorer=st.DenseRankCharacteristicness(rerank_ranks=False),
	term_ranker=st.termranking.AbsoluteFrequencyRanker,
	term_scorer=st.ScaledFScorePresets(beta=1, one_to_neg_one=True)
)
# Fix: the original `open(fn, 'wb').write(...)` never closed the file handle;
# a with-block guarantees flush and close even if write() raises.
with open(fn, 'wb') as out_file:
	out_file.write(html.encode('utf-8'))
print('open ' + fn)
Example from JasonKessler/scattertext — demo_bow_pca.py (view on GitHub):
from sklearn.decomposition import TruncatedSVD

import scattertext as st
from scattertext import ClassPercentageCompactor, CSRMatrixFactory
from scattertext.representations.CorpusSentenceIterator import CorpusSentenceIterator

# Load the bundled 2012 political convention speeches and tokenize them with
# scattertext's whitespace tokenizer (sentence-aware; no spaCy model needed).
convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)

# Build a stoplisted unigram corpus grouped by party, then compact it with
# ClassPercentageCompactor (term_count=3) to drop infrequent terms.
corpus = (st.CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='parse')
          .build()
          .get_stoplisted_unigram_corpus().select(ClassPercentageCompactor(term_count=3)))


# Project the bag-of-words term-document matrix to two dimensions with
# truncated SVD and render an interactive scatter plot of components 0 and 1.
html = st.produce_projection_explorer(corpus,
                                      embeddings=corpus.get_term_doc_mat(),
                                      projection_model=TruncatedSVD(n_components=30, n_iter=10),
                                      x_dim=0,
                                      y_dim=1,
                                      category='democrat',
                                      category_name='Democratic',
                                      not_category_name='Republican',
                                      metadata=convention_df.speaker,
                                      width_in_pixels=1000)
file_name = 'demo_bow_pca.html'
# Fix: the original `open(file_name, 'wb').write(...)` leaked the file handle;
# a with-block closes it deterministically.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open', file_name, 'in chrome')