How to use the scattertext.CorpusFromPandas function in scattertext

To help you get started, we’ve selected a few scattertext examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Source: JasonKessler / scattertext / demo_pair_plot_movies.py (view on GitHub)
import scattertext as st

# Map the raw category codes in the sample data to the display labels that
# are later embedded in each document's metadata string.
CATEGORY_LABELS = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
movie_df['category'] = movie_df['category'].apply(
    lambda raw_label: CATEGORY_LABELS[raw_label]
)

# Build a corpus whose categories are the individual movie names, then
# reduce it to its stoplisted unigram form.
corpus_builder = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
)
corpus = corpus_builder.build().get_stoplisted_unigram_corpus()

# Arguments for the pair plot, computed in the same order as they would be
# evaluated inline in the call.
projection = st.get_optimal_category_projection(corpus, verbose=True)
document_metadata = movie_df['category'] + ': ' + movie_df['movie_name']
d3_urls = st.D3URLs(
    d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
    d3_url='scattertext/data/viz/scripts/d3.min.js',
)

html = st.produce_pairplot(
    corpus,
    category_projection=projection,
    metadata=document_metadata,
    d3_url_struct=d3_urls,
)
Source: JasonKessler / scattertext / demo_log_odds_ratio_prior.py (view on GitHub)
from scattertext.termcompaction.CompactTerms import CompactTerms

import scattertext as st
from scattertext import LogOddsRatioInformativeDirichletPrior

# Name of the HTML file this demo writes its output to.
fn = 'demo_log_odds_ratio_prior.html'
df = st.SampleCorpora.RottenTomatoes.get_data()

# Build the corpus from the 'category' and 'text' columns of the sample data.
corpus_builder = st.CorpusFromPandas(
    df,
    category_col='category',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
)
corpus = corpus_builder.build()

# Priors for 'fresh' versus 'rotten'.
# NOTE: the original script has `.use_general_term_frequencies()` commented
# out at this point, so only `.use_all_categories()` is applied.
prior_factory = st.PriorFactory(
    corpus,
    category='fresh',
    not_categories=['rotten'],
    starting_count=1,
)
priors = prior_factory.use_all_categories().get_priors()
(open(fn, 'wb')
    .write(
    st.produce_frequency_explorer(
        corpus,
        category='fresh',
Source: JasonKessler / scattertext / scattertext / CLI.py (view on GitHub)
else:
		import spacy
		nlp = spacy.load(args.spacy_language_model)

	term_ranker = None
	if args.one_use_per_doc is True:
		term_ranker = OncePerDocFrequencyRanker

	category_display_name = args.category_display_name
	if category_display_name is None:
		category_display_name = args.positive_category
	not_category_display_name = args.not_category_display_name
	if not_category_display_name is None:
		not_category_display_name = 'Not ' + category_display_name

	corpus = CorpusFromPandas(df,
	                          category_col=args.category_column,
	                          text_col=args.text_column,
	                          nlp=nlp).build()
	html = produce_scattertext_explorer(corpus,
	                                    category=args.positive_category,
	                                    category_name=category_display_name,
	                                    not_category_name=not_category_display_name,
	                                    minimum_term_frequency=args.minimum_term_frequency,
	                                    pmi_filter_thresold=args.pmi_threshold,
	                                    width_in_pixels=args.width_in_pixels,
	                                    term_ranker=term_ranker,
	                                    metadata=None if args.metadata_column is None \
		                                    else df[args.metadata_column]
	                                    )
	if args.outputfile == '-':
		print(html)
Source: JasonKessler / scattertext / demo_pair_plot_convention_empath.py (view on GitHub)
import scattertext as st
import scattertext.categoryprojector.pairplot

# Load the ConventionData2012 sample data and the Empath-only feature
# builder; the builder is reused below to supply the topic-model term lists.
convention_df = st.SampleCorpora.ConventionData2012.get_data()
empath_feature_builder = st.FeatsFromOnlyEmpath()

# One category per speaker, with features taken from the Empath builder.
corpus = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
    feats_from_spacy_doc=empath_feature_builder,
).build().get_unigram_corpus()

# Render the pair plot as an HTML string; each document's metadata string
# is "<party>: <speaker>".
html = scattertext.categoryprojector.pairplot.produce_pairplot(
    corpus,
    use_metadata=True,
    category_projector=st.CategoryProjector(selector=None),
    topic_model_term_lists=empath_feature_builder.get_top_model_term_lists(),
    metadata=convention_df['party'] + ': ' + convention_df['speaker'],
)

file_name = 'convention_pair_plot_empath.html'
# Write inside a context manager so the handle is flushed and closed
# deterministically rather than whenever the garbage collector gets to it.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
Source: JasonKessler / scattertext / demo_log_odds_ratio_prior_rotten_tomatoes.com.py (view on GitHub)
import scattertext as st
from scattertext import LogOddsRatioInformativeDirichletPrior

# Name of the HTML file this demo writes its output to.
fn = 'rotten_fresh2.html'
df = st.SampleCorpora.RottenTomatoes.get_data()

# Build the corpus from the 'category' and 'text' columns of the sample data.
corpus_builder = st.CorpusFromPandas(
    df,
    category_col='category',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
)
corpus = corpus_builder.build()

# Priors for 'fresh' versus 'rotten': general term frequencies are applied
# first, then all categories, in the same order as the chained calls.
prior_factory = st.PriorFactory(
    corpus,
    category='fresh',
    not_categories=['rotten'],
    starting_count=1,
)
priors = (
    prior_factory
    .use_general_term_frequencies()
    .use_all_categories()
    .get_priors()
)
(open(fn, 'wb')
	.write(
	st.produce_fightin_words_explorer(
		corpus,
		category='fresh',
Source: JasonKessler / scattertext / demo_pair_plot_convention_geninq.py (view on GitHub)
import scattertext as st
import scattertext.categoryprojector.pairplot

# Load the ConventionData2012 sample data and the General Inquirer feature
# builder; the builder also supplies the term lists and definitions passed
# to the plot below.
convention_df = st.SampleCorpora.ConventionData2012.get_data()
general_inquirer_feature_builder = st.FeatsFromGeneralInquirer()

# One category per speaker, with features taken from the builder above.
corpus_builder = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
    feats_from_spacy_doc=general_inquirer_feature_builder,
)
corpus = corpus_builder.build().get_unigram_corpus()

# Plot arguments, computed in the same order as they would be evaluated
# inline in the call.
projector = st.CategoryProjector(selector=None)
term_lists = general_inquirer_feature_builder.get_top_model_term_lists()
definitions = general_inquirer_feature_builder.get_definitions()
document_metadata = convention_df['party'] + ': ' + convention_df['speaker']

html = scattertext.categoryprojector.pairplot.produce_pairplot(
    corpus,
    use_metadata=True,
    category_projector=projector,
    topic_model_term_lists=term_lists,
    topic_model_preview_size=100,
    metadata_descriptions=definitions,
    metadata=document_metadata,
)

file_name = 'convention_pair_plot_geninq.html'
Source: JasonKessler / scattertext / demo_pair_plot_convention.py (view on GitHub)
import scattertext as st
import scattertext.categoryprojector.pairplot

# Load the ConventionData2012 sample data.
convention_df = st.SampleCorpora.ConventionData2012.get_data()

# One category per speaker, reduced to a unigram corpus.
corpus = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
).build().get_unigram_corpus()

# Render the pair plot as an HTML string; each document's metadata string
# is "<party>: <speaker>".
html = scattertext.categoryprojector.pairplot.produce_pairplot(
    corpus,
    metadata=convention_df['party'] + ': ' + convention_df['speaker'],
)

file_name = 'convention_pair_plot.html'
# Write inside a context manager so the handle is flushed and closed
# deterministically rather than whenever the garbage collector gets to it.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)