import scattertext as st

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
# Relabel the Rotten Tomatoes review categories with friendlier display names.
movie_df.category = movie_df.category \
    .apply(lambda x: {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}[x])
# Build a corpus with one category per movie, reduced to unigrams with stopwords removed.
corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build().get_stoplisted_unigram_corpus()
# Produce a pairplot of the movies, projecting the categories onto an optimal 2-D layout
# and pointing the visualization at local copies of the D3 scripts.
html = st.produce_pairplot(
    corpus,
    category_projection=st.get_optimal_category_projection(corpus, verbose=True),
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    d3_url_struct=st.D3URLs(
        d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
        d3_url='scattertext/data/viz/scripts/d3.min.js'
    )
)
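# The example above builds the pairplot HTML but never writes it out. A minimal,
# hedged way to save and locate it, mirroring the write/print pattern used by the
# later snippets (the file name here is purely illustrative):
pairplot_file_name = 'movie_pair_plot.html'
open(pairplot_file_name, 'wb').write(html.encode('utf-8'))
print('./' + pairplot_file_name)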
from scattertext.termcompaction.CompactTerms import CompactTerms

import scattertext as st
from scattertext import LogOddsRatioInformativeDirichletPrior

fn = 'demo_log_odds_ratio_prior.html'
df = st.SampleCorpora.RottenTomatoes.get_data()
corpus = (st.CorpusFromPandas(df,
                              category_col='category',
                              text_col='text',
                              nlp=st.whitespace_nlp_with_sentences)
          .build())
# Build per-term prior counts over all categories, starting each term at 1.
priors = (st.PriorFactory(corpus,
                          category='fresh',
                          not_categories=['rotten'],
                          starting_count=1)
          # .use_general_term_frequencies()
          .use_all_categories()
          .get_priors())
# The original snippet was cut off after `category='fresh',`; the rest is a hedged
# completion that scores terms with the informative Dirichlet prior built above
# and closes the write call in the style of the other examples.
(open(fn, 'wb')
 .write(
    st.produce_frequency_explorer(
        corpus,
        category='fresh',
        not_categories=['rotten'],
        term_scorer=LogOddsRatioInformativeDirichletPrior(priors),
        metadata=df['movie_name']
    ).encode('utf-8')))
print('./' + fn)
# The following fragment comes from a command-line driver built around scattertext.
# It originally began mid-conditional; the `if` branch reconstructed here (falling
# back to scattertext's whitespace tokenizer when no spaCy model is requested), the
# imports, and the final file-writing branch are hedged additions so the fragment
# reads on its own. `args` is the parsed command-line namespace and `df` the
# DataFrame loaded from the input file.
from scattertext import CorpusFromPandas, produce_scattertext_explorer, whitespace_nlp
from scattertext.termranking import OncePerDocFrequencyRanker

if args.spacy_language_model is None:
    nlp = whitespace_nlp
else:
    import spacy
    nlp = spacy.load(args.spacy_language_model)

term_ranker = None
if args.one_use_per_doc is True:
    # Count each term at most once per document when ranking.
    term_ranker = OncePerDocFrequencyRanker

category_display_name = args.category_display_name
if category_display_name is None:
    category_display_name = args.positive_category
not_category_display_name = args.not_category_display_name
if not_category_display_name is None:
    not_category_display_name = 'Not ' + category_display_name

corpus = CorpusFromPandas(df,
                          category_col=args.category_column,
                          text_col=args.text_column,
                          nlp=nlp).build()
html = produce_scattertext_explorer(corpus,
                                    category=args.positive_category,
                                    category_name=category_display_name,
                                    not_category_name=not_category_display_name,
                                    minimum_term_frequency=args.minimum_term_frequency,
                                    pmi_filter_thresold=args.pmi_threshold,
                                    width_in_pixels=args.width_in_pixels,
                                    term_ranker=term_ranker,
                                    metadata=None if args.metadata_column is None
                                    else df[args.metadata_column])
if args.outputfile == '-':
    print(html)
else:
    # Hedged completion: write the visualization to the requested output file.
    open(args.outputfile, 'wb').write(html.encode('utf-8'))
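# For experimenting with the CLI fragment above outside its command-line driver, one
# could stub the `args` namespace and `df` it expects. This is a hedged sketch: the
# attribute names simply mirror the `args.*` accesses in the fragment, and the sample
# values are illustrative, not the driver's real defaults.
import argparse

import scattertext as st

args = argparse.Namespace(
    spacy_language_model=None,   # fall back to the whitespace tokenizer
    one_use_per_doc=False,
    category_display_name=None,
    not_category_display_name=None,
    category_column='category',
    text_column='text',
    positive_category='fresh',
    minimum_term_frequency=3,
    pmi_threshold=6,
    width_in_pixels=1000,
    metadata_column=None,
    outputfile='-',
)
df = st.SampleCorpora.RottenTomatoes.get_data()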
import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()
# Represent each document only by its Empath topic-category features.
empath_feature_builder = st.FeatsFromOnlyEmpath()
corpus = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
    feats_from_spacy_doc=empath_feature_builder
).build().get_unigram_corpus()
html = st.produce_pairplot(
    corpus,
    use_metadata=True,
    category_projector=st.CategoryProjector(selector=None),
    topic_model_term_lists=empath_feature_builder.get_top_model_term_lists(),
    metadata=convention_df['party'] + ': ' + convention_df['speaker']
)
file_name = 'convention_pair_plot_empath.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('./' + file_name)
import scattertext as st
from scattertext import LogOddsRatioInformativeDirichletPrior

fn = 'rotten_fresh2.html'
df = st.SampleCorpora.RottenTomatoes.get_data()
corpus = (st.CorpusFromPandas(df,
                              category_col='category',
                              text_col='text',
                              nlp=st.whitespace_nlp_with_sentences)
          .build())
# Unlike the earlier prior example, this one also blends in general English term frequencies.
priors = (st.PriorFactory(corpus,
                          category='fresh',
                          not_categories=['rotten'],
                          starting_count=1)
          .use_general_term_frequencies()
          .use_all_categories()
          .get_priors())
# The original snippet was cut off after `category='fresh',`; the rest is a hedged
# completion that scores terms with the prior-informed term scorer and closes the
# write call in the style of the other examples.
(open(fn, 'wb')
 .write(
    st.produce_fightin_words_explorer(
        corpus,
        category='fresh',
        not_categories=['rotten'],
        term_scorer=LogOddsRatioInformativeDirichletPrior(priors),
        metadata=df['movie_name']
    ).encode('utf-8')))
print('./' + fn)
import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()
# Represent each document by its General Inquirer lexicon categories.
general_inquirer_feature_builder = st.FeatsFromGeneralInquirer()
corpus = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
    feats_from_spacy_doc=general_inquirer_feature_builder
).build().get_unigram_corpus()
html = st.produce_pairplot(
    corpus,
    use_metadata=True,
    category_projector=st.CategoryProjector(selector=None),
    topic_model_term_lists=general_inquirer_feature_builder.get_top_model_term_lists(),
    topic_model_preview_size=100,
    metadata_descriptions=general_inquirer_feature_builder.get_definitions(),
    metadata=convention_df['party'] + ': ' + convention_df['speaker']
)
file_name = 'convention_pair_plot_geninq.html'
# The original snippet stopped after naming the file; writing and reporting it
# follows the same pattern as the other examples.
open(file_name, 'wb').write(html.encode('utf-8'))
print('./' + file_name)
import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()
corpus = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build().get_unigram_corpus()
html = st.produce_pairplot(
    corpus,
    metadata=convention_df['party'] + ': ' + convention_df['speaker']
)
file_name = 'convention_pair_plot.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('./' + file_name)
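# Each example above ends by printing the path of the HTML file it produced. A small,
# optional convenience for opening the most recent one directly in the default browser
# (standard library only; `file_name` reuses the variable from the snippet above):
import os
import webbrowser

webbrowser.open('file://' + os.path.abspath(file_name))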