How to use scattertext.SampleCorpora.RottenTomatoes (via its get_data() method) in scattertext

To help you get started, we’ve selected a few scattertext examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github JasonKessler / scattertext / demo_characteristic_chart.py View on Github external
import scattertext as st

# Load the Rotten Tomatoes sample data and replace the raw category codes
# with display labels.
_category_labels = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}
movie_df = st.SampleCorpora.RottenTomatoes.get_data()
movie_df.category = movie_df.category.apply(lambda code: _category_labels[code])

# Keep only the two review classes; rows relabelled 'Plot' are dropped.
is_review = movie_df.category.isin(['Negative', 'Positive'])
movie_df = movie_df[is_review]

# Build the corpus from the filtered frame, then take its unigram corpus.
corpus_factory = st.CorpusFromPandas(
    movie_df,
    category_col='category',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
)
corpus = corpus_factory.build().get_unigram_corpus()

# Remove relatively infrequent terms from both categories
compactor = st.ClassPercentageCompactor(
    term_count=2,
    term_ranker=st.OncePerDocFrequencyRanker,
)
corpus = corpus.select(compactor)

# Output file name for the chart produced by this demo.
fn = 'demo_characteristic_chart.html'
github JasonKessler / scattertext / demo_pair_plot_movies_umap.py View on Github external
import umap
from sklearn.feature_extraction.text import TfidfTransformer

import scattertext as st

# Fetch the Rotten Tomatoes sample data and relabel the raw category codes.
_label_for = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}
movie_df = st.SampleCorpora.RottenTomatoes.get_data()
movie_df.category = movie_df.category.apply(lambda raw: _label_for[raw])

# The corpus is keyed on 'movie_name' rather than 'category', so its
# categories are the individual movies.
corpus_factory = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
)
corpus = corpus_factory.build().get_stoplisted_unigram_corpus()

# Project the corpus with a UMAP projector configured with metric='cosine'.
umap_projector = umap.UMAP(metric='cosine')
category_projection = st.CategoryProjector(projector=umap_projector).project(corpus)

html = st.produce_pairplot(
    corpus,
github JasonKessler / scattertext / demo_semiotic.py View on Github external
import scattertext as st

# Load the Rotten Tomatoes sample data. No relabelling is applied in this
# demo: the categories keep their raw 'rotten' / 'fresh' / 'plot' names.
movie_df = st.SampleCorpora.RottenTomatoes.get_data()

corpus_factory = st.CorpusFromPandas(
    movie_df,
    category_col='category',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
)
# Build the corpus and keep its unigram corpus.
corpus = corpus_factory.build().get_unigram_corpus()

semiotic_square = st.SemioticSquare(
	corpus,
	category_a='fresh',
	category_b='rotten',
	neutral_categories=['plot'],
	scorer=st.RankDifference(),
github JasonKessler / scattertext / demo_pair_plot_movies.py View on Github external
import scattertext as st

# Load the Rotten Tomatoes sample data and swap the raw category codes for
# display labels.
_display_labels = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}
movie_df = st.SampleCorpora.RottenTomatoes.get_data()
movie_df.category = movie_df.category.apply(lambda raw: _display_labels[raw])

# Categories of this corpus are movie names, not the relabelled 'category'.
built_corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
).build()
corpus = built_corpus.get_stoplisted_unigram_corpus()

html = st.produce_pairplot(
    corpus,
    category_projection=st.get_optimal_category_projection(corpus, verbose=True),
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    d3_url_struct=st.D3URLs(
        d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
github JasonKessler / scattertext / demo_cred_tfidf.py View on Github external
import scattertext as st

movie_df = st.SampleCorpora.RottenTomatoes.get_data()

# Build a unigram corpus, then remove the 'plot' category from it.
unigram_corpus = st.CorpusFromPandas(
    movie_df,
    category_col='category',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
).build().get_unigram_corpus()
corpus = unigram_corpus.remove_categories(['plot'])

# Configure the CredTFIDF scorer with 'fresh' and ['rotten'] as its categories.
term_scorer = st.CredTFIDF(corpus).set_categories('fresh', ['rotten'])

# Print the first rows of the score table, highest 'delta_cred_tf_idf' first.
score_df = term_scorer.get_score_df()
ranked_scores = score_df.sort_values(by='delta_cred_tf_idf', ascending=False)
print(ranked_scores.head())

html = st.produce_frequency_explorer(
    corpus,
    category='fresh',
    not_category_name='rotten',
github JasonKessler / scattertext / demo_unified_context.py View on Github external
import scattertext as st
import pandas as pd

df = st.SampleCorpora.RottenTomatoes.get_data()

# Parse every document up front; the corpus below is built from the
# 'parse' column rather than from raw text.
df['parse'] = df['text'].apply(st.whitespace_nlp_with_sentences)

parsed_corpus = st.CorpusFromParsedDocuments(
    df,
    category_col='category',
    parsed_col='parse',
).build()
unigram_corpus = parsed_corpus.get_unigram_corpus()
corpus = unigram_corpus.select(st.AssociationCompactor(1000))

# project_embeddings() returns a (corpus, axes) pair; the returned corpus
# replaces the one built above.
resolver = st.EmbeddingsResolver(corpus).set_embeddings_model()
corpus, axes = resolver.project_embeddings()

color_assigner = st.CategoryColorAssigner(corpus)
term_colors = color_assigner.get_term_colors()
html = st.produce_pca_explorer(
    corpus,
    category='fresh',
    not_categories=['rotten'],
    neutral_categories=['plot'],
    metadata=df['movie_name'],
    width_in_pixels=1000,
    show_axes=False,
github JasonKessler / scattertext / demo_log_odds_ratio_prior.py View on Github external
from scattertext.termcompaction.CompactTerms import CompactTerms

import scattertext as st
from scattertext import LogOddsRatioInformativeDirichletPrior

# Output file name for the explorer produced by this demo.
fn = 'demo_log_odds_ratio_prior.html'

df = st.SampleCorpora.RottenTomatoes.get_data()
corpus_factory = st.CorpusFromPandas(
    df,
    category_col='category',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
)
corpus = corpus_factory.build()

# Build the priors from all categories. The .use_general_term_frequencies()
# step is left disabled in this demo.
prior_factory = st.PriorFactory(
    corpus,
    category='fresh',
    not_categories=['rotten'],
    starting_count=1,
)
priors = prior_factory.use_all_categories().get_priors()
(open(fn, 'wb')
    .write(
    st.produce_frequency_explorer(
        corpus,
github JasonKessler / scattertext / demo_log_odds_ratio_prior_rotten_tomatoes.com.py View on Github external
import scattertext as st
from scattertext import LogOddsRatioInformativeDirichletPrior

# Output file name for the explorer produced by this demo.
fn = 'rotten_fresh2.html'

df = st.SampleCorpora.RottenTomatoes.get_data()
corpus_factory = st.CorpusFromPandas(
    df,
    category_col='category',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
)
corpus = corpus_factory.build()

# Configure the prior factory step by step: general term frequencies first,
# then all categories, and finally extract the priors.
prior_factory = st.PriorFactory(
    corpus,
    category='fresh',
    not_categories=['rotten'],
    starting_count=1,
)
prior_factory = prior_factory.use_general_term_frequencies()
prior_factory = prior_factory.use_all_categories()
priors = prior_factory.get_priors()
(open(fn, 'wb')
	.write(
	st.produce_fightin_words_explorer(
		corpus,