Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
import scattertext as st
# Load the Rotten Tomatoes sample data and swap the raw category names for
# display labels.
movie_df = st.SampleCorpora.RottenTomatoes.get_data()
category_labels = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}
movie_df.category = movie_df.category.apply(lambda raw: category_labels[raw])

# Keep only rows labelled 'Negative' or 'Positive'; 'Plot' rows are dropped.
is_review = movie_df.category.isin(['Negative', 'Positive'])
movie_df = movie_df[is_review]

# Build a corpus from the 'text' column, categorised by 'category', and
# reduce it to unigrams.
corpus_factory = st.CorpusFromPandas(
    movie_df,
    category_col='category',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
)
corpus = corpus_factory.build().get_unigram_corpus()

# Remove relatively infrequent terms from both categories
compactor = st.ClassPercentageCompactor(
    term_count=2,
    term_ranker=st.OncePerDocFrequencyRanker,
)
corpus = corpus.select(compactor)

# File name for the chart produced by this example.
fn = 'demo_characteristic_chart.html'
import umap
from sklearn.feature_extraction.text import TfidfTransformer
import scattertext as st
# Load the sample reviews and relabel the category column.
movie_df = st.SampleCorpora.RottenTomatoes.get_data()
display_names = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}
movie_df.category = movie_df.category.apply(lambda key: display_names[key])

# The corpus is categorised by 'movie_name' (not by the relabelled
# 'category' column) and reduced to stoplisted unigrams.
corpus = (
    st.CorpusFromPandas(
        movie_df,
        category_col='movie_name',
        text_col='text',
        nlp=st.whitespace_nlp_with_sentences,
    )
    .build()
    .get_stoplisted_unigram_corpus()
)

# Project the corpus with a UMAP projector configured for the cosine metric.
umap_projector = umap.UMAP(metric='cosine')
category_projection = st.CategoryProjector(projector=umap_projector).project(corpus)
html = st.produce_pairplot(
corpus,
import scattertext as st
movie_df = st.SampleCorpora.RottenTomatoes.get_data()
# The category column is used as loaded; no relabelling is applied in this
# example.
corpus = (
    st.CorpusFromPandas(
        movie_df,
        category_col='category',
        text_col='text',
        nlp=st.whitespace_nlp_with_sentences,
    )
    .build()
    .get_unigram_corpus()
)
semiotic_square = st.SemioticSquare(
corpus,
category_a='fresh',
category_b='rotten',
neutral_categories=['plot'],
scorer=st.RankDifference(),
import scattertext as st
# Load the sample reviews and relabel the category column.
movie_df = st.SampleCorpora.RottenTomatoes.get_data()
label_for = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}
movie_df.category = movie_df.category.apply(lambda cat: label_for[cat])

# Build the corpus with one category per 'movie_name' value, then reduce it
# to stoplisted unigrams.
full_corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
).build()
corpus = full_corpus.get_stoplisted_unigram_corpus()
html = st.produce_pairplot(
corpus,
category_projection=st.get_optimal_category_projection(corpus, verbose=True),
metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
d3_url_struct=st.D3URLs(
d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
import scattertext as st
movie_df = st.SampleCorpora.RottenTomatoes.get_data()

# Build a unigram corpus categorised by 'category' and drop the 'plot'
# category from it.
unigram_corpus = (
    st.CorpusFromPandas(
        movie_df,
        category_col='category',
        text_col='text',
        nlp=st.whitespace_nlp_with_sentences,
    )
    .build()
    .get_unigram_corpus()
)
corpus = unigram_corpus.remove_categories(['plot'])

# Score terms with CredTFIDF for 'fresh' against 'rotten'.
term_scorer = st.CredTFIDF(corpus).set_categories('fresh', ['rotten'])

# Print the first rows of the score table, sorted by 'delta_cred_tf_idf'
# in descending order.
score_df = term_scorer.get_score_df()
top_scores = score_df.sort_values(by='delta_cred_tf_idf', ascending=False).head()
print(top_scores)
html = st.produce_frequency_explorer(
corpus,
category='fresh',
not_category_name='rotten',
import scattertext as st
import pandas as pd
df = st.SampleCorpora.RottenTomatoes.get_data()

# Parse every document up front and store the result in a 'parse' column.
parsed_docs = df['text'].apply(st.whitespace_nlp_with_sentences)
df['parse'] = parsed_docs

# Build a unigram corpus from the pre-parsed documents, then compact its
# term set with AssociationCompactor(1000).
unigram_corpus = (
    st.CorpusFromParsedDocuments(df, category_col='category', parsed_col='parse')
    .build()
    .get_unigram_corpus()
)
corpus = unigram_corpus.select(st.AssociationCompactor(1000))

# Resolve embeddings (set_embeddings_model is called with no arguments) and
# project them; project_embeddings returns a corpus together with the axes.
resolver = st.EmbeddingsResolver(corpus).set_embeddings_model()
corpus, axes = resolver.project_embeddings()

# Term colours assigned from the projected corpus.
term_colors = st.CategoryColorAssigner(corpus).get_term_colors()
html = st.produce_pca_explorer(
corpus,
category='fresh',
not_categories=['rotten'],
neutral_categories=['plot'],
metadata=df['movie_name'],
width_in_pixels=1000,
show_axes=False,
from scattertext.termcompaction.CompactTerms import CompactTerms
import scattertext as st
from scattertext import LogOddsRatioInformativeDirichletPrior
# File name for the explorer produced by this example.
fn = 'demo_log_odds_ratio_prior.html'

df = st.SampleCorpora.RottenTomatoes.get_data()
corpus = st.CorpusFromPandas(
    df,
    category_col='category',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
).build()

# Priors for 'fresh' versus 'rotten'. Only use_all_categories() is applied
# here; the optional use_general_term_frequencies() step is left out.
prior_factory = st.PriorFactory(
    corpus,
    category='fresh',
    not_categories=['rotten'],
    starting_count=1,
)
priors = prior_factory.use_all_categories().get_priors()
(open(fn, 'wb')
.write(
st.produce_frequency_explorer(
corpus,
import scattertext as st
from scattertext import LogOddsRatioInformativeDirichletPrior
# File name for the explorer produced by this example.
fn = 'rotten_fresh2.html'

df = st.SampleCorpora.RottenTomatoes.get_data()
corpus_options = {
    'category_col': 'category',
    'text_col': 'text',
    'nlp': st.whitespace_nlp_with_sentences,
}
corpus = st.CorpusFromPandas(df, **corpus_options).build()

# Priors for 'fresh' versus 'rotten': general term frequencies are applied
# first, then all categories, before the priors are read out.
prior_factory = st.PriorFactory(
    corpus,
    category='fresh',
    not_categories=['rotten'],
    starting_count=1,
)
prior_factory = prior_factory.use_general_term_frequencies()
prior_factory = prior_factory.use_all_categories()
priors = prior_factory.get_priors()
(open(fn, 'wb')
.write(
st.produce_fightin_words_explorer(
corpus,