# FourSquare chart of ICLR 2018 review sentiment vs. acceptance
import time

import pandas as pd
import scattertext as st

t0 = time.time()
reviews_df = pd.read_csv('https://github.com/JasonKessler/ICLR18ReviewVis/raw/master/iclr2018_reviews.csv.bz2')
reviews_df['parse'] = reviews_df['review'].apply(st.whitespace_nlp_with_sentences)
full_corpus = (st.CorpusFromParsedDocuments(reviews_df,
                                            category_col='category',
                                            parsed_col='parse',
                                            # feats_from_spacy_doc=st.PhraseMachinePhrases()
                                            ).build())
term_ranker = st.OncePerDocFrequencyRanker
corpus = (full_corpus
          .keep_only_these_categories(['Accept, Positive', 'Accept, Negative',
                                       'Reject, Positive', 'Reject, Negative'],
                                      False)
          .get_unigram_corpus()
          .select(st.ClassPercentageCompactor(term_count=5)))
print('finding priors', time.time() - t0, 's')
priors = (st.PriorFactory(full_corpus, starting_count=0.01)
          .use_all_categories()
          .get_priors())
print('building four square', time.time() - t0, 's')
four_square = st.FourSquare(
    corpus,
    category_a_list=['Accept, Positive'],
    not_category_a_list=['Reject, Negative'],
    category_b_list=['Accept, Negative'],
    not_category_b_list=['Reject, Positive'],
    term_ranker=term_ranker,
    scorer=st.LogOddsRatioInformativeDirichletPrior(priors, 500, 'word'),
    labels={'a': 'Positive Reviews of Accepted Papers',
            # the remaining quadrant labels were truncated in the source snippet
            })
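# A follow-on sketch, not part of the original snippet: rendering the chart to
# HTML. It assumes scattertext's produce_four_square_explorer (as used in the
# library's demo_four_square.py); the axis labels and file name are illustrative.
html = st.produce_four_square_explorer(four_square=four_square,
                                       x_label='Accept-Reject',
                                       y_label='Pos-Neg',
                                       use_full_doc=True)
open('four_square_reviews.html', 'wb').write(html.encode('utf-8'))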
# Bot vs. normal comment chart. This snippet begins mid-stream: X_comments
# (comment texts) and Y_comments (0/1 labels) are defined elsewhere, and the
# `data` initialization below is an assumption added so the snippet runs.
import numpy as np

data = np.empty((len(Y_comments), 2), dtype=object)  # assumed two-column holder
data[:, 0] = Y_comments
data[:, 1] = X_comments
# Map the numeric labels to readable category names
for d in data:
    if d[0] == 0:
        d[0] = 'normal'
    else:
        d[0] = 'bot'
df = pd.DataFrame({'label': data[:, 0], 'text': data[:, 1]})
print(df)
corpus = (st.CorpusFromPandas(df, category_col='label', text_col='text',
                              nlp=st.whitespace_nlp_with_sentences)
          .build()
          .get_unigram_corpus()
          .compact(st.ClassPercentageCompactor(term_count=2,
                                               term_ranker=st.OncePerDocFrequencyRanker)))
html = st.produce_characteristic_explorer(
    corpus,
    category='normal',
    category_name='Normal',
    not_category_name='Bot'
)
open('comment_text_chart.html', 'wb').write(html.encode('utf-8'))
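# A hedged aside, not in the original: the labeling loop above can be replaced
# by a single vectorized mapping over the same assumed X_comments / Y_comments.
df = pd.DataFrame({'label': np.where(np.asarray(Y_comments) == 0, 'normal', 'bot'),
                   'text': X_comments})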
# Characteristic-term chart for the Rotten Tomatoes sample corpus
movie_df = st.SampleCorpora.RottenTomatoes.get_data()
movie_df.category = movie_df.category.apply(
    lambda x: {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}[x]
)
movie_df = movie_df[movie_df.category.isin(['Negative', 'Positive'])]
corpus = (st.CorpusFromPandas(movie_df,
                              category_col='category',
                              text_col='text',
                              nlp=st.whitespace_nlp_with_sentences)
          .build()
          .get_unigram_corpus())
# Remove relatively infrequent terms from both categories
corpus = corpus.select(st.ClassPercentageCompactor(term_count=2,
                                                   term_ranker=st.OncePerDocFrequencyRanker))
fn = 'demo_characteristic_chart.html'
open(fn, 'wb').write(st.produce_characteristic_explorer(
    corpus,
    category='Positive',
    not_category_name='Negative',
    metadata=corpus.get_df()['movie_name'],
    characteristic_scorer=st.DenseRankCharacteristicness(rerank_ranks=False),
    term_ranker=st.termranking.AbsoluteFrequencyRanker,
    term_scorer=st.ScaledFScorePresets(beta=1, one_to_neg_one=True)
).encode('utf-8'))
print('open ' + fn)
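# Optional, not in the original: open the saved chart straight from Python
# with the standard-library webbrowser module instead of copying the path.
import os
import webbrowser

webbrowser.open('file://' + os.path.abspath(fn))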
# Bag-of-words SVD projection of the 2012 political convention corpus
from sklearn.decomposition import TruncatedSVD

import scattertext as st
from scattertext import ClassPercentageCompactor
convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)
corpus = (st.CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='parse')
          .build()
          .get_stoplisted_unigram_corpus()
          .select(ClassPercentageCompactor(term_count=3)))
html = st.produce_projection_explorer(corpus,
                                      embeddings=corpus.get_term_doc_mat(),
                                      projection_model=TruncatedSVD(n_components=30, n_iter=10),
                                      x_dim=0,
                                      y_dim=1,
                                      category='democrat',
                                      category_name='Democratic',
                                      not_category_name='Republican',
                                      metadata=convention_df.speaker,
                                      width_in_pixels=1000)
file_name = 'demo_bow_pca.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('Open', file_name, 'in Chrome')
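# A hedged variation, not in the original: produce_projection_explorer accepts
# a scikit-learn style transformer with fit_transform as projection_model, so a
# dense PCA can stand in for TruncatedSVD. Densifying the sparse term-document
# matrix is only advisable for small vocabularies like this compacted corpus.
from sklearn.decomposition import PCA

html = st.produce_projection_explorer(corpus,
                                      embeddings=corpus.get_term_doc_mat().toarray(),
                                      projection_model=PCA(n_components=2),
                                      x_dim=0,
                                      y_dim=1,
                                      category='democrat',
                                      category_name='Democratic',
                                      not_category_name='Republican',
                                      metadata=convention_df.speaker)
open('demo_bow_pca_dense.html', 'wb').write(html.encode('utf-8'))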