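# The next snippet assumes a corpus whose metadata terms are PyTextRank phrases
# and a `term_category_scores` data frame with one column of TextRank scores per
# category. A minimal sketch of that setup follows (the pipeline wiring assumes
# spaCy >= 3 and pytextrank >= 3; treat it as illustrative):
import pytextrank  # noqa: F401 -- registers the "textrank" pipeline component
import spacy
import scattertext as st

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('textrank', last=True)
convention_df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(nlp),
    party=lambda df: df.party.apply(
        {'democrat': 'Democratic', 'republican': 'Republican'}.get)
)
corpus = st.CorpusFromParsedDocuments(
    convention_df, category_col='party', parsed_col='parse',
    feats_from_spacy_doc=st.PyTextRankPhrases()
).build()
term_category_scores = corpus.get_metadata_freq_df('')
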
import numpy as np
import pandas as pd
from scattertext import produce_scattertext_explorer
from scattertext.Scalers import dense_rank

# Double argsort turns raw TextRank scores into per-category ranks
# (1 = highest-scoring term in that category); wrapping the result in a
# DataFrame lets it be indexed by term and category below.
term_ranks = pd.DataFrame(
    np.argsort(np.argsort(-term_category_scores.values, axis=0), axis=0) + 1,
    index=term_category_scores.index,
    columns=term_category_scores.columns
)
metadata_descriptions = {
term: '<br>' + '<br>'.join(
'<b>%s</b> TextRank score rank: %s/%s' % (cat, term_ranks.loc[term, cat], corpus.get_num_metadata())
for cat in corpus.get_categories())
for term in corpus.get_metadata()
}
# Score each term by the category where it is most prominent: positive scores
# mark Democratic-leaning terms, negative scores Republican-leaning ones.
category_specific_prominence = term_category_scores.apply(
    lambda r: r.Democratic if r.Democratic > r.Republican else -r.Republican,
    axis=1
)
html = produce_scattertext_explorer(
corpus,
category='Democratic',
not_category_name='Republican',
minimum_term_frequency=0,
pmi_threshold_coefficient=0,
width_in_pixels=1000,
transform=dense_rank,
use_non_text_features=True,
metadata=corpus.get_df()['speaker'],
scores=category_specific_prominence,
sort_by_dist=False,
    # one singleton topic per term ensures clicking a term searches for exactly that term
topic_model_term_lists={term: [term] for term in corpus.get_metadata()},
topic_model_preview_size=0, # ensure singleton topics aren't shown
metadata_descriptions=metadata_descriptions,
    use_full_doc=True
)
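# As in the other demos, the explorer HTML can be written out for viewing
# (the filename here is illustrative):
open('./demo_pytextrank_prominence.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo_pytextrank_prominence.html in Chrome or Firefox.')
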
from scattertext.CorpusFromParsedDocuments import CorpusFromParsedDocuments
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_scattertext_explorer
convention_df = SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(whitespace_nlp_with_sentences)
)
corpus = CorpusFromParsedDocuments(convention_df, category_col='party', parsed_col='parse').build()
html = produce_scattertext_explorer(
corpus,
category='democrat',
category_name='Democratic',
not_category_name='Republican',
minimum_term_frequency=5,
pmi_threshold_coefficient=8,
width_in_pixels=1000,
metadata=convention_df['speaker'],
    # point to local copies of the D3 scripts so the page renders offline
    d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
    d3_url='scattertext/data/viz/scripts/d3.min.js',
)
open('./demo.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo.html in Chrome or Firefox.')

# This fragment comes from a command-line driver: `args` holds the parsed
# command-line options, `df` the data frame loaded from the input file, and
# `nlp` a spaCy-style parser chosen earlier in the script.
from scattertext import CorpusFromPandas, produce_scattertext_explorer
from scattertext.termranking import OncePerDocFrequencyRanker

term_ranker = None
if args.one_use_per_doc:
    term_ranker = OncePerDocFrequencyRanker
category_display_name = args.category_display_name
if category_display_name is None:
    category_display_name = args.positive_category
not_category_display_name = args.not_category_display_name
if not_category_display_name is None:
    not_category_display_name = 'Not ' + category_display_name
corpus = CorpusFromPandas(df,
category_col=args.category_column,
text_col=args.text_column,
nlp=nlp).build()
html = produce_scattertext_explorer(
    corpus,
    category=args.positive_category,
    category_name=category_display_name,
    not_category_name=not_category_display_name,
    minimum_term_frequency=args.minimum_term_frequency,
    pmi_threshold_coefficient=args.pmi_threshold,
    width_in_pixels=args.width_in_pixels,
    term_ranker=term_ranker,
    metadata=None if args.metadata_column is None else df[args.metadata_column]
)
if args.outputfile == '-':
    print(html)
else:
    with open(args.outputfile, 'wb') as o:
        o.write(html.encode('utf-8'))
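
# A sketch of the argument parsing the driver above assumes. The attribute
# names mirror the `args.*` fields used in the snippet; the flags, defaults,
# and required-ness are hypothetical.
import argparse

parser = argparse.ArgumentParser(description='Build a Scattertext explorer from a CSV')
parser.add_argument('--category-column', required=True)
parser.add_argument('--text-column', required=True)
parser.add_argument('--positive-category', required=True)
parser.add_argument('--category-display-name', default=None)
parser.add_argument('--not-category-display-name', default=None)
parser.add_argument('--metadata-column', default=None)
parser.add_argument('--minimum-term-frequency', type=int, default=3)
parser.add_argument('--pmi-threshold', type=int, default=6)
parser.add_argument('--width-in-pixels', type=int, default=1000)
parser.add_argument('--one-use-per-doc', action='store_true')
parser.add_argument('--outputfile', default='-')
args = parser.parse_args()
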
from scattertext.termscoring.RankDifference import RankDifference
from scattertext.termcompaction.AssociationCompactor import AssociationCompactor
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.Scalers import dense_rank
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(
convention_df,
category_col='party',
text_col='text',
nlp=whitespace_nlp_with_sentences
).build().get_unigram_corpus().compact(AssociationCompactor(4000))
html = produce_scattertext_explorer(
corpus,
category='democrat',
category_name='Democratic',
not_category_name='Republican',
minimum_term_frequency=0,
pmi_threshold_coefficient=0,
width_in_pixels=1000,
metadata=convention_df['speaker'],
term_scorer=RankDifference(),
transform=dense_rank
)
open('./demo_dense_rank.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo_dense_rank.html in Chrome or Firefox.')
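
# A toy illustration (using SciPy directly, not Scattertext internals) of the
# dense ranking behind the transform above: tied scores share a rank, and no
# ranks are skipped.
from scipy.stats import rankdata

print(rankdata([0.1, 0.1, 3.0, 2.0], method='dense'))  # [1. 1. 3. 2.]
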
from scattertext import (CorpusFromParsedDocuments, FeatsFromOnlyEmpath,
                         SampleCorpora, produce_scattertext_explorer)


def main():
    convention_df = SampleCorpora.ConventionData2012.get_data()
    feat_builder = FeatsFromOnlyEmpath()
    corpus = CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='text',
                                       feats_from_spacy_doc=feat_builder).build()
    html = produce_scattertext_explorer(corpus,
                                        category='democrat',
                                        category_name='Democratic',
                                        not_category_name='Republican',
                                        width_in_pixels=1000,
                                        metadata=convention_df['speaker'],
                                        use_non_text_features=True,
                                        use_full_doc=True,
                                        topic_model_term_lists=feat_builder.get_top_model_term_lists())
    open('./Convention-Visualization-Empath.html', 'wb').write(html.encode('utf-8'))
    print('Open ./Convention-Visualization-Empath.html in Chrome or Firefox.')


if __name__ == '__main__':
    main()

import spacy
import scattertext as st

nlp = spacy.load('en_core_web_sm')  # spaCy v3 removed the bare 'en' shortcut link
df = st.SampleCorpora.ConventionData2012.get_data().assign(
parse=lambda df: list(nlp.pipe(df.text))
)
corpus = st.CorpusFromParsedDocuments(
df,
category_col='party',
parsed_col='parse',
    # spaCy's English models tag people as PERSON (there is no NAME label)
    feats_from_spacy_doc=st.SpacyEntities(entity_types_to_use=['PERSON', 'LOC'])
).build()
html = st.produce_scattertext_explorer(
corpus,
category='democrat',
category_name='Democratic',
not_category_name='Republican',
minimum_term_frequency=0, pmi_threshold_coefficient=0,
width_in_pixels=1000, metadata=corpus.get_df()['speaker'],
transform=st.Scalers.dense_rank,
max_overlapping=10,
max_docs_per_category=0
)
open('./demo_names2.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo_names2.html in Chrome or Firefox.')

import scattertext as st
df = st.SampleCorpora.ConventionData2012.get_data().assign(
parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences)
)
corpus = st.CorpusFromParsedDocuments(
df, category_col='party', parsed_col='parse'
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
html = st.produce_scattertext_explorer(
corpus,
category='democrat',
category_name='Democratic',
not_category_name='Republican',
minimum_term_frequency=0, pmi_threshold_coefficient=0,
width_in_pixels=1000, metadata=corpus.get_df()['speaker'],
transform=st.Scalers.dense_rank,
max_overlapping=3
)
open('./demo_compact.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo_compact.html in Chrome or Firefox.')
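
# Quick sanity check of how far AssociationCompactor pruned the vocabulary;
# get_num_terms() counts the terms remaining in the compacted corpus.
print('%d unigrams kept after compaction' % corpus.get_num_terms())
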
# `df` (tweets including a `first_name` column) and `male_prob` (per-first-name
# male probabilities, e.g. from the agefromname package) are assumed to have
# been built earlier in this example.
import nltk
import pandas as pd
import scattertext as st
from scattertext.termranking import OncePerDocFrequencyRanker

df_aug = pd.merge(df, male_prob, left_on='first_name', right_index=True)
df_aug['gender'] = df_aug['prob'].apply(lambda x: 'm' if x > 0.9 else 'f' if x < 0.1 else '?')
# keep only confidently gendered rows; .copy() prevents pandas'
# SettingWithCopyWarning when the `parse` column is added below
df_mf = df_aug[df_aug['gender'].isin(['m', 'f'])].copy()
df_mf.to_csv('emoji_data.csv', index=False)
nlp = st.tweet_tokenizier_factory(nltk.tokenize.TweetTokenizer())
df_mf['parse'] = df_mf['Tweet content'].apply(nlp)
corpus = st.CorpusFromParsedDocuments(
df_mf,
parsed_col='parse',
category_col='gender',
feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
).build()
html = st.produce_scattertext_explorer(
corpus,
category='f',
category_name='Female',
not_category_name='Male',
use_full_doc=True,
term_ranker=OncePerDocFrequencyRanker,
sort_by_dist=False,
metadata=(df_mf['User Name']
+ ' (@' + df_mf['Nickname'] + ') '
+ df_mf['Date'].astype(str)),
width_in_pixels=1000
)
print('writing EmojiGender.html')
open("EmojiGender.html", 'wb').write(html.encode('utf-8'))