How to use the scattertext.produce_scattertext_explorer function in scattertext

To help you get started, we’ve selected a few scattertext examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes — no build needed — and fix issues immediately.

github JasonKessler / scattertext / demo_pytextrank.py View on Github external
# Rank of each term within each category: argsort-of-argsort converts scores
# into 1-based ranks per column (negation makes the highest score rank 1).
term_ranks = np.argsort(np.argsort(-term_category_scores, axis=0), axis=0) + 1

# HTML tooltip shown for each metadata term in the visualization:
# one "<b>category</b> TextRank score rank: r/N" line per category.
metadata_descriptions = {
    term: '<br>' + '<br>'.join(
        '<b>%s</b> TextRank score rank: %s/%s' % (cat, term_ranks.loc[term, cat], corpus.get_num_metadata())
        for cat in corpus.get_categories())
    for term in corpus.get_metadata()
}

# Signed prominence per term: positive = Democratic score, negative = Republican.
# NOTE(review): the scraped page HTML-escaped '>' as '&gt;'; operator restored here.
category_specific_prominence = term_category_scores.apply(
    lambda r: r.Democratic if r.Democratic > r.Republican else -r.Republican,
    axis=1
)

# Build the interactive scatterplot HTML from metadata (TextRank) features.
# NOTE(review): the scraped page dropped this call's closing parenthesis
# (the next page header was fused onto it); restored here.
html = produce_scattertext_explorer(
    corpus,
    category='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    transform=dense_rank,
    use_non_text_features=True,
    metadata=corpus.get_df()['speaker'],
    scores=category_specific_prominence,
    sort_by_dist=False,
    # ensure that we search for term in visualization
    topic_model_term_lists={term: [term] for term in corpus.get_metadata()},
    topic_model_preview_size=0,  # ensure singleton topics aren't shown
    metadata_descriptions=metadata_descriptions,
    use_full_doc=True
)
github JasonKessler / scattertext / demo.py View on Github external
from scattertext.CorpusFromParsedDocuments import CorpusFromParsedDocuments
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_scattertext_explorer

# Parse the 2012 convention speeches with the fast whitespace tokenizer
# and build a category-labeled corpus (party = democrat/republican).
convention_df = SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(whitespace_nlp_with_sentences)
)
corpus = CorpusFromParsedDocuments(convention_df, category_col='party', parsed_col='parse').build()

html = produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=5,
    pmi_threshold_coefficient=8,
    width_in_pixels=1000,
    metadata=convention_df['speaker'],
    d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
    d3_url='scattertext/data/viz/scripts/d3.min.js',
)

# Use a context manager so the handle is closed deterministically
# (the original open(...).write(...) leaked the file handle).
with open('./demo.html', 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open ./demo.html in Chrome or Firefox.')
github JasonKessler / scattertext / scattertext / CLI.py View on Github external
# NOTE(review): fragment of a CLI entry point — the enclosing `def` is not
# visible here, and this first line lost its leading tab in extraction.
term_ranker = None
	# Count each term at most once per document when --one_use_per_doc is set.
	if args.one_use_per_doc is True:
		term_ranker = OncePerDocFrequencyRanker

	# Display names fall back to the raw category values when not supplied.
	category_display_name = args.category_display_name
	if category_display_name is None:
		category_display_name = args.positive_category
	not_category_display_name = args.not_category_display_name
	if not_category_display_name is None:
		not_category_display_name = 'Not ' + category_display_name

	corpus = CorpusFromPandas(df,
	                          category_col=args.category_column,
	                          text_col=args.text_column,
	                          nlp=nlp).build()
	# NOTE(review): `pmi_filter_thresold` (sic) is the keyword this call uses;
	# other snippets in this file pass `pmi_threshold_coefficient` — confirm
	# against the installed scattertext version before renaming.
	html = produce_scattertext_explorer(corpus,
	                                    category=args.positive_category,
	                                    category_name=category_display_name,
	                                    not_category_name=not_category_display_name,
	                                    minimum_term_frequency=args.minimum_term_frequency,
	                                    pmi_filter_thresold=args.pmi_threshold,
	                                    width_in_pixels=args.width_in_pixels,
	                                    term_ranker=term_ranker,
	                                    metadata=None if args.metadata_column is None \
		                                    else df[args.metadata_column]
	                                    )
	# '-' means write the HTML to stdout; otherwise write UTF-8 bytes to the file.
	if args.outputfile == '-':
		print(html)
	else:
		with open(args.outputfile, 'wb') as o:
			o.write(html.encode('utf-8'))
github JasonKessler / scattertext / demo_dense_rank.py View on Github external
from scattertext.termscoring.RankDifference import RankDifference

from scattertext.termcompaction.AssociationCompactor import AssociationCompactor

from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

# Build a unigram corpus from the 2012 convention speeches, keeping only
# the 4,000 most category-associated terms.
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=whitespace_nlp_with_sentences
).build().get_unigram_corpus().compact(AssociationCompactor(4000))

# NOTE(review): `dense_rank` is not imported in the visible snippet —
# presumably `from scattertext import dense_rank`; verify against the source.
html = produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    metadata=convention_df['speaker'],
    term_scorer=RankDifference(),
    transform=dense_rank
)

# Close the output handle deterministically (original leaked it).
with open('./demo_dense_rank.html', 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open ./demo_dense_rank.html in Chrome or Firefox.')
github JasonKessler / scattertext / demo_empath.py View on Github external
def main():
	"""Render an Empath-category scatterplot of the 2012 convention speeches.

	Writes ./Convention-Visualization-Empath.html and prints where to open it.
	"""
	convention_df = SampleCorpora.ConventionData2012.get_data()
	feat_builder = FeatsFromOnlyEmpath()
	# Empath features are extracted straight from the raw 'text' column.
	corpus = CorpusFromParsedDocuments(convention_df,
	                                   category_col='party',
	                                   parsed_col='text',
	                                   feats_from_spacy_doc=feat_builder).build()
	html = produce_scattertext_explorer(corpus,
	                                    category='democrat',
	                                    category_name='Democratic',
	                                    not_category_name='Republican',
	                                    width_in_pixels=1000,
	                                    metadata=convention_df['speaker'],
	                                    use_non_text_features=True,
	                                    use_full_doc=True,
	                                    topic_model_term_lists=feat_builder.get_top_model_term_lists())
	# Context manager closes the handle deterministically (original leaked it).
	with open('./Convention-Visualization-Empath.html', 'wb') as out:
		out.write(html.encode('utf-8'))
	print('Open ./Convention-Visualization-Empath.html in Chrome or Firefox.')
github JasonKessler / scattertext / demo_names.py View on Github external
import spacy

# NOTE(review): spacy.load('en') is a legacy shortcut; newer spaCy versions
# require a full model name such as 'en_core_web_sm' — confirm the installed
# spaCy version before changing.
nlp = spacy.load('en')

df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: list(nlp.pipe(df.text))
)

# Use only named-entity features (names and locations) as terms.
corpus = st.CorpusFromParsedDocuments(
    df,
    category_col='party',
    parsed_col='parse',
    feats_from_spacy_doc=st.SpacyEntities(entity_types_to_use=['NAME', 'LOC'])
).build()

html = st.produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=0, pmi_threshold_coefficient=0,
    width_in_pixels=1000, metadata=corpus.get_df()['speaker'],
    transform=st.Scalers.dense_rank,
    max_overlapping=10,
    max_docs_per_category=0
)
# Context manager closes the handle deterministically (original leaked it).
with open('./demo_names2.html', 'w') as out:
    out.write(html)
print('open ./demo_names2.html in Chrome')
github JasonKessler / scattertext / demo_compact.py View on Github external
import scattertext as st

# Whitespace-parse the 2012 convention speeches and compact the unigram
# corpus to the 2,000 most category-associated terms.
df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences)
)

corpus = st.CorpusFromParsedDocuments(
    df, category_col='party', parsed_col='parse'
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))

html = st.produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=0, pmi_threshold_coefficient=0,
    width_in_pixels=1000, metadata=corpus.get_df()['speaker'],
    transform=st.Scalers.dense_rank,
    max_overlapping=3
)
# Context manager closes the handle deterministically (original leaked it).
with open('./demo_compact.html', 'w') as out:
    out.write(html)
print('open ./demo_compact.html in Chrome')
github JasonKessler / scattertext / demo_emoji.py View on Github external
# NOTE(review): fragment — the enclosing scope is not visible and the scrape
# both HTML-escaped the comparison operators ('&gt;'/'&lt;', restored below)
# and mangled the leading indentation (normalized here).
# Join gender probabilities onto tweets by author first name, then keep only
# rows confidently classified male (>0.9) or female (<0.1).
df_aug = pd.merge(df, male_prob, left_on='first_name', right_index=True)
df_aug['gender'] = df_aug['prob'].apply(lambda x: 'm' if x > 0.9 else 'f' if x < 0.1 else '?')
df_mf = df_aug[df_aug['gender'].isin(['m', 'f'])]
df_mf.to_csv('emoji_data.csv', index=False)

# Tokenize tweets with NLTK's TweetTokenizer wrapped as a spaCy-like callable.
# ('tweet_tokenizier_factory' is the name as spelled in the scattertext API.)
nlp = st.tweet_tokenizier_factory(nltk.tokenize.TweetTokenizer())
# NOTE(review): df_mf is a filtered slice created earlier; assigning a column
# here may raise pandas' SettingWithCopyWarning — consider .copy() upstream.
df_mf['parse'] = df_mf['Tweet content'].apply(nlp)

# Build a corpus whose only features are the emoji found in each tweet.
corpus = st.CorpusFromParsedDocuments(
	df_mf,
	parsed_col='parse',
	category_col='gender',
	feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
).build()

html = st.produce_scattertext_explorer(
	corpus,
	category='f',
	category_name='Female',
	not_category_name='Male',
	use_full_doc=True,
	term_ranker=OncePerDocFrequencyRanker,
	sort_by_dist=False,
	# Label each point's example documents with author, handle, and date.
	metadata=(df_mf['User Name']
	          + ' (@' + df_mf['Nickname'] + ') '
	          + df_mf['Date'].astype(str)),
	width_in_pixels=1000
)

print('writing EmojiGender.html')
# Context manager closes the handle deterministically (original leaked it).
with open("EmojiGender.html", 'wb') as out:
	out.write(html.encode('utf-8'))