How to use the scattertext.SampleCorpora.ConventionData2012 function in scattertext

To help you get started, we've selected a few scattertext examples drawn from popular ways this function is used in public projects.


JasonKessler / scattertext / demo_compact.py (view on GitHub)
import scattertext as st

df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences)
)

corpus = st.CorpusFromParsedDocuments(
    df, category_col='party', parsed_col='parse'
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))

html = st.produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=0, pmi_threshold_coefficient=0,
    width_in_pixels=1000, metadata=corpus.get_df()['speaker'],
    transform=st.Scalers.dense_rank,
    max_overlapping=3
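
The listing above is cut off mid-call; whatever additional arguments the full demo passes are not shown here. Since everything after corpus is a keyword argument, the call can simply be closed at this point and the page written out the way the other demos on this page do (the demo_compact.html filename is an assumption, not taken from the original file):

)  # closes the produce_scattertext_explorer(...) call started above
open('./demo_compact.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo_compact.html in Chrome or Firefox.')
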
JasonKessler / scattertext / demo_without_spacy.py (view on GitHub)
from scattertext import SampleCorpora, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.WhitespaceNLP import whitespace_nlp

nlp = whitespace_nlp

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=nlp).build()

html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    minimum_term_frequency=5,
                                    width_in_pixels=1000,
                                    metadata=convention_df['speaker'])
open('./demo_without_spacy.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo_without_spacy.html in Chrome or Firefox.')
JasonKessler / scattertext / demo_custom_coordinates.py (view on GitHub)
import numpy as np
import spacy
from sklearn.linear_model import LogisticRegression

from scattertext import SampleCorpora, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.Scalers import scale

nlp = spacy.load('en')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=nlp).build()

term_freq_df = corpus.get_term_freq_df()


def zero_centered_scale(ar):
	ar[ar > 0] = scale(ar[ar > 0])
	ar[ar < 0] = -scale(-ar[ar < 0])
	return (ar + 1) / 2.


frequencies_scaled = scale(np.log(term_freq_df.sum(axis=1).values))
scores = corpus.get_logreg_coefs('democrat',
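
The get_logreg_coefs call is truncated in the listing; given the otherwise unused LogisticRegression import, it presumably passes a scikit-learn classifier as its second argument. Below is a hedged sketch of how the remaining pieces could be wired into custom coordinates: the classifier settings, the x_coords/y_coords choices, and the output filename are assumptions, not the file's verbatim code. (Note also that recent spaCy releases load the English model as spacy.load('en_core_web_sm') rather than spacy.load('en').)

scores = corpus.get_logreg_coefs('democrat',
                                 LogisticRegression(penalty='l2', C=10, max_iter=10000))

html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    scores=scores,
                                    x_coords=frequencies_scaled,           # scaled log term frequency on the x-axis
                                    y_coords=zero_centered_scale(scores),  # re-centered regression coefficients on the y-axis
                                    metadata=convention_df['speaker'])
open('./demo_custom_coordinates.html', 'wb').write(html.encode('utf-8'))
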
JasonKessler / scattertext / demo_gensim_similarity.py (view on GitHub)
def main():
	nlp = spacy.load('en')
	#nlp = whitespace_nlp_with_sentences
	convention_df = SampleCorpora.ConventionData2012.get_data()
	convention_df['parsed'] = convention_df.text.apply(nlp)
	corpus = (CorpusFromParsedDocuments(convention_df,
	                                   category_col='party',
	                                   parsed_col='parsed')
	          .build()
	          .get_unigram_corpus())
	model = word2vec.Word2Vec(size=100,
	                          alpha=0.025,
	                          window=5,
	                          min_count=5,
	                          max_vocab_size=None,
	                          sample=0,
	                          seed=1,
	                          workers=1,
	                          min_alpha=0.0001,
	                          sg=1,
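
This snippet shows the body of main() without its import block, and the Word2Vec(...) constructor is cut off before its closing parenthesis. A sketch of the imports the visible code needs (the original file's exact import block is not shown):

import spacy
from gensim.models import word2vec                           # word2vec.Word2Vec(...) above
from scattertext import SampleCorpora, CorpusFromParsedDocuments
# note: in gensim 4.x the Word2Vec 'size' keyword was renamed 'vector_size'
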
JasonKessler / scattertext / demo_embeddings_pca.py (view on GitHub)
import scattertext as st
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse.linalg import svds

convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)
corpus = (st.CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='parse')
          .build()
          .get_stoplisted_unigram_corpus()
          .remove_infrequent_words(minimum_term_count=3, term_ranker=st.OncePerDocFrequencyRanker))
embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat()).T
U, S, VT = svds(embeddings, k = 3, maxiter=20000, which='LM')

x_dim = 0; y_dim = 1
projection = pd.DataFrame({'term':corpus.get_terms(),
                           'x':U.T[x_dim],
                           'y':U.T[y_dim]}).set_index('term')
html = st.produce_pca_explorer(corpus,
                               category='democrat',
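
The produce_pca_explorer call is truncated after its category argument. A plausible completion, assuming it passes the projection DataFrame and dimensions computed above along with the usual labelling arguments; the original file's exact arguments and output filename are not shown here:

html = st.produce_pca_explorer(corpus,
                               category='democrat',
                               category_name='Democratic',
                               not_category_name='Republican',
                               projection=projection,   # the per-term x/y coordinates built above
                               metadata=convention_df['speaker'],
                               x_dim=x_dim,
                               y_dim=y_dim)
open('./demo_embeddings_pca.html', 'wb').write(html.encode('utf-8'))
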
JasonKessler / scattertext / demo_empath.py (view on GitHub)
def main():
	convention_df = SampleCorpora.ConventionData2012.get_data()
	feat_builder = FeatsFromOnlyEmpath()
	corpus = CorpusFromParsedDocuments(convention_df,
	                                   category_col='party',
	                                   parsed_col='text',
	                                   feats_from_spacy_doc=feat_builder).build()
	html = produce_scattertext_explorer(corpus,
	                                    category='democrat',
	                                    category_name='Democratic',
	                                    not_category_name='Republican',
	                                    width_in_pixels=1000,
	                                    metadata=convention_df['speaker'],
	                                    use_non_text_features=True,
	                                    use_full_doc=True,
	                                    topic_model_term_lists=feat_builder.get_top_model_term_lists())
	open('./Convention-Visualization-Empath.html', 'wb').write(html.encode('utf-8'))
	print('Open ./Convention-Visualization-Empath.html in Chrome or Firefox.')
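
As in the other function-body snippets, the import block is not shown. The names used above all come from scattertext (FeatsFromOnlyEmpath additionally requires the empath package to be installed); a sketch of the imports the visible code needs:

from scattertext import (SampleCorpora, CorpusFromParsedDocuments,
                         FeatsFromOnlyEmpath, produce_scattertext_explorer)
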
JasonKessler / scattertext / demo_tsne_style_for_publication.py (view on GitHub)
import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)

corpus = (st.CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='parse')
          .build().get_stoplisted_unigram_corpus())


html = st.produce_projection_explorer(corpus,
                                      category='democrat',
                                      category_name='Democratic',
                                      not_category_name='Republican',
                                      metadata=convention_df.speaker,
                                      color_func='''(function(d) {return d.s > 0.5 ? d3.interpolateRdYlBu(0.6) : d3.interpolateRdYlBu(0.4) })''',
                                      center_label_over_points = True,
                                      censor_points=True,
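
The produce_projection_explorer call is cut off; whatever further styling arguments the full file passes are not shown. Because everything after corpus is a keyword argument, it can be closed at this point and the page written out like the other demos (the filename is an assumption):

                                      )  # closes produce_projection_explorer(...)
open('./demo_tsne_style_for_publication.html', 'wb').write(html.encode('utf-8'))
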
JasonKessler / scattertext / demo_scaled_f_score.py (view on GitHub)
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.termscoring.ScaledFScore import ScaledFScorePresetsNeg1To1

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=whitespace_nlp_with_sentences).build().get_unigram_corpus()
html = produce_frequency_explorer(corpus,
                                  category='democrat',
                                  category_name='Democratic',
                                  not_category_name='Republican',
                                  minimum_term_frequency=5,
                                  width_in_pixels=1000,
                                  term_scorer=ScaledFScorePresetsNeg1To1(
	                                      beta=1,
	                                      scaler_algo='normcdf'
                                      ),
                                  grey_threshold=0,
                                  y_axis_values=[-1, 0, 1],
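
This call is likewise truncated after y_axis_values; closing it here and writing the result to disk (filename assumed) follows the same pattern as the other demos:

                                  )  # closes produce_frequency_explorer(...)
open('./demo_scaled_f_score.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo_scaled_f_score.html in Chrome or Firefox.')
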
JasonKessler / scattertext / demo_pytextrank.py (view on GitHub)
from scattertext import SampleCorpora, RankDifference, dense_rank, PyTextRankPhrases, AssociationCompactor, \
    produce_scattertext_explorer
from scattertext import CorpusFromParsedDocuments
import spacy
import numpy as np
import pytextrank

nlp = spacy.load('en')

convention_df = SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(nlp),
    party=lambda df: df.party.apply({'democrat': 'Democratic', 'republican': 'Republican'}.get)
)

corpus = CorpusFromParsedDocuments(
    convention_df,
    category_col='party',
    parsed_col='parse',
    feats_from_spacy_doc=PyTextRankPhrases()
).build(
).compact(
    AssociationCompactor(2000, use_non_text_features=True)
)

print('Aggregate PyTextRank phrase scores')
term_category_scores = corpus.get_metadata_freq_df('')
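
Note that pytextrank is imported above but never attached to the spaCy pipeline in the visible portion of the snippet, and PyTextRankPhrases only finds phrases if the textrank component has run. A sketch of the registration step, which would need to happen before df.text.apply(nlp); the original file's exact wiring is not shown:

# pytextrank 2.x / spaCy 2.x style:
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)

# pytextrank 3.x / spaCy 3.x style:
# nlp.add_pipe('textrank')
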
JasonKessler / scattertext / demo_pair_plot_convention.py (view on GitHub)
import scattertext as st
import scattertext.categoryprojector.pairplot

convention_df = st.SampleCorpora.ConventionData2012.get_data()


corpus = st.CorpusFromPandas(
	convention_df,
	category_col='speaker',
	text_col='text',
	nlp=st.whitespace_nlp_with_sentences
).build().get_unigram_corpus()
html = scattertext.categoryprojector.pairplot.produce_pairplot(
	corpus,
	metadata=convention_df['party'] + ': ' + convention_df['speaker']
)

file_name = 'convention_pair_plot.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('./' + file_name)