How to use the scattertext.SampleCorpora.ConventionData2012.get_data function in scattertext

To help you get started, we've selected a few scattertext examples based on popular ways the library is used in public projects.

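SampleCorpora.ConventionData2012.get_data() takes no arguments and returns a pandas DataFrame of speeches from the 2012 US political conventions. As a minimal sketch for getting oriented (the column names are inferred from the examples below, which use 'party', 'text', and 'speaker'):

import scattertext as st

# Load the bundled 2012 convention speech dataset as a pandas DataFrame.
convention_df = st.SampleCorpora.ConventionData2012.get_data()

# The demos below rely on the 'party', 'text', and 'speaker' columns.
print(convention_df.columns.tolist())
print(convention_df[['party', 'speaker']].head())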

From github.com/JasonKessler/scattertext: demo_hedges_r.py
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer, HedgesR
from scattertext.CorpusFromPandas import CorpusFromPandas

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = (CorpusFromPandas(convention_df,
                           category_col='party',
                           text_col='text',
                           nlp=whitespace_nlp_with_sentences)
          .build()
          .get_unigram_corpus())
html = produce_frequency_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    term_scorer=HedgesR(corpus),
    metadata=convention_df['speaker'],
    grey_threshold=0
)
file_name = 'demo_hedges_r.html'
# Write the interactive visualization to disk, as the other demos below do.
open(file_name, 'wb').write(html.encode('utf-8'))
print('Open %s in Chrome or Firefox.' % file_name)

From github.com/JasonKessler/scattertext: demo_similarity.py
import spacy

from scattertext import SampleCorpora, word_similarity_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

def main():
	nlp = spacy.load('en')
	convention_df = SampleCorpora.ConventionData2012.get_data()
	corpus = CorpusFromPandas(convention_df,
	                          category_col='party',
	                          text_col='text',
	                          nlp=nlp).build()
	html = word_similarity_explorer(corpus,
	                                category='democrat',
	                                category_name='Democratic',
	                                not_category_name='Republican',
	                                target_term='jobs',
	                                minimum_term_frequency=5,
	                                width_in_pixels=1000,
	                                metadata=convention_df['speaker'],
	                                alpha=0.01,
	                                max_p_val=0.1,
	                                save_svg_button=True)
	open('./demo_similarity.html', 'wb').write(html.encode('utf-8'))

if __name__ == '__main__':
	main()
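
Note that recent spaCy releases have removed the 'en' shortcut link used in several of these demos; if spacy.load('en') fails, loading a concrete model is the usual substitute (a sketch, assuming the small English model has been downloaded):

import spacy

# Requires: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')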

From github.com/JasonKessler/scattertext: demo_umap_documents.py
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
import umap
import scattertext as st
from scipy.sparse.linalg import svds

convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)
corpus = (st.CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='parse')
          .build()
          .get_stoplisted_unigram_corpus())
corpus = corpus.add_doc_names_as_metadata(corpus.get_df()['speaker'])

embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat())
projection_raw = umap.UMAP(min_dist=0.5, metric='cosine').fit_transform(embeddings).T
projection = pd.DataFrame({'term': corpus.get_metadata(),
                           'x': projection_raw[0],
                           'y': projection_raw[1]}).set_index('term')

category = 'democrat'
scores = (corpus.get_category_ids() == corpus.get_categories().index(category)).astype(int)
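
The excerpt stops after computing the per-document category scores. As a quick, library-neutral way to eyeball the UMAP projection before wiring it into a scattertext explorer, the x/y coordinates can be plotted directly (matplotlib is used here purely for illustration and is not part of the original demo):

import matplotlib.pyplot as plt

# Color each projected document by its binary category score (1 = democrat, 0 = republican).
plt.scatter(projection['x'], projection['y'], c=scores, cmap='coolwarm', s=10)
plt.title('UMAP projection of 2012 convention speeches')
plt.show()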

From github.com/JasonKessler/scattertext: demo_expected_vs_actual.py
import numpy as np
import spacy
from sklearn.linear_model import LogisticRegression

from scattertext import SampleCorpora, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

nlp = spacy.load('en')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=nlp).build()

term_freq_df = corpus.get_term_freq_df()

def scale(ar):
	return (ar - ar.min()) / (ar.max() - ar.min())

def zero_centered_scale(ar):
	ar[ar > 0] = scale(ar[ar > 0])
	ar[ar < 0] = -scale(-ar[ar < 0])
	return (ar + 1) / 2.
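
A toy illustration of what these helpers do (the values are made up and not part of the demo): scale maps an array linearly onto [0, 1], and zero_centered_scale maps negative scores into [0, 0.5] and positive scores into [0.5, 1], with 0 landing at 0.5:

ar = np.array([-2.0, -1.0, 0.0, 1.0, 4.0])
print(scale(ar))                       # [0.    0.167 0.333 0.5   1.   ] (approximately)
print(zero_centered_scale(ar.copy()))  # [0.  0.5 0.5 0.5 1. ]  (mutates its argument, hence the copy)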

From github.com/JasonKessler/scattertext: demo.py
from scattertext.CorpusFromParsedDocuments import CorpusFromParsedDocuments
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_scattertext_explorer

convention_df = SampleCorpora.ConventionData2012.get_data().assign(
	parse = lambda df: df.text.apply(whitespace_nlp_with_sentences)
)
corpus = CorpusFromParsedDocuments(convention_df, category_col='party', parsed_col='parse').build()

html = produce_scattertext_explorer(
	corpus,
	category='democrat',
	category_name='Democratic',
	not_category_name='Republican',
	minimum_term_frequency=5,
	pmi_threshold_coefficient=8,
	width_in_pixels=1000,
	metadata=convention_df['speaker'],
	d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
	d3_url='scattertext/data/viz/scripts/d3.min.js',
)
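
The excerpt ends once the HTML string has been built; following the pattern of the other demos in this repository, the final step is presumably writing it to disk and opening it in a browser:

open('./demo.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo.html in Chrome or Firefox.')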

From github.com/JasonKessler/scattertext: demo_sparse.py
import spacy
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression

from scattertext import SampleCorpora, sparse_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

nlp = spacy.load('en')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=nlp).build()
scores = corpus.get_logreg_coefs('democrat',
                                 LogisticRegression(penalty='l1', C=10, max_iter=10000, n_jobs=-1))
html = sparse_explorer(corpus,
                       category='democrat',
                       scores=scores,
                       category_name='Democratic',
                       not_category_name='Republican',
                       minimum_term_frequency=5,
                       width_in_pixels=1000,
                       metadata=convention_df['speaker'])
open('./demo_sparse.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo_sparse.html in Chrome or Firefox.')
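
One compatibility note on the LogisticRegression call above: newer scikit-learn releases default to the lbfgs solver, which does not support penalty='l1'. If the call raises an error, passing an L1-capable solver such as liblinear is the usual fix (a sketch, not part of the original demo; liblinear ignores n_jobs, so it is dropped here):

from sklearn.linear_model import LogisticRegression

scores = corpus.get_logreg_coefs('democrat',
                                 LogisticRegression(penalty='l1', C=10, max_iter=10000, solver='liblinear'))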