How to use scattertext - 10 common examples

To help you get started, we've selected ten scattertext examples based on popular ways the library is used in public projects. Each snippet's header names its source file in the JasonKessler/scattertext repository on GitHub.

github JasonKessler / scattertext / demo_emoji.py
import io
import urllib.request
from zipfile import ZipFile

import agefromname
import nltk
import pandas as pd
import scattertext as st
from scattertext.termranking import OncePerDocFrequencyRanker

# The original demo caches this download to emoji_data.csv; the surrounding
# control flow is flattened here so the excerpt runs top to bottom.
with ZipFile(io.BytesIO(urllib.request.urlopen(
		'http://followthehashtag.com/content/uploads/USA-Geolocated-tweets-free-dataset-Followthehashtag.zip'
).read())) as zf:
	df = pd.read_excel(zf.open('dashboard_x_usa_x_filter_nativeretweets.xlsx'))

# Guess each author's gender from the male/female probability of their first name.
df['first_name'] = df['User Name'].apply(
	lambda x: x.split()[0].lower() if type(x) == str and len(x.split()) > 0 else x)
male_prob = agefromname.AgeFromName().get_all_name_male_prob()
df_aug = pd.merge(df, male_prob, left_on='first_name', right_index=True)
df_aug['gender'] = df_aug['prob'].apply(lambda x: 'm' if x > 0.9 else 'f' if x < 0.1 else '?')
df_mf = df_aug[df_aug['gender'].isin(['m', 'f'])]
df_mf.to_csv('emoji_data.csv', index=False)

nlp = st.tweet_tokenizier_factory(nltk.tokenize.TweetTokenizer())
df_mf['parse'] = df_mf['Tweet content'].apply(nlp)

corpus = st.CorpusFromParsedDocuments(
	df_mf,
	parsed_col='parse',
	category_col='gender',
	feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
).build()

html = st.produce_scattertext_explorer(
	corpus,
	category='f',
	category_name='Female',
	not_category_name='Male',
	use_full_doc=True,
	term_ranker=OncePerDocFrequencyRanker,
	sort_by_dist=False,
	metadata=(df_mf['User Name']
	          + ' (@' + df_mf['Nickname'] + ')')
)
# The snippet was truncated mid-call; the call is closed minimally above and
# the chart written out following the pattern of the other demos (name assumed).
open('demo_emoji.html', 'wb').write(html.encode('utf-8'))
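Because FeatsFromSpacyDocOnlyEmoji replaces the default feature extractor, the corpus's terms are the emoji themselves. A quick sanity check of the extracted features (hypothetical, not part of the demo; the 'f freq' column name follows get_term_freq_df's "<category> freq" convention):

# Hypothetical check: the ten emoji used most often in female-authored tweets.
term_freq_df = corpus.get_term_freq_df()
print(term_freq_df.sort_values('f freq', ascending=False).head(10))
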
github JasonKessler / scattertext / scattertext / termcompaction / CompactTerms.py
def _get_term_indices_to_compact_from_term_freqs(self, term_freqs, term_doc_matrix, non_text):
		idx = IndexStore()
		tdf_vals = term_freqs.values
		valid_terms_mask = tdf_vals.sum(axis=1) >= self.minimum_term_count
		tdf_vals = term_freqs[valid_terms_mask].values
		terms = np.array(term_freqs.index)[valid_terms_mask]

		# Build a binary term-by-token incidence matrix: row i flags the
		# tokens that make up term i ('create jobs' -> {create, jobs}).
		lengths = []
		fact = CSRMatrixFactory()
		for i, t in enumerate(terms):
			for tok in t.split():
				fact[i, idx.getidx(tok)] = 1
			lengths.append(len(t.split()))
		lengths = np.array(lengths)
		mat = fact.get_csr_matrix()

		# (mat * mat.T)[i, j] counts tokens shared by terms i and j, so
		# coocs[i, j] == 0 means every token of term j also occurs in term i.
		coocs = lengths - (mat * mat.T)
		pairs = np.argwhere(coocs == 0).T
		pairs = self._limit_to_non_identical_terms(pairs)
		pairs = self._limit_to_pairs_of_bigrams_and_a_constituent_unigram(pairs, terms)
		pairs = self._limit_to_redundant_unigrams(pairs, tdf_vals)
		idx_store = term_doc_matrix._get_relevant_idx_store(non_text)
		redundant_terms = idx_store.getidxstrictbatch(terms[np.unique(pairs[:, 1])])
		infrequent_terms = np.argwhere(~valid_terms_mask).T[0]
		terms_to_remove = np.concatenate([redundant_terms, infrequent_terms])
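The interesting move is the coocs line: since mat is binary, (mat * mat.T)[i, j] counts the tokens terms i and j share, and subtracting that from each term's token count leaves a zero exactly where one term's tokens are a subset of another's. A self-contained sketch of the same trick, using toy terms and plain scipy in place of scattertext's CSRMatrixFactory:

import numpy as np
from scipy.sparse import csr_matrix

# Toy vocabulary: 'jobs' is a constituent of the bigram 'create jobs'.
terms = ['jobs', 'create jobs', 'economy']

tok_idx = {}
rows, cols = [], []
for i, term in enumerate(terms):
    for tok in term.split():
        rows.append(i)
        cols.append(tok_idx.setdefault(tok, len(tok_idx)))
mat = csr_matrix((np.ones(len(rows)), (rows, cols)),
                 shape=(len(terms), len(tok_idx)))

lengths = np.array([len(t.split()) for t in terms])
# coocs[i, j] == 0  <=>  every token of term j occurs in term i
coocs = lengths - (mat * mat.T).toarray()
print(np.argwhere(coocs == 0))
# Besides the diagonal self-pairs, this flags (1, 0): 'jobs' inside 'create jobs'.

The method then discards the identical pairs and keeps only bigram/constituent-unigram pairs whose frequencies mark the unigram as redundant, which is what the _limit_to_* helpers above do.
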
github JasonKessler / scattertext / scattertext / TermDocMatrixFactory.py
    # term_idx_store, X_factory and y are created earlier in the original
    # factory method; they are restated here so the excerpt reads on its own.
    term_idx_store = IndexStore()
    category_idx_store = IndexStore()
    X_factory = CSRMatrixFactory()
    mX_factory = CSRMatrixFactory()
    y = []
    for doci, (category, text) in enumerate(category_text_iter):
        y.append(category_idx_store.getidx(category))
        term_freq = Counter()
        for sent in text.strip(string.punctuation).lower().split('\n'):
            unigrams = []
            for tok in sent.strip().split():
                unigrams.append(tok)
            bigrams = list(map(' '.join, zip(unigrams[:-1], unigrams[1:])))
            for term in unigrams + bigrams:
                term_freq[term_idx_store.getidx(term)] += 1
        for word_idx, freq in term_freq.items():
            X_factory[doci, word_idx] = freq
    metadata_idx_store = IndexStore()
    return TermDocMatrix(X=X_factory.get_csr_matrix(),
                         mX=mX_factory.get_csr_matrix(),
                         y=np.array(y),
                         term_idx_store=term_idx_store,
                         metadata_idx_store=metadata_idx_store,
                         category_idx_store=category_idx_store)
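The one subtle line above is the bigram construction, which zips each token with its successor. A quick illustration with a made-up sentence:

unigrams = ['we', 'will', 'create', 'jobs']
bigrams = list(map(' '.join, zip(unigrams[:-1], unigrams[1:])))
print(bigrams)  # ['we will', 'will create', 'create jobs']
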
github JasonKessler / scattertext / demo_hedges_r.py
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer, HedgesR
from scattertext.CorpusFromPandas import CorpusFromPandas

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = (CorpusFromPandas(convention_df,
                           category_col='party',
                           text_col='text',
                           nlp=whitespace_nlp_with_sentences)
          .build()
          .get_unigram_corpus())
html = produce_frequency_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    term_scorer=HedgesR(corpus),
    metadata=convention_df['speaker'],
    grey_threshold=0
)
file_name = 'demo_hedges_r.html'
open(file_name, 'wb').write(html.encode('utf-8'))
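produce_frequency_explorer plots each term's frequency against its score, here HedgesR, an effect-size-style scorer. Any term scorer with the same interface can be swapped in; for example (a hypothetical variant, not part of the demo):

from scattertext import RankDifference

# Same plot, scored by rank difference instead of Hedges' r.
html = produce_frequency_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    term_scorer=RankDifference(),
    metadata=convention_df['speaker']
)
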
github JasonKessler / scattertext / demo_compact.py
import scattertext as st

df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences)
)

corpus = st.CorpusFromParsedDocuments(
    df, category_col='party', parsed_col='parse'
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))

html = st.produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=0, pmi_threshold_coefficient=0,
    width_in_pixels=1000, metadata=corpus.get_df()['speaker'],
    transform=st.Scalers.dense_rank,
    max_overlapping=3
)
# Truncated in the snippet; close the call and write the page (name assumed).
open('demo_compact.html', 'wb').write(html.encode('utf-8'))
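AssociationCompactor(2000) prunes the unigram corpus to the 2,000 terms most strongly associated with the categories, which keeps the generated HTML small. A quick before-and-after vocabulary count (hypothetical check, not in the demo):

full = st.CorpusFromParsedDocuments(
    df, category_col='party', parsed_col='parse'
).build().get_unigram_corpus()

print(full.get_num_terms())                                         # full vocabulary size
print(full.compact(st.AssociationCompactor(2000)).get_num_terms())  # 2000
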
github JasonKessler / scattertext / demo_without_spacy.py
from scattertext import SampleCorpora, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.WhitespaceNLP import whitespace_nlp

nlp = whitespace_nlp

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=nlp).build()

html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    minimum_term_frequency=5,
                                    width_in_pixels=1000,
                                    metadata=convention_df['speaker'])
open('./demo_without_spacy.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo_without_spacy.html in Chrome or Firefox.')
github JasonKessler / scattertext / demo_similarity.py
import spacy

from scattertext import SampleCorpora, word_similarity_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas


def main():
	# spaCy 3.x removed the 'en' shortcut link; with current versions use
	# spacy.load('en_core_web_sm') instead.
	nlp = spacy.load('en')
	convention_df = SampleCorpora.ConventionData2012.get_data()
	corpus = CorpusFromPandas(convention_df,
	                          category_col='party',
	                          text_col='text',
	                          nlp=nlp).build()
	html = word_similarity_explorer(corpus,
	                                category='democrat',
	                                category_name='Democratic',
	                                not_category_name='Republican',
	                                target_term='jobs',
	                                minimum_term_frequency=5,
	                                width_in_pixels=1000,
	                                metadata=convention_df['speaker'],
	                                alpha=0.01,
	                                max_p_val=0.1,
	                                save_svg_button=True)
	open('./demo_similarity.html', 'wb').write(html.encode('utf-8'))


if __name__ == '__main__':
	main()
github JasonKessler / scattertext / demo_umap_documents.py
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
import umap
import scattertext as st
from scipy.sparse.linalg import svds

convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)
corpus = (st.CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='parse')
          .build()
          .get_stoplisted_unigram_corpus())
corpus = corpus.add_doc_names_as_metadata(corpus.get_df()['speaker'])

embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat())
projection_raw = umap.UMAP(min_dist=0.5, metric='cosine').fit_transform(embeddings).T
projection = pd.DataFrame({'term': corpus.get_metadata(),
                           'x': projection_raw[0],
                           'y': projection_raw[1]}).set_index('term')

category = 'democrat'
scores = (corpus.get_category_ids() == corpus.get_categories().index(category)).astype(int)
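The excerpt stops after computing per-document category scores. A hedged sketch of how the UMAP projection might be rendered, assuming produce_scattertext_explorer's x_coords/y_coords parameters (which expect values scaled to [0, 1]) and use_non_text_features for plotting the document names added as metadata above; see demo_umap_documents.py for the demo's exact continuation:

# Hedged sketch, not verbatim from the demo: place each document (a metadata
# feature here) at its UMAP coordinates, colored by its category score.
html = st.produce_scattertext_explorer(
    corpus,
    category=category,
    category_name='Democratic',
    not_category_name='Republican',
    use_non_text_features=True,
    x_coords=st.Scalers.scale(projection['x']),
    y_coords=st.Scalers.scale(projection['y']),
    scores=scores,
    metadata=corpus.get_df()['speaker']
)
open('demo_umap_documents.html', 'wb').write(html.encode('utf-8'))
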
github JasonKessler / scattertext / demo_custom_coordinates.py
import numpy as np
import spacy
from sklearn.linear_model import LogisticRegression

from scattertext import SampleCorpora, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.Scalers import scale

nlp = spacy.load('en')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=nlp).build()

term_freq_df = corpus.get_term_freq_df()


def zero_centered_scale(ar):
	ar[ar > 0] = scale(ar[ar > 0])
	ar[ar < 0] = -scale(-ar[ar < 0])
	return (ar + 1) / 2.


frequencies_scaled = scale(np.log(term_freq_df.sum(axis=1).values))
# The snippet was cut off mid-call; the demo passes a scikit-learn classifier
# here (hyperparameters elided).
scores = corpus.get_logreg_coefs('democrat',
                                 LogisticRegression())
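The rest of the demo, elided by the snippet, is where the custom coordinates come in: x is the scaled log frequency computed above and y the zero-centered, scaled regression coefficient. A hedged reconstruction of that wiring; see demo_custom_coordinates.py for the exact arguments:

# Hedged reconstruction of the demo's continuation.
scores_scaled = zero_centered_scale(scores)

html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    minimum_term_frequency=5,
                                    width_in_pixels=1000,
                                    x_coords=frequencies_scaled,
                                    y_coords=scores_scaled,
                                    scores=scores,
                                    sort_by_dist=False,
                                    metadata=convention_df['speaker'])
open('./demo_custom_coordinates.html', 'wb').write(html.encode('utf-8'))
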
github JasonKessler / scattertext / demo_expected_vs_actual.py
import numpy as np
import spacy
from sklearn.linear_model import LogisticRegression

from scattertext import SampleCorpora, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

nlp = spacy.load('en')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=nlp).build()

term_freq_df = corpus.get_term_freq_df()

def scale(ar):
	return (ar - ar.min()) / (ar.max() - ar.min())

def zero_centered_scale(ar):
	ar[ar > 0] = scale(ar[ar > 0])
	ar[ar < 0] = -scale(-ar[ar < 0])
	return (ar + 1) / 2.
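
zero_centered_scale rescales the positive and negative values separately onto [0, 1], with 0.5 as the neutral midpoint, so terms over- and under-used by a category are treated symmetrically. A small worked example with made-up numbers:

ar = np.array([-4.0, -1.0, 0.5, 2.0])
print(zero_centered_scale(ar))  # [0., 0.5, 0.5, 1.]

Note the edge behavior: the largest negative value and the smallest positive value both land on the 0.5 midpoint.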