Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer, HedgesR
from scattertext.CorpusFromPandas import CorpusFromPandas
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = (CorpusFromPandas(convention_df,
category_col='party',
text_col='text',
nlp=whitespace_nlp_with_sentences)
.build()
.get_unigram_corpus())
html = produce_frequency_explorer(
corpus,
category='democrat',
category_name='Democratic',
not_category_name='Republican',
term_scorer=HedgesR(corpus),
metadata=convention_df['speaker'],
grey_threshold=0
)
file_name = 'demo_hedges_r.html'
def main():
nlp = spacy.load('en')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
category_col='party',
text_col='text',
nlp=nlp).build()
html = word_similarity_explorer(corpus,
category='democrat',
category_name='Democratic',
not_category_name='Republican',
target_term='jobs',
minimum_term_frequency=5,
width_in_pixels=1000,
metadata=convention_df['speaker'],
alpha=0.01,
max_p_val=0.1,
save_svg_button=True)
open('./demo_similarity.html', 'wb').write(html.encode('utf-8'))
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
import umap
import scattertext as st
from scipy.sparse.linalg import svds
convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)
corpus = (st.CorpusFromParsedDocuments(convention_df,
category_col='party',
parsed_col='parse')
.build()
.get_stoplisted_unigram_corpus())
corpus = corpus.add_doc_names_as_metadata(corpus.get_df()['speaker'])
embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat())
projection_raw = umap.UMAP(min_dist=0.5, metric='cosine').fit_transform(embeddings).T
projection = pd.DataFrame({'term': corpus.get_metadata(),
'x': projection_raw[0],
'y': projection_raw[1]}).set_index('term')
category = 'democrat'
scores = (corpus.get_category_ids() == corpus.get_categories().index(category)).astype(int)
import numpy as np
import spacy
from sklearn.linear_model import LogisticRegression
from scattertext import SampleCorpora, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
nlp = spacy.load('en')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
category_col='party',
text_col='text',
nlp=nlp).build()
term_freq_df = corpus.get_term_freq_df()
def scale(ar):
return (ar - ar.min()) / (ar.max() - ar.min())
def zero_centered_scale(ar):
ar[ar > 0] = scale(ar[ar > 0])
ar[ar < 0] = -scale(-ar[ar < 0])
return (ar + 1) / 2.
from scattertext.CorpusFromParsedDocuments import CorpusFromParsedDocuments
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_scattertext_explorer
convention_df = SampleCorpora.ConventionData2012.get_data().assign(
parse = lambda df: df.text.apply(whitespace_nlp_with_sentences)
)
corpus = CorpusFromParsedDocuments(convention_df, category_col='party', parsed_col='parse').build()
html = produce_scattertext_explorer(
corpus,
category='democrat',
category_name='Democratic',
not_category_name='Republican',
minimum_term_frequency=5,
pmi_threshold_coefficient=8,
width_in_pixels=1000,
metadata=convention_df['speaker'],
d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
d3_url='scattertext/data/viz/scripts/d3.min.js',
)
import spacy
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from scattertext import SampleCorpora, sparse_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
nlp = spacy.load('en')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
category_col='party',
text_col='text',
nlp=nlp).build()
scores = corpus.get_logreg_coefs('democrat',
LogisticRegression(penalty='l1', C=10, max_iter=10000, n_jobs=-1))
html = sparse_explorer(corpus,
category='democrat',
scores=scores,
category_name='Democratic',
not_category_name='Republican',
minimum_term_frequency=5,
width_in_pixels=1000,
metadata=convention_df['speaker'])
open('./demo_sparse.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo_sparse.html in Chrome or Firefox.')