import io
import urllib.request
from zipfile import ZipFile

import agefromname
import nltk.tokenize
import pandas as pd
import scattertext as st
from scattertext import OncePerDocFrequencyRanker

# Download a corpus of geolocated US tweets and read the bundled spreadsheet.
with ZipFile(io.BytesIO(urllib.request.urlopen(
        'http://followthehashtag.com/content/uploads/USA-Geolocated-tweets-free-dataset-Followthehashtag.zip'
).read())) as zf:
    df = pd.read_excel(zf.open('dashboard_x_usa_x_filter_nativeretweets.xlsx'))

# Guess each user's gender from their first name; keep only high-confidence matches.
df['first_name'] = df['User Name'].apply(
    lambda x: x.split()[0].lower() if isinstance(x, str) and x.split() else x)
male_prob = agefromname.AgeFromName().get_all_name_male_prob()
df_aug = pd.merge(df, male_prob, left_on='first_name', right_index=True)
df_aug['gender'] = df_aug['prob'].apply(lambda x: 'm' if x > 0.9 else 'f' if x < 0.1 else '?')
df_mf = df_aug[df_aug['gender'].isin(['m', 'f'])]
df_mf.to_csv('emoji_data.csv', index=False)
# Tokenize the tweets and build a corpus whose features are the emoji each
# gender uses (FeatsFromSpacyDocOnlyEmoji extracts emoji and nothing else).
nlp = st.tweet_tokenizier_factory(nltk.tokenize.TweetTokenizer())
df_mf['parse'] = df_mf['Tweet content'].apply(nlp)
corpus = st.CorpusFromParsedDocuments(
    df_mf,
    parsed_col='parse',
    category_col='gender',
    feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
).build()
html = st.produce_scattertext_explorer(
    corpus,
    category='f',
    category_name='Female',
    not_category_name='Male',
    use_full_doc=True,
    term_ranker=OncePerDocFrequencyRanker,
    sort_by_dist=False,
    # The metadata expression was cut off in the source; a minimal completion:
    metadata=(df_mf['User Name']
              + ' (@' + df_mf['Nickname'] + ')')
)
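# Assumed output step (file name is a guess; the source snippet was cut off above):
open('EmojiGender.html', 'wb').write(html.encode('utf-8'))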
import pandas as pd
# chinese_nlp segments text with jieba before parsing.
from scattertext import CorpusFromParsedDocuments, chinese_nlp, produce_scattertext_explorer


def main():
    df = pd.read_csv('https://cdn.rawgit.com/JasonKessler/scattertext/e508bf32/scattertext/data/chinese.csv')
    df['text'] = df['text'].apply(chinese_nlp)
    corpus = CorpusFromParsedDocuments(df,
                                       category_col='novel',
                                       parsed_col='text').build()
    html = produce_scattertext_explorer(corpus,
                                        category='Tale of Two Cities',
                                        category_name='Tale of Two Cities',
                                        not_category_name='Ulysses',
                                        width_in_pixels=1000,
                                        metadata=df['novel'],
                                        asian_mode=True)
    open('./demo_chinese.html', 'w').write(html)
    print('Open ./demo_chinese.html in Chrome or Firefox.')


if __name__ == '__main__':
    main()
import pandas as pd
from scattertext import CorpusFromParsedDocuments, produce_scattertext_explorer


def make_political_corpus():
    # clean_function_factory, speaker_name_factory, iter_party_speech_pairs, and
    # fast_but_crap_nlp are helper functions defined elsewhere in this demo.
    clean = clean_function_factory()
    get_speaker_name = speaker_name_factory()
    data = []
    for party, speech in iter_party_speech_pairs():
        cleaned_speech = clean(speech)
        speaker_name = get_speaker_name(speech)
        # Skip speeches with no usable text or no identifiable speaker.
        if cleaned_speech and speaker_name != '':
            parsed_speech = fast_but_crap_nlp(cleaned_speech)
            data.append({'party': party,
                         'text': parsed_speech,
                         'speaker': speaker_name})
    source_df = pd.DataFrame(data)
    corpus = CorpusFromParsedDocuments(source_df,
                                       category_col='party',
                                       parsed_col='text').build()
    return corpus, source_df
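# Hypothetical usage sketch (the category values and file name are assumptions,
# not from the source): visualize the two parties' language in the speeches.
if __name__ == '__main__':
    corpus, source_df = make_political_corpus()
    html = produce_scattertext_explorer(corpus,
                                        category='democrat',
                                        category_name='Democratic',
                                        not_category_name='Republican',
                                        metadata=source_df['speaker'])
    open('./political_speeches.html', 'w').write(html)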
from sklearn.decomposition import TruncatedSVD

import scattertext as st
from scattertext import ClassPercentageCompactor

# Build a stoplisted unigram corpus and prune infrequent terms.
convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)
corpus = (st.CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='parse')
          .build()
          .get_stoplisted_unigram_corpus()
          .select(ClassPercentageCompactor(term_count=3)))

# Reduce the term-document matrix to two dimensions with truncated SVD and
# plot each term at its projected coordinates.
html = st.produce_projection_explorer(corpus,
                                      embeddings=corpus.get_term_doc_mat(),
                                      projection_model=TruncatedSVD(n_components=30, n_iter=10),
                                      x_dim=0,
                                      y_dim=1,
                                      category='democrat',
                                      category_name='Democratic',
                                      not_category_name='Republican',
                                      metadata=convention_df.speaker,
                                      width_in_pixels=1000)
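# The snippet stops at the HTML string; an assumed output step (file name is a
# guess) so the visualization can be opened in a browser:
open('./demo_projection.html', 'w').write(html)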
from scattertext import CorpusFromParsedDocuments, FeatsFromOnlyEmpath, SampleCorpora, produce_scattertext_explorer


def main():
    # Use Empath's topic lexicons, rather than the words themselves, as features.
    convention_df = SampleCorpora.ConventionData2012.get_data()
    feat_builder = FeatsFromOnlyEmpath()
    corpus = CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='text',
                                       feats_from_spacy_doc=feat_builder).build()
    html = produce_scattertext_explorer(corpus,
                                        category='democrat',
                                        category_name='Democratic',
                                        not_category_name='Republican',
                                        width_in_pixels=1000,
                                        metadata=convention_df['speaker'],
                                        use_non_text_features=True,
                                        use_full_doc=True,
                                        topic_model_term_lists=feat_builder.get_top_model_term_lists())
    open('./Convention-Visualization-Empath.html', 'wb').write(html.encode('utf-8'))
    print('Open ./Convention-Visualization-Empath.html in Chrome or Firefox.')


if __name__ == '__main__':
    main()
import scattertext as st
df = st.SampleCorpora.ConventionData2012.get_data().assign(
parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences)
)
# Keep only unigrams, compacted to the 2,000 terms most associated with a category.
corpus = st.CorpusFromParsedDocuments(
    df, category_col='party', parsed_col='parse'
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
html = st.produce_scattertext_explorer(
corpus,
category='democrat',
category_name='Democratic',
not_category_name='Republican',
minimum_term_frequency=0, pmi_threshold_coefficient=0,
width_in_pixels=1000, metadata=corpus.get_df()['speaker'],
transform=st.Scalers.dense_rank,
max_overlapping=3
)
open('./demo_compact.html', 'w').write(html)
print('Open ./demo_compact.html in Chrome.')
import scattertext as st
import spacy

# The 'en' shortcut is gone in spaCy 3; load the small English pipeline instead.
nlp = spacy.load('en_core_web_sm')
df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: list(nlp.pipe(df.text))
)
corpus = st.CorpusFromParsedDocuments(
    df,
    category_col='party',
    parsed_col='parse',
    # spaCy labels person names 'PERSON'; the source's 'NAME' is not a spaCy entity type.
    feats_from_spacy_doc=st.SpacyEntities(entity_types_to_use=['PERSON', 'LOC'])
).build()
html = st.produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=0, pmi_threshold_coefficient=0,
    width_in_pixels=1000, metadata=corpus.get_df()['speaker'],
    transform=st.Scalers.dense_rank,
    max_overlapping=10,
    max_docs_per_category=0
)
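# Assumed output step (file name is a guess; the source snippet was cut off):
open('./demo_names.html', 'w').write(html)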
import pandas as pd
import umap
from sklearn.feature_extraction.text import TfidfTransformer

import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)
corpus = (st.CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='parse')
          .build()
          .get_stoplisted_unigram_corpus())
# Register each document's speaker as metadata so documents can be plotted by name.
corpus = corpus.add_doc_names_as_metadata(corpus.get_df()['speaker'])

# Tf-idf weight the document vectors and project each document into 2-D with UMAP.
embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat())
projection_raw = umap.UMAP(min_dist=0.5, metric='cosine').fit_transform(embeddings).T
projection = pd.DataFrame({'term': corpus.get_metadata(),
                           'x': projection_raw[0],
                           'y': projection_raw[1]}).set_index('term')

# Color each document by whether it belongs to the Democratic category.
category = 'democrat'
scores = (corpus.get_category_ids() == corpus.get_categories().index(category)).astype(int)
html = st.produce_pca_explorer(corpus,
                               category=category,
                               # The source snippet was cut off here; a minimal,
                               # assumed completion using the computed projection:
                               category_name='Democratic',
                               not_category_name='Republican',
                               projection=projection,
                               scores=scores,
                               use_non_text_features=True,
                               use_full_doc=True,
                               show_top_terms=False)
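# Assumed output step (file name is a guess; the source snippet ends mid-call):
open('./demo_umap_documents.html', 'w').write(html)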
import pandas as pd
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import TfidfTransformer

import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(st.whitespace_nlp_with_sentences)
corpus = (st.CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='parse')
          .build()
          .get_stoplisted_unigram_corpus()
          .remove_infrequent_words(minimum_term_count=3,
                                   term_ranker=st.OncePerDocFrequencyRanker))

# Factor the tf-idf weighted term-document matrix with sparse SVD and use two
# left singular vectors as each term's coordinates.
embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat()).T
U, S, VT = svds(embeddings, k=3, maxiter=20000, which='LM')
x_dim = 0
y_dim = 1
projection = pd.DataFrame({'term': corpus.get_terms(),
                           'x': U.T[x_dim],
                           'y': U.T[y_dim]}).set_index('term')

html = st.produce_pca_explorer(corpus,
                               category='democrat',
                               category_name='Democratic',
                               not_category_name='Republican',
                               # The source snippet was cut off here; a minimal,
                               # assumed completion passing the SVD projection:
                               projection=projection,
                               x_dim=x_dim,
                               y_dim=y_dim,
                               metadata=convention_df['speaker'])
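# Assumed output step (file name is a guess; the source snippet ends mid-call):
open('./demo_pca.html', 'w').write(html)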