# Project the tf-idf matrix to two dimensions and plot the classes.
components = dimred.fit_transform(tfidf)
components_df = pd.DataFrame(data=components, columns=["principal component 1", "principal component 2"])
labels_df = pd.DataFrame(data=enc.inverse_transform(labels), columns=["class"])
plot_df = pd.concat((components_df, labels_df), axis=1)
c = alt.Chart(plot_df, title="dimensionality reduction", height=600).mark_circle(size=20).encode(
    x="principal component 1",
    y="principal component 2",
    color=alt.Color("class", scale=alt.Scale(scheme="blues")),
    tooltip=["class"],
).interactive()
st.altair_chart(c)
st.write("The explained variance is", np.round(np.sum(dimred.explained_variance_ratio_) * 100, 2), "%.")
# MODEL BUILDING.
st.header("Model Building")
st.write("The model is based on a **random forest**. Customize the model in the sidebar.")
st.sidebar.header("Customizing the model.")
n_estimators = st.sidebar.text_input('Number of trees in random forest.', '1000')
max_leaf_nodes = st.sidebar.text_input('Maximum number of leaf nodes.', '25')
max_depth = st.sidebar.text_input('Maximum depth.', '5')
class_weight = st.sidebar.selectbox("Class weights for the model.",('balanced','balanced_subsample'))
forest_clf = RandomForestClassifier(
    n_estimators=int(n_estimators),
    max_depth=int(max_depth),
    max_leaf_nodes=int(max_leaf_nodes),
    class_weight=class_weight,
    oob_score=True,
    n_jobs=-1,
    random_state=0,
)  # Define classifier to optimize.
#parameters = {'max_leaf_nodes':np.linspace(20,35,14,dtype='int')} # Define grid.
#clf = RandomizedSearchCV(forest_clf, parameters, n_iter=10, cv=3, iid=False, scoring='accuracy', n_jobs=-1) # Accuracy as performance measure.
#@st.cache(show_spinner=False)
def train():
    classifier = forest_clf.fit(tfidf, labels)  # Train/optimize classifier.
    #forest = classifier.best_estimator_
    feature_importances = classifier.feature_importances_
    indices = np.argsort(feature_importances)[::-1]
    return classifier, feature_importances, indices

classifier, feature_importances, indices = train()

# Analyze feature importance.
n_f = 30  # Number of desired features.
sorted_feature_names = []
for f in range(n_f):
    sorted_feature_names.append(feature_names[indices[f]])
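A hedged sketch of how the top features might then be shown in the app; the Altair bar chart below is an illustrative choice, not necessarily how the original app displays them:

# Show the n_f most important tf-idf features as a bar chart.
importance_df = pd.DataFrame({
    "feature": sorted_feature_names,
    "importance": feature_importances[indices[:n_f]],
})
bar = alt.Chart(importance_df, title="feature importance").mark_bar().encode(
    x="importance",
    y=alt.Y("feature", sort="-x"),
)
st.altair_chart(bar)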
def components():
    apps = get_apps(JSON_URL)  # type: Dict[str, str]
    logger.info(apps)
    app_names = []
    for name, _ in apps.items():
        app_names.append(name)
    run_app = st.sidebar.selectbox("Select the component", app_names)
    # Fetch the content.
    python_code = get_file_content_as_string(apps[run_app])
    # Run the child app.
    if python_code is not None:
        try:
            st.header("Result")
            # Note: exec() runs the downloaded code; only use trusted sources.
            exec(python_code)
            st.header("Source code")
            st.markdown("Link: [Github](%s)" % apps[run_app])
            st.code(python_code)
        except Exception as e:
            st.write("An error occurred while executing [{0}].".format(run_app))
            st.error(str(e))
            logger.error(e)
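`get_apps` and `get_file_content_as_string` are not shown in this snippet. A minimal sketch of what they might look like, assuming `JSON_URL` points at a JSON mapping of app names to raw-file URLs; both helpers and the URL format are assumptions, not the original implementations:

import json
import urllib.request
from typing import Dict

def get_apps(json_url: str) -> Dict[str, str]:
    # Download a {app name: raw source URL} mapping.
    with urllib.request.urlopen(json_url) as response:
        return json.loads(response.read().decode("utf-8"))

def get_file_content_as_string(url: str) -> str:
    # Download the child app's source code as text.
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf-8")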
@st.cache(ignore_hash=True)
def process_text(model_name, text):
    nlp = load_model(model_name)
    return nlp(text)
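# load_model is defined elsewhere in the original gist. A minimal sketch,
# assuming it simply caches spacy.load; note that ignore_hash comes from
# older Streamlit releases (it was later renamed allow_output_mutation).
import spacy

@st.cache(ignore_hash=True)
def load_model(name):
    # Load (and cache) a spaCy pipeline such as "en_core_web_sm".
    return spacy.load(name)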
st.sidebar.title("Interactive spaCy visualizer")
st.sidebar.markdown(
"""
Process text with [spaCy](https://spacy.io) models and visualize named entities,
dependencies and more. Uses spaCy's built-in
[displaCy](http://spacy.io/usage/visualizers) visualizer under the hood.
"""
)
spacy_model = st.sidebar.selectbox("Model name", SPACY_MODEL_NAMES)
model_load_state = st.info(f"Loading model '{spacy_model}'...")
nlp = load_model(spacy_model)
model_load_state.empty()
text = st.text_area("Text to analyze", DEFAULT_TEXT)
doc = process_text(spacy_model, text)
if "parser" in nlp.pipe_names:
st.header("Dependency Parse & Part-of-speech tags")
st.sidebar.header("Dependency Parse")
split_sents = st.sidebar.checkbox("Split sentences", value=True)
collapse_punct = st.sidebar.checkbox("Collapse punctuation", value=True)
collapse_phrases = st.sidebar.checkbox("Collapse phrases")
compact = st.sidebar.checkbox("Compact mode")
options = {
"collapse_punct": collapse_punct,
# Fragment of a sidebar-menu dispatcher; the opening branches are not shown.
    show_test_predict_plot = st.sidebar.checkbox('Test set forecast', value=True)
    return show_test_predict_plot
elif menu_name == 'feature_target':
    data_frequency = st.sidebar.selectbox('What is the FREQUENCY of your data?', ['Select a frequency', 'Hourly', 'Daily', 'Monthly', 'Quarterly', 'Yearly'], 0)
    # If the user does not select a frequency for the dataset, raise an error.
    if data_frequency == 'Select a frequency':
        # Hide the traceback in order to show only the error message.
        sys.tracebacklimit = 0
        raise ValueError('Please select the FREQUENCY for your data.')
    # Restore the traceback.
    sys.tracebacklimit = None
    st.sidebar.markdown('### Choosing columns')
    ds_column = st.sidebar.selectbox('Which one is your DATE column?', df.columns, 0)
    y = st.sidebar.selectbox('Which column do you want to PREDICT?', df.columns, 1)
    exog_variables = st.sidebar.multiselect('Which are your exogenous variables?', df.drop([ds_column, y], axis=1).columns)
    test_set_size = st.sidebar.slider('Validation set size', 3, 30, seasonality_dict[data_frequency])
    return ds_column, y, data_frequency, test_set_size, exog_variables
elif menu_name == 'force_transformations':
    st.sidebar.markdown('### Force data transformation (optional)')
    transformation_techniques_list = ['Choose the best one', 'No transformation', 'First Difference',
                                      'Log transformation', 'Seasonal Difference', 'Log First Difference',
                                      'Log Difference + Seasonal Difference', 'Custom Difference']
    transformation_techniques = st.sidebar.selectbox('Transformation technique', transformation_techniques_list, 0)
    return transformation_techniques
elif menu_name == 'terms':
    st.sidebar.markdown('### Model parameters')
    st.sidebar.text('Terms for (p, d, q)x(P, D, Q)s')
    p = st.sidebar.slider('p (AR)', 0, 30, min([terms[0], 30]))
    d = st.sidebar.slider('d (I)', 0, 3, min([terms[1], 3]))
    q = st.sidebar.slider('q (MA)', 0, 30, min([terms[2], 30]))
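    # Hedged continuation of the 'terms' branch: the seasonal (P, D, Q)s sliders
    # and the return value are assumptions modeled on the non-seasonal sliders
    # above, including the assumed layout of the terms sequence.
    P = st.sidebar.slider('P (Seasonal AR)', 0, 30, min([terms[3], 30]))
    D = st.sidebar.slider('D (Seasonal I)', 0, 3, min([terms[4], 3]))
    Q = st.sidebar.slider('Q (Seasonal MA)', 0, 30, min([terms[5], 30]))
    s = st.sidebar.slider('s (Seasonal period)', 1, 52, min([terms[6], 52]))
    return p, d, q, P, D, Q, s

# seasonality_dict is used above but never defined in this fragment. A hypothetical
# definition, with defaults chosen to sit inside the validation slider's 3-30 range.
seasonality_dict = {'Hourly': 24, 'Daily': 30, 'Monthly': 12, 'Quarterly': 4, 'Yearly': 5}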
def main():
    '''Set up the main() function. Includes sidebar navigation and respective routing.'''
    st.sidebar.title("Explore")
    app_mode = st.sidebar.selectbox("Choose an Action", [
        "About",
        "Choose an Emotion",
        "Choose an Artist",
        "Classify a Song",
        "Emotional Spectrum",
        "Show Source Code"
    ])
    # Clear tmp files.
    clear_tmp()
    # Navigation.
    if app_mode == "About": show_about()
    elif app_mode == "Choose an Emotion": explore_classified()
    elif app_mode == "Choose an Artist": explore_artists()
    elif app_mode == "Classify a Song": classify_song()
def write():
    """Writes the page in gallery.py."""
    st.sidebar.title("Interactive spaCy visualizer")
    st.sidebar.markdown(
        """
Process text with [spaCy](https://spacy.io) models and visualize named entities,
dependencies and more. Uses spaCy's built-in
[displaCy](http://spacy.io/usage/visualizers) visualizer under the hood.
"""
    )
    st.write("Author: [Ines Montani](https://gist.github.com/ines)")
    st.write(
        "Source: [Github](https://gist.github.com/ines/b320cb8441b590eedf19137599ce6685)"
    )
    spacy_model = st.sidebar.selectbox("Model name", SPACY_MODEL_NAMES)
    model_load_state = st.info(f"Loading model '{spacy_model}'...")
    nlp = load_model(spacy_model)
    model_load_state.empty()
    text = st.text_area("Text to analyze", DEFAULT_TEXT)
    doc = process_text(spacy_model, text)
    if "parser" in nlp.pipe_names:
        st.header("Dependency Parse & Part-of-speech tags")
        st.sidebar.header("Dependency Parse")
        split_sents = st.sidebar.checkbox("Split sentences", value=True)
        collapse_punct = st.sidebar.checkbox("Collapse punctuation", value=True)
        collapse_phrases = st.sidebar.checkbox("Collapse phrases")
        compact = st.sidebar.checkbox("Compact mode")
        options = {
            "collapse_punct": collapse_punct,
            "collapse_phrases": collapse_phrases,
            "compact": compact,
        }
df_binary = pd.DataFrame({"threshold": ["No", "Yes"],
                          "fill": ["No", "Yes"],
                          "legend": ["Yes", "No"]})
threshold = False
fill = False
legend = True
number_threshold = 100
if st.sidebar.checkbox("Show settings"):
    # Threshold visualization.
    option_threshold = st.sidebar.selectbox("Threshold", list(df_binary["threshold"]), index=0)
    threshold = option_threshold == "Yes"
    # Fill visualization.
    if not other_graph:
        option_fill = st.sidebar.selectbox("Fill", list(df_binary["fill"]), index=0)
        fill = option_fill == "Yes"
    else:
        fill = None
    # Legend visualization.
    option_legend = st.sidebar.selectbox("Legend", list(df_binary["legend"]), index=0)
    legend = option_legend == "Yes"
    number_threshold = st.sidebar.slider("Number of thresholds:", min_value=0,
                                         max_value=100, value=100)
return threshold, fill, legend, number_threshold
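For context, a hedged sketch of how the returned settings could drive a plot, assuming the block above lives in a helper (here hypothetically called `sidebar_settings`) inside a seaborn-based distribution app; the data and plotting details are placeholders, not the original app's:

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

threshold, fill, legend, number_threshold = sidebar_settings()  # hypothetical wrapper
values = np.random.default_rng(0).normal(size=500)              # placeholder data

fig, ax = plt.subplots()
sns.kdeplot(values, fill=bool(fill), legend=legend, ax=ax)
if threshold:
    # Draw faint vertical lines at evenly spaced candidate thresholds.
    for t in np.linspace(values.min(), values.max(), number_threshold):
        ax.axvline(t, alpha=0.05)
st.pyplot(fig)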
def main():
    st.title("Medical Language Model Learner")
    st.sidebar.markdown("""
Author: [Georgi Tancev](https://github.com/gtancev)
[Original Code](https://github.com/gtancev/MLML/blob/master/nlp_app.py),
[Article](https://towardsdatascience.com/mining-and-classifying-medical-text-documents-1876462f73bc),
[Data](https://www.kaggle.com/tboyle10/medicaltranscriptions/)
""")
    st.sidebar.header("Sample Selection.")
    filename = st.sidebar.selectbox("Choose a file.", ("None", "mtsamples"))
    st.header("Introduction")
    st.markdown("""
This application guides you through the development of **a language model
that classifies clinical documents** according to their medical speciality.
It is based on a **term frequency-inverse document frequency (tf-idf)** approach. Tf-idf is a
numerical statistic that is intended to reflect how important a word is to a document
in a collection or corpus. It is often used as a weighting factor in
information retrieval, text mining, and user modeling.
The tf-idf value increases proportionally with the number of times a word appears in a document
and is offset by the number of documents in the corpus that contain the word, which helps
adjust for the fact that some words appear more frequently in general.
Tf-idf is one of the most popular term-weighting schemes today; 83% of text-based recommender