# scattertext / scattertext_draft.py
# Oliviayc's picture
# Rename scattertext_funtion.py to scattertext_draft.py
# 32dc69e verified
import streamlit as st
import scattertext as stx
import spacy
import pandas as pd
import en_core_web_md
# load language model:
@st.cache_resource
def _load_nlp():
    """Load the spaCy ``en_core_web_md`` pipeline a single time.

    The model used to be loaded twice in a row (once through the
    ``en_core_web_md`` package and once through ``spacy.load``), and the
    load was repeated whenever Streamlit re-ran the script. Caching the
    loaded pipeline as a resource keeps one shared instance instead.
    """
    return spacy.load("en_core_web_md")


nlp = _load_nlp()

# config
st.title("Scattertext Analysis")
# TODO: update other web settings

# upload file
uploaded_file = st.file_uploader("Upload your text document", type=["csv", "txt"])
# Abstract column names used by the supported database exports.
_CSV_ABSTRACT_COLUMN = 'Abstract'  # scopus & lens (CSV export)
_TXT_ABSTRACT_COLUMN = 'AB'  # web of science (tab-delimited export)


def _read_uploaded_table(file):
    """Parse the uploaded file into a DataFrame.

    ``.csv`` files are read as comma-separated values and ``.txt`` files
    as tab-separated values.
    TODO doc: assume contents of .txt files are separated by tabs.

    Returns ``None`` when the file extension is not supported.
    """
    # Rewind first: the upload is a file-like buffer and an earlier read
    # may have left the cursor at the end.
    file.seek(0)
    file_name = file.name.lower()
    if file_name.endswith(".csv"):
        return pd.read_csv(file)
    if file_name.endswith(".txt"):
        return pd.read_table(file, sep='\t')
    return None


def _render_explorer(df, category_col, text_col, key_prefix):
    """Collect plot parameters, build the corpus, and embed the explorer.

    Parameters:
        df: table holding the documents.
        category_col: name of the column holding each document's category.
        text_col: name of the column holding each document's text.
        key_prefix: prefix that keeps widget keys unique per mode.
    """
    # Scattertext needs a category and a text for every document, so rows
    # missing either are removed before the corpus is built.
    data = df.dropna(subset=[category_col, text_col]).copy()
    # Cast to str so the value picked below matches the corpus categories
    # even when the column holds numbers (e.g. publication years).
    data[category_col] = data[category_col].astype(str)
    data[text_col] = data[text_col].astype(str)

    categories = sorted(data[category_col].unique())
    if len(categories) < 2:
        st.warning("The category column needs at least two distinct values.")
        return

    # `category` must be a value found in the category column, not the
    # name of the column itself.
    category = st.selectbox(
        "Choose the category value to compare against the rest",
        categories,
        key=f"{key_prefix}_category",
    )
    # customize parameters
    customize_category_name = st.text_input(
        'Enter the category name', key=f"{key_prefix}_category_name"
    )
    customize_non_category_name = st.text_input(
        'Enter the non-category name', key=f"{key_prefix}_non_category_name"
    )

    # Building the corpus parses every document, so wait for an explicit
    # request instead of recomputing on each widget change.
    if not st.button("Generate plot", key=f"{key_prefix}_generate"):
        return

    # convert to scattertext corpus (column arguments are column NAMES)
    corpus = stx.CorpusFromPandas(
        data,
        category_col=category_col,
        text_col=text_col,
        nlp=nlp,
    ).build()

    # create visualization
    # Blank names are passed as None so scattertext applies its own labels.
    # `metadata` is left out: it expects one entry per document, which the
    # whole DataFrame is not.
    html = stx.produce_scattertext_explorer(
        corpus,
        category=category,
        category_name=customize_category_name or None,
        not_category_name=customize_non_category_name or None,
        width_in_pixels=1000,
        minimum_term_frequency=0,
    )
    # An explicit height is required; the default iframe is far too short
    # to show the plot.
    st.components.v1.html(html, height=800, scrolling=True)


# read data
if uploaded_file is not None:
    df = _read_uploaded_table(uploaded_file)
    if df is None:
        st.error("Unsupported file format.")
    else:
        # choose function
        # The select box (not a button) drives the mode, so the choice
        # survives the script reruns triggered by the widgets below it.
        function_choice = st.selectbox(
            'Choose file source',
            ['Choose...', 'Customized', 'Download from Online Databases'],
        )
        columns = list(df.columns)

        # function1: generate plot from customized file
        if function_choice == 'Customized':
            if len(columns) < 2:
                st.error("The file needs at least a text column and a category column.")
            else:
                chosen_column = st.selectbox("Choose text column for analysis", columns)
                category_column = st.selectbox(
                    "Choose category column",
                    [name for name in columns if name != chosen_column],
                )
                _render_explorer(df, category_column, chosen_column, "custom")

        # function2: generate plot from databases
        # TODO doc: Explain: analyze abstract.
        elif function_choice == 'Download from Online Databases':
            if uploaded_file.name.lower().endswith(".csv"):
                abstract_column = _CSV_ABSTRACT_COLUMN
            else:
                abstract_column = _TXT_ABSTRACT_COLUMN

            if abstract_column not in columns:
                st.error(f"Column '{abstract_column}' was not found in the uploaded file.")
            else:
                # The abstract is the analysed text, so it is not offered
                # as a category column.
                chosen_column = st.selectbox(
                    "Choose text column for analysis BESIDES ABSTRACT",
                    [name for name in columns if name != abstract_column],
                )
                if chosen_column is None:
                    st.error("The file has no column besides the abstract.")
                else:
                    _render_explorer(df, chosen_column, abstract_column, "database")
else:
    st.write("Please upload a CSV or TXT file to begin.")