scattertext / app.py
Oliviayc's picture
try to debug dynamically determined column name
db25a2e
import scattertext as sct
import spacy
import pandas as pd
import en_core_web_md
import streamlit as st
# Load the spaCy English pipeline exactly once. The original code loaded the
# model twice (en_core_web_md.load() followed by spacy.load(...)) and threw the
# first instance away; only the final binding of `nlp` was ever used.
nlp = spacy.load("en_core_web_md")

# Page header shown above the file uploader.
st.title("Scattertext Analysis")
st.header("Put your file here... ")
def find_column_name(df, column_hint):
    """
    Return the label of the column in ``df`` that matches ``column_hint``.

    Matching ignores letter case and *all* whitespace (spaces, tabs, newlines),
    so ``'Source Title'`` matches ``' source\\ttitle'`` as well as
    ``'SourceTitle'``. Non-string column labels are compared through their
    ``str()`` form instead of raising.

    Args:
        df: Object exposing a ``columns`` iterable (e.g. a pandas DataFrame).
        column_hint: Human-readable name of the wanted column.

    Returns:
        The first matching column label, exactly as stored in ``df.columns``.

    Raises:
        ValueError: If no column matches the hint.
    """
    def _normalize(label):
        # str.split() without arguments splits on every whitespace run, so
        # joining the pieces removes tabs/newlines as well as plain spaces.
        return "".join(str(label).split()).casefold()

    wanted = _normalize(column_hint)
    for col in df.columns:
        if _normalize(col) == wanted:
            return col
    # Report the hint as the caller wrote it, not its normalized form.
    raise ValueError(f"Column matching '{column_hint}' not found.")
def compatison1(selected_column):
    """
    Render the "Sources" comparison view.

    Lets the user pick two values of the source-title column, and on button
    press builds a scattertext corpus contrasting the documents of the first
    source against those of the second, embeds the resulting HTML in the page
    and offers it for download.

    Reads the module-level names ``df``, ``source_title_col`` and ``nlp``.

    Args:
        selected_column: Name of the text column in ``df`` to analyse.

    Returns:
        None. All output is rendered through Streamlit.
    """
    # Missing source titles cannot be selected meaningfully, so hide them.
    source_options = df[source_title_col].dropna().unique()

    left_col, right_col = st.columns(2)
    with left_col:
        first_source = st.selectbox("Choose First Source", source_options,
                                    key='first_source_select')
    with right_col:
        second_source = st.selectbox("Choose Second Source", source_options,
                                     key='second_source_select')

    if not st.button("Generate the Scattertext Plot"):
        return

    # Comparing a source with itself would duplicate every row and leave the
    # corpus with a single category.
    if first_source == second_source:
        st.warning("Please choose two different sources to compare.")
        return

    first_data = df[df[source_title_col] == first_source]
    second_data = df[df[source_title_col] == second_source]
    filtered_data = pd.concat([first_data, second_data])
    # Rows without text in the analysed column cannot be parsed by the model.
    filtered_data = filtered_data.dropna(subset=[selected_column])

    if filtered_data[source_title_col].nunique() < 2:
        st.warning("Both sources need at least one row with text in the selected column.")
        return

    corpus = sct.CorpusFromPandas(
        filtered_data,
        category_col=source_title_col,
        text_col=selected_column,
        nlp=nlp,
    ).build()

    # NOTE(review): the whole DataFrame is passed as `metadata`, as in the
    # original code; confirm against the scattertext docs whether a single
    # per-document column is expected here instead.
    html = sct.produce_scattertext_explorer(
        corpus,
        category=first_source,
        category_name=first_source,
        not_category_name=second_source,
        width_in_pixels=900,
        minimum_term_frequency=0,
        metadata=filtered_data,
    )
    st.components.v1.html(html, width=1000, height=600)
    # Give the download a proper name and MIME type so it opens in a browser.
    st.download_button('Download the plot', html,
                       file_name='scattertext_plot.html', mime='text/html')
# type_of_comparison 2
def comparison2(selected_column):
    """
    Render the "Years" comparison view.

    Lets the user pick two year ranges with sliders, labels the matching rows
    'First range' / 'Second range', and on button press builds a scattertext
    plot contrasting the two groups, embeds it and offers it for download.

    Reads the module-level names ``df``, ``year_col`` and ``nlp``. The
    uploaded DataFrame itself is left untouched; all cleaning happens on a
    copy.

    Args:
        selected_column: Name of the text column in ``df`` to analyse.

    Returns:
        None. All output is rendered through Streamlit.
    """
    # Work on a copy so the shared module-level DataFrame is not mutated.
    data = df.copy()
    data[year_col] = pd.to_numeric(data[year_col], errors='coerce')
    data = data.dropna(subset=[year_col])
    if data.empty:
        # min()/max() of an empty column is NaN and int(NaN) would raise.
        st.warning("No rows with a valid year were found.")
        return
    data[year_col] = data[year_col].astype(int)
    min_year = int(data[year_col].min())
    max_year = int(data[year_col].max())

    # With a single distinct year there is nothing to compare, and a slider
    # whose minimum and maximum coincide is not usable.
    if min_year == max_year:
        st.warning("At least two distinct years are needed for a year comparison.")
        return

    left_col, right_col = st.columns(2)
    with left_col:
        first_range = st.slider("First range", min_value=min_year, max_value=max_year,
                                step=1, value=(min_year, max_year))
    with right_col:
        second_range = st.slider("Second range", min_value=min_year, max_value=max_year,
                                 step=1, value=(min_year, max_year))

    if not st.button("Generate the Scattertext Plot"):
        return

    years = data[year_col]
    first_range_df = data[years.between(first_range[0], first_range[1])].copy()
    first_range_df['Topic Range'] = 'First range'
    second_range_df = data[years.between(second_range[0], second_range[1])].copy()
    second_range_df['Topic Range'] = 'Second range'
    filtered_df = pd.concat([first_range_df, second_range_df])
    # Rows without text in the analysed column cannot be parsed by the model.
    filtered_df = filtered_df.dropna(subset=[selected_column])

    if filtered_df['Topic Range'].nunique() < 2:
        st.warning("Each year range must contain at least one row with text in the selected column.")
        return

    corpus = sct.CorpusFromPandas(
        filtered_df,
        category_col="Topic Range",
        text_col=selected_column,
        nlp=nlp,
    ).build()

    # NOTE(review): the whole DataFrame is passed as `metadata`, as in the
    # original code; confirm against the scattertext docs whether a single
    # per-document column is expected here instead.
    html = sct.produce_scattertext_explorer(
        corpus,
        category='First range',
        category_name='First range',
        not_category_name='Second range',
        width_in_pixels=900,
        minimum_term_frequency=0,
        metadata=filtered_df,
    )
    st.components.v1.html(html, width=1000, height=600)
    # Give the download a proper name and MIME type so it opens in a browser.
    st.download_button('Download the plot', html,
                       file_name='scattertext_plot.html', mime='text/html')
if __name__ == '__main__':
    # Script entry point: upload a file, resolve the column names the
    # comparison views read as module-level globals (df, abstract_col,
    # title_col, source_title_col, year_col), then dispatch to the chosen view.
    uploaded_file = st.file_uploader("Choose a file", type=["csv", "txt"])
    if uploaded_file is not None:
        # Compare the extension case-insensitively so "DATA.CSV" is accepted.
        file_name = uploaded_file.name.lower()
        if file_name.endswith(".csv"):
            df = pd.read_csv(uploaded_file)
            try:
                # Column headers are matched loosely (case/whitespace-insensitive).
                source_title_col = find_column_name(df, 'Source Title')
                abstract_col = find_column_name(df, 'Abstract')
                title_col = find_column_name(df, 'Title')
                year_col = find_column_name(df, 'Year')
            except ValueError as err:
                # Show a readable message instead of a raw traceback.
                st.error(str(err))
                st.stop()
        elif file_name.endswith(".txt"):
            # Text exports are assumed to be tab-separated.
            df = pd.read_table(uploaded_file, sep='\t')
            # NOTE(review): these two-letter headers look like bibliographic
            # field tags (abstract, title, source, publication year) - confirm
            # against the export format this app is meant to support.
            abstract_col = 'AB'
            title_col = 'TI'
            source_title_col = 'SO'
            year_col = 'PY'
            missing_cols = [
                name
                for name in (abstract_col, title_col, source_title_col, year_col)
                if name not in df.columns
            ]
            if missing_cols:
                st.error(f"Missing expected column(s): {', '.join(missing_cols)}")
                st.stop()
        else:
            st.error("Unsupported file format.")
            st.stop()

        column_choices = (abstract_col, title_col)
        row1_col1, row1_col2 = st.columns(2)
        with row1_col1:
            choice = st.selectbox("Choose column to analyze", column_choices)
        with row1_col2:
            comparison_options = ('Sources', 'Years')
            type_of_comparison = st.selectbox("Type of comparison", comparison_options)

        # The two comparison modes are mutually exclusive.
        if type_of_comparison == 'Sources':
            compatison1(choice)
        elif type_of_comparison == 'Years':
            comparison2(choice)