import scattertext as sct
import spacy
import pandas as pd
import en_core_web_md
import streamlit as st

# Load the language model once. The original script loaded it twice
# (en_core_web_md.load() followed by spacy.load(...)); only the second
# binding was ever used, so the first load was pure overhead.
nlp = spacy.load("en_core_web_md")

# Page header shown above the file uploader.
st.title("Scattertext Analysis")
st.header("Put your file here... ")


def _normalize_label(label):
    """Return *label* as a lowercase string with all whitespace removed."""
    return "".join(str(label).lower().split())


def find_column_name(df, column_hint):
    """Return the column of *df* whose name matches *column_hint*.

    The comparison ignores letter case and whitespace. Column labels that
    are not strings are compared through their ``str()`` form.

    Args:
        df: DataFrame whose columns are searched.
        column_hint: Human-readable column name, e.g. ``'Source Title'``.

    Returns:
        The first matching column label, exactly as it appears in *df*.

    Raises:
        ValueError: If no column matches the hint.
    """
    wanted = _normalize_label(column_hint)
    for col in df.columns:
        if _normalize_label(col) == wanted:
            return col
    # Report the hint as the caller wrote it, not its normalized form.
    raise ValueError(f"Column matching '{column_hint}' not found.")


def _render_scattertext(data, category_col, text_col, category,
                        category_name, not_category_name):
    """Build a Scattertext explorer from *data* and show it in the page.

    Rows without text in *text_col* are dropped first, because the spaCy
    pipeline cannot parse missing values. If the remaining rows do not
    contain *category* plus at least one other category, a warning is shown
    and nothing is plotted.
    """
    data = data.dropna(subset=[text_col])
    present = set(data[category_col])
    if category not in present or len(present) < 2:
        st.warning(
            "Both groups need at least one document with text in the "
            "selected column to build the plot."
        )
        return

    corpus = sct.CorpusFromPandas(
        data,
        category_col=category_col,
        text_col=text_col,
        nlp=nlp,
    ).build()

    # Generate the HTML visualization.
    html = sct.produce_scattertext_explorer(
        corpus,
        category=category,
        category_name=category_name,
        not_category_name=not_category_name,
        width_in_pixels=900,
        minimum_term_frequency=0,
        metadata=data,
    )
    st.components.v1.html(html, width=1000, height=600)
    st.download_button('Download the plot', html)


def comparison1(selected_column):
    """Compare the vocabulary of two sources chosen by the user.

    Reads the module-level ``df`` and ``source_title_col`` that the script
    entry point sets after a file has been uploaded.

    Args:
        selected_column: Name of the text column to analyze.
    """
    sources = df[source_title_col].unique()
    left_col, right_col = st.columns(2)
    with left_col:
        first_source = st.selectbox("Choose First Source", sources,
                                    key='first_source_select')
    with right_col:
        second_source = st.selectbox("Choose Second Source", sources,
                                     key='second_source_select')

    # Keep only the documents that belong to the two chosen sources.
    first_data = df[df[source_title_col] == first_source].copy()
    second_data = df[df[source_title_col] == second_source].copy()
    filtered_data = pd.concat([first_data, second_data])

    if st.button("Generate the Scattertext Plot"):
        if first_source == second_source:
            # A source compared with itself yields a single category.
            st.warning("Please choose two different sources to compare.")
            return
        _render_scattertext(
            filtered_data,
            category_col=source_title_col,
            text_col=selected_column,
            category=first_source,
            category_name=first_source,
            not_category_name=second_source,
        )
    return


# Backward-compatible alias: the function was originally published under
# this misspelled name.
compatison1 = comparison1


def comparison2(selected_column):
    """Compare the vocabulary of two publication-year ranges.

    Reads the module-level ``df`` and ``year_col``. As in the original
    implementation, ``df`` is normalized in place: the year column is
    coerced to numbers, rows without a usable year are dropped, and the
    column is converted to ``int``.

    Args:
        selected_column: Name of the text column to analyze.
    """
    df[year_col] = pd.to_numeric(df[year_col], errors='coerce')
    df.dropna(subset=[year_col], inplace=True)
    if df.empty:
        # min()/max() would be NaN below and int(NaN) raises.
        st.error("No valid publication years were found in the file.")
        return
    df[year_col] = df[year_col].astype(int)

    min_year = int(df[year_col].min())
    max_year = int(df[year_col].max())

    left_col, right_col = st.columns(2)
    with left_col:
        first_range = st.slider("First range", min_value=min_year,
                                max_value=max_year, step=1,
                                value=(min_year, max_year))
    with right_col:
        second_range = st.slider("Second range", min_value=min_year,
                                 max_value=max_year, step=1,
                                 value=(min_year, max_year))

    # Label every document with the range it falls in. A document inside
    # both ranges appears once per range.
    in_first = (df[year_col] >= first_range[0]) & (df[year_col] <= first_range[1])
    first_range_filter_df = df[in_first].copy()
    first_range_filter_df['Topic Range'] = 'First range'

    in_second = (df[year_col] >= second_range[0]) & (df[year_col] <= second_range[1])
    second_range_filter_df = df[in_second].copy()
    second_range_filter_df['Topic Range'] = 'Second range'

    filtered_df = pd.concat([first_range_filter_df, second_range_filter_df])

    if st.button("Generate the Scattertext Plot"):
        _render_scattertext(
            filtered_df,
            category_col="Topic Range",
            text_col=selected_column,
            category='First range',
            category_name='First range',
            not_category_name='Second range',
        )
    return


if __name__ == '__main__':
    # NOTE: df, source_title_col, abstract_col, title_col and year_col are
    # deliberately assigned at module level here; the comparison functions
    # above read df, source_title_col and year_col as globals.
    uploaded_file = st.file_uploader("Choose a file", type=["csv", "txt"])

    if uploaded_file is not None:
        # Determine the file type; ignore case so "DATA.CSV" is accepted.
        file_name = uploaded_file.name.lower()

        if file_name.endswith(".csv"):
            df = pd.read_csv(uploaded_file)
            try:
                source_title_col = find_column_name(df, 'Source Title')
                abstract_col = find_column_name(df, 'Abstract')
                title_col = find_column_name(df, 'Title')
                year_col = find_column_name(df, 'Year')
            except ValueError as err:
                # Show a readable message instead of a traceback.
                st.error(str(err))
                st.stop()

        elif file_name.endswith(".txt"):
            # Contents are assumed to be separated by tabs.
            df = pd.read_table(uploaded_file, sep='\t')
            # Two-letter field codes expected in the .txt export
            # (presumably Web of Science field tags -- confirm).
            abstract_col = 'AB'
            title_col = 'TI'
            source_title_col = 'SO'
            year_col = 'PY'
            required = (abstract_col, title_col, source_title_col, year_col)
            missing = [col for col in required if col not in df.columns]
            if missing:
                st.error(
                    "Required column(s) not found: " + ", ".join(missing)
                )
                st.stop()

        else:
            st.error("Unsupported file format.")
            st.stop()

        column_choices = (abstract_col, title_col)
        row1_col1, row1_col2 = st.columns(2)
        with row1_col1:
            choice = st.selectbox("Choose column to analyze", column_choices)
        with row1_col2:
            comparison_options = ('Sources', 'Years')
            type_of_comparison = st.selectbox("Type of comparison",
                                              comparison_options)

        if type_of_comparison == 'Sources':
            comparison1(choice)
        elif type_of_comparison == 'Years':
            comparison2(choice)