import scattertext as sct
import spacy
import pandas as pd
import en_core_web_md
import streamlit as st


# Load the spaCy language model exactly once. `en_core_web_md.load()` and
# `spacy.load("en_core_web_md")` both build the same pipeline, so calling both
# only doubled the model start-up cost; the `spacy.load` result is the one the
# rest of the script has always used.
nlp = spacy.load("en_core_web_md")

# Page heading shown above the file-upload widget.
st.title("Scattertext Analysis")
st.header("Put your file here... ")

def find_column_name(df: pd.DataFrame, column_hint: str):
    """
    Return the column of ``df`` whose name matches ``column_hint``.

    Names are compared case-insensitively after removing *all* whitespace
    (spaces, tabs, newlines), so ``'Source Title'`` matches ``'source title'``
    as well as ``' SourceTitle '``. Column labels are passed through ``str()``
    before comparison, so non-string labels cannot raise during the search.

    Args:
        df: DataFrame whose ``columns`` are searched.
        column_hint: Human-readable name of the wanted column.

    Returns:
        The first matching column label, exactly as it appears in ``df``.

    Raises:
        ValueError: If no column matches. The message quotes the hint as the
            caller supplied it (not its normalised form).
    """
    def _normalize(label):
        # str.split() without arguments splits on any run of whitespace,
        # so joining the pieces removes every whitespace character.
        return ''.join(str(label).split()).lower()

    wanted = _normalize(column_hint)
    for col in df.columns:
        if _normalize(col) == wanted:
            return col
    raise ValueError(f"Column matching '{column_hint}' not found.")

def compatison1(selected_column):
    """
    Render the "Sources" comparison: contrast the texts of two sources.

    Reads the module-level ``df``, ``source_title_col`` and ``nlp``. Shows two
    select boxes for picking the sources and, once the button is pressed,
    builds a scattertext corpus from the rows of those two sources, embeds the
    resulting HTML explorer in the page and offers it for download.

    Args:
        selected_column: Name of the ``df`` column holding the text to analyze.

    Returns:
        None. All output is rendered through Streamlit.
    """
    # Missing source titles are excluded from the choices: the equality filter
    # below can never match NaN, so such an option would select no rows.
    sources = df[source_title_col].dropna().unique()

    row2_col1, row2_col2 = st.columns(2)
    with row2_col1:
        first_source = st.selectbox("Choose First Source", sources, key='first_source_select')
    with row2_col2:
        # Start on the second entry so the two boxes do not both default to
        # the same source.
        second_source = st.selectbox("Choose Second Source", sources,
                                     index=1 if len(sources) > 1 else 0,
                                     key='second_source_select')

    if not st.button("Generate the Scattertext Plot"):
        return

    # Comparing a source with itself would yield a single-category corpus in
    # which every document appears twice.
    if first_source == second_source:
        st.warning("Please choose two different sources to compare.")
        return

    # filter data: rows of the first source followed by rows of the second
    first_data = df[df[source_title_col] == first_source].copy()
    second_data = df[df[source_title_col] == second_source].copy()
    filtered_data = pd.concat([first_data, second_data])

    # make plot
    corpus = sct.CorpusFromPandas(
        filtered_data,
        category_col=source_title_col,
        text_col=selected_column,
        nlp=nlp,
    ).build()
    # generate HTML visualization
    # NOTE(review): the whole DataFrame is passed as `metadata`; confirm that
    # scattertext expects this rather than one metadata entry per document.
    html = sct.produce_scattertext_explorer(corpus,
                                            category=first_source,
                                            category_name=first_source,
                                            not_category_name=second_source,
                                            width_in_pixels=900,
                                            minimum_term_frequency=0,
                                            metadata=filtered_data)
    st.components.v1.html(html, width=1000, height=600)
    # Name the file and declare its type so the browser saves it as HTML.
    st.download_button('Download the plot', html,
                       file_name='scattertext_plot.html', mime='text/html')


# Comparison type 2: contrast two publication-year ranges.
def comparison2(selected_column):
    """
    Render the "Years" comparison: contrast the texts of two year ranges.

    Reads the module-level ``df``, ``year_col`` and ``nlp``. Rows whose year is
    not numeric are ignored. Two range sliders select the year intervals; once
    the button is pressed, the rows of each interval are labelled
    ``'First range'`` / ``'Second range'`` in a ``'Topic Range'`` column, a
    scattertext corpus is built from them, and the HTML explorer is embedded
    in the page and offered for download.

    The module-level ``df`` is not modified; all cleaning happens on a copy.

    Args:
        selected_column: Name of the ``df`` column holding the text to analyze.

    Returns:
        None. All output is rendered through Streamlit.
    """
    # Coerce years to numbers; unparsable values become NaN and are dropped.
    years = pd.to_numeric(df[year_col], errors='coerce')
    has_year = years.notna()
    data = df.loc[has_year].copy()
    # Assign positionally (to_numpy) so the result does not depend on index
    # alignment.
    data[year_col] = years[has_year].astype(int).to_numpy()

    # Without at least one valid year, min()/max() would be NaN and int()
    # would raise.
    if data.empty:
        st.warning("No rows with a valid year were found in the uploaded file.")
        return

    min_year = int(data[year_col].min())
    max_year = int(data[year_col].max())
    # layout row2
    row2_col1, row2_col2 = st.columns(2)
    with row2_col1:
        first_range = st.slider("First range", min_value=min_year, max_value=max_year,
                                step=1, value=(min_year, max_year))
    with row2_col2:
        second_range = st.slider("Second range", min_value=min_year, max_value=max_year,
                                 step=1, value=(min_year, max_year))

    def _rows_in_range(bounds, label):
        # Select rows whose year lies in the inclusive interval and tag them.
        low, high = bounds
        subset = data[(data[year_col] >= low) & (data[year_col] <= high)].copy()
        subset['Topic Range'] = label
        return subset

    # filter data
    first_range_filter_df = _rows_in_range(first_range, 'First range')
    second_range_filter_df = _rows_in_range(second_range, 'Second range')

    # Overlapping ranges put the same row in both frames; ignore_index gives
    # every row of the combined frame a unique label.
    filtered_df = pd.concat([first_range_filter_df, second_range_filter_df],
                            ignore_index=True)

    if not st.button("Generate the Scattertext Plot"):
        return

    if first_range_filter_df.empty or second_range_filter_df.empty:
        st.warning("Each range must contain at least one document.")
        return

    # make plot
    corpus = sct.CorpusFromPandas(
        filtered_df,
        category_col="Topic Range",
        text_col=selected_column,
        nlp=nlp,
    ).build()
    # generate HTML visualization
    # NOTE(review): the whole DataFrame is passed as `metadata`; confirm that
    # scattertext expects this rather than one metadata entry per document.
    html = sct.produce_scattertext_explorer(corpus,
                                            category='First range',
                                            category_name='First range',
                                            not_category_name='Second range',
                                            width_in_pixels=900,
                                            minimum_term_frequency=0,
                                            metadata=filtered_df)
    st.components.v1.html(html, width=1000, height=600)
    # Name the file and declare its type so the browser saves it as HTML.
    st.download_button('Download the plot', html,
                       file_name='scattertext_plot.html', mime='text/html')


# Script entry point: upload a file, resolve the column names the comparison
# functions read as module-level globals (df, source_title_col, year_col),
# then dispatch to the selected comparison.
if __name__ == '__main__':
    uploaded_file = st.file_uploader("Choose a file", type=["csv", "txt"])
    if uploaded_file is not None:
        # determine file type; compare case-insensitively so "EXPORT.CSV"
        # is handled like "export.csv"
        file_name = uploaded_file.name.lower()
        if file_name.endswith(".csv"):
            df = pd.read_csv(uploaded_file)
            try:
                source_title_col = find_column_name(df, 'Source Title')
                abstract_col = find_column_name(df, 'Abstract')
                title_col = find_column_name(df, 'Title')
                year_col = find_column_name(df, 'Year')
            except ValueError as err:
                # Report a missing column in the page instead of a traceback.
                st.error(str(err))
                st.stop()
        elif file_name.endswith(".txt"):
            # Assumes the contents are separated by tabs.
            df = pd.read_table(uploaded_file, sep='\t')
            abstract_col = 'AB'
            title_col = 'TI'
            source_title_col = 'SO'
            year_col = 'PY'
            # These names are fixed, so verify they exist before they are
            # used to index the DataFrame.
            missing = [col for col in (abstract_col, title_col, source_title_col, year_col)
                       if col not in df.columns]
            if missing:
                st.error(f"Missing expected column(s): {', '.join(missing)}")
                st.stop()
        else:
            st.error("Unsupported file format.")
            st.stop()

        column_choices = (abstract_col, title_col)

        row1_col1, row1_col2 = st.columns(2)
        with row1_col1:
            choice = st.selectbox("Choose column to analyze", column_choices)
        with row1_col2:
            comparison_options = ('Sources', 'Years')
            type_of_comparison = st.selectbox("Type of comparison", comparison_options)
        # The two options are mutually exclusive.
        if type_of_comparison == 'Sources':
            compatison1(choice)
        elif type_of_comparison == 'Years':
            comparison2(choice)