import scattertext as st
import spacy
import pandas as pd
import en_core_web_md


# Load the medium English spaCy pipeline once at import time.
# (The model was previously loaded twice; the first result was discarded.)
nlp = spacy.load("en_core_web_md")

def customized_file(file_path: str, column_category: str, column_text: str, subcategory1: str, subcategory2: str) -> str:
    '''
    Generate a scattertext explorer plot from a user-selected file and save it as HTML.

    The file is read into a DataFrame, reduced to the rows whose category is one of
    the two requested subcategories, converted to a scattertext corpus and rendered.
    The HTML is written to the current working directory.

    :param file_path: path of the file to analyse; ".csv" files are read as CSV and
        ".txt" files as tab-separated tables (the extension match is case-insensitive)
    :param column_category: header of the column holding the subcategories
    :param column_text: header of the column holding the text to be plotted
    :param subcategory1: subcategory passed to scattertext as ``category`` / ``category_name``
    :param subcategory2: subcategory passed to scattertext as ``not_category_name``
    :return: the path of the written scattertext HTML file
    :raises ValueError: if the file extension is unsupported, or if no row with text
        matches either subcategory
    '''
    # Load the data; compare the extension case-insensitively so "DATA.CSV" works too.
    lowered_path = file_path.lower()
    if lowered_path.endswith(".csv"):
        df = pd.read_csv(file_path)
    elif lowered_path.endswith(".txt"):
        df = pd.read_table(file_path, sep='\t')  # contents are assumed to be tab-separated
    else:
        raise ValueError("Unsupported file format.")

    # Keep only the rows belonging to the two target subcategories.
    df_filtered = df[df[column_category].isin([subcategory1, subcategory2])]
    # Rows with a missing text cell are NaN (a float), which the spaCy pipeline cannot parse.
    df_filtered = df_filtered.dropna(subset=[column_text])
    if df_filtered.empty:
        raise ValueError(
            "No rows with text match the selected subcategories. "
            "Check the category column and the subcategory values."
        )

    # Convert to a scattertext corpus.
    corpus = st.CorpusFromPandas(
        df_filtered,
        category_col=column_category,
        text_col=column_text,
        nlp=nlp,
    ).build()

    # Create the visualization.
    html = st.produce_scattertext_explorer(
        corpus,
        category=subcategory1,
        category_name=subcategory1,
        not_category_name=subcategory2,
        width_in_pixels=1000,
        minimum_term_frequency=0,
        metadata=df_filtered[column_category],
    )

    # Subcategory values may contain characters that are invalid in file names on
    # some platforms (e.g. ':' in "WOS:0001...") or that act as path separators.
    unsafe_chars = str.maketrans({ch: "_" for ch in '<>:"/\\|?*'})
    safe_name1 = str(subcategory1).translate(unsafe_chars)
    safe_name2 = str(subcategory2).translate(unsafe_chars)
    html_file_path = f"scattertext_{safe_name1}_{safe_name2}.html"
    with open(html_file_path, "w", encoding='utf-8') as f:
        f.write(html)

    return html_file_path

# Sample runs against the bundled data sets (a Lens export and a Web of Science export).
# NOTE(review): these execute at import time; consider moving them under an
# `if __name__ == "__main__":` guard if this module is ever imported elsewhere.
test1 = customized_file(
    file_path='./sample_data/lens.csv',
    column_category='Lens ID',
    column_text='Abstract',
    subcategory1='032-211-407-789-770',
    subcategory2='036-842-302-145-799',
)
test2 = customized_file(
    file_path='./sample_data/wos.txt',
    column_category='UT',
    column_text='AB',
    subcategory1='WOS:000685648800006',
    subcategory2='WOS:000448455800001',
)