import gradio as gr
import os
from huggingface_hub import HfApi, snapshot_download
from apscheduler.schedulers.background import BackgroundScheduler
from datasets import load_dataset
from src.utils import load_all_data
from src.md import ABOUT_TEXT, TOP_TEXT
from src.plt import plot_avg_correlation
import numpy as np

# Shared Hugging Face Hub client; restart_space() below calls it.
api = HfApi()

# Access token read from the COLLAB_TOKEN environment variable (None when the
# variable is unset). It is forwarded to every Hub call made in this file.
COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
# Hub dataset repository that holds the evaluation results downloaded at startup.
evals_repo = "ai2-adapt-dev/HERM-Results"

# Hub dataset repository loaded for the random-sample viewer.
eval_set_repo = "ai2-adapt-dev/rm-benchmark-dev"
# Local directory the results snapshot is downloaded into and later read from.
repo_dir_herm = "./evals/herm/"

def restart_space():
    """Request a restart of the viewer Space on the Hub, authenticating with COLLAB_TOKEN."""
    space_id = "ai2-adapt-dev/rm-benchmark-viewer"
    api.restart_space(
        repo_id=space_id,
        token=COLLAB_TOKEN,
    )

# Download a snapshot of the results dataset into repo_dir_herm at import time;
# the leaderboard tables further down are built from these local files.
print("Pulling evaluation results")
repo = snapshot_download(
    repo_id=evals_repo,
    repo_type="dataset",
    local_dir=repo_dir_herm,
    # `token` is the supported name for the auth argument (the same keyword
    # restart_space() already uses); `use_auth_token` is the deprecated spelling.
    token=COLLAB_TOKEN,
    tqdm_class=None,
    # wait up to 30 seconds for the Hub to answer the metadata (ETag) request
    etag_timeout=30,
)


def avg_over_herm(dataframe):
    """
    Collapse the per-subset scores into one averaged column per section.

    The sections are alpacaeval, mt-bench, llmbar, refusals and hep. Every
    section except "refusals" averages all input columns whose name contains
    the section name; "refusals" averages a fixed list of five columns.
    NaN entries are ignored by the averages (np.nanmean).

    Args:
        dataframe: pandas DataFrame with a "model" column plus the per-subset
            score columns. An existing "average" column is not required; it is
            recomputed from the section averages either way.

    Returns:
        A new DataFrame (the input is left untouched, its index is preserved)
        with the columns "model", "average", "alpacaeval", "mt-bench",
        "llmbar", "refusals" and "hep (code)". All scores are rounded to two
        decimals; "average" is the mean of the rounded section columns.

    Raises:
        KeyError: if "model" or one of the fixed refusal columns is missing.
    """
    sections = ["alpacaeval", "mt-bench", "llmbar", "refusals", "hep"]
    refusal_columns = [
        "refusals-dangerous",
        "refusals-offensive",
        "donotanswer",
        "xstest-should-refuse",
        "xstest-should-respond",
    ]
    # Snapshot the input column names so the section columns created below can
    # never be picked up by a later substring match.
    source_columns = list(dataframe.columns)

    # Build the result as its own frame instead of assigning into a column
    # slice, which is the pattern that triggers pandas' SettingWithCopyWarning.
    summary = dataframe[["model"]].copy()
    for section in sections:
        if section == "refusals":
            section_columns = refusal_columns
        else:
            section_columns = [col for col in source_columns if section in col]
        scores = dataframe[section_columns].to_numpy(dtype=float)
        summary[section] = np.round(np.nanmean(scores, axis=1), 2)

    # The overall score is the mean of the (already rounded) section scores.
    overall = np.round(np.nanmean(summary[sections].to_numpy(dtype=float), axis=1), 2)
    summary.insert(1, "average", overall)

    # Display name for the code section.
    return summary.rename(columns={"hep": "hep (code)"})

def expand_subsets(dataframe):
    """Placeholder for expanding preference-set subsets; not implemented, always returns None."""
    # TODO need to modify data/ script to do this
    return None

# Length-bias category of every eval-set subset, keyed by subset (column) name.
# The values are the strings "True", "Neutral" and "False"; length_bias_check()
# routes them to the "Length Bias", "Neutral" and "Terse Bias" buckets respectively.
length_categories = {
    # alpacaeval
    "alpacaeval-easy": "True",
    "alpacaeval-hard": "True",
    "alpacaeval-length": "Neutral",
    # donotanswer
    "donotanswer": "False",
    # hep (code)
    "hep-cpp": "Neutral",
    "hep-go": "Neutral",
    "hep-java": "Neutral",
    "hep-js": "Neutral",
    "hep-python": "Neutral",
    "hep-rust": "Neutral",
    # llmbar
    "llmbar-adver-GPTInst": "False",
    "llmbar-adver-GPTOut": "Neutral",
    "llmbar-adver-manual": "False",
    "llmbar-adver-neighbor": "False",
    "llmbar-natural": "Neutral",
    # mt-bench
    "mt-bench-easy": "False",
    "mt-bench-hard": "False",
    "mt-bench-med": "Neutral",
    # refusals
    "refusals-dangerous": "False",
    "refusals-offensive": "False",
    # xstest
    "xstest-should-refuse": "False",
    "xstest-should-respond": "True",
}

def length_bias_check(dataframe):
    """
    Regroup the raw herm scores into three length-bias buckets.

    Every column from the third one onward is treated as a subset and looked up
    in length_categories: "True" goes to "Length Bias", "Neutral" to "Neutral"
    and "False" to "Terse Bias". Each bucket column is the NaN-ignoring mean of
    its subsets per row, rounded to two decimals.

    Returns a new dataframe holding only "model" and the three bucket columns;
    no overall average is computed.
    """
    bucket_names = ["Length Bias", "Neutral", "Terse Bias"]
    bucket_for_category = {
        "True": "Length Bias",
        "Neutral": "Neutral",
        "False": "Terse Bias",
    }

    result = dataframe.copy()
    # per-bucket list of score arrays, one array per contributing subset
    collected = {name: [] for name in bucket_names}

    # the first two columns are skipped; the rest are subset score columns
    for column in result.columns[2:]:
        scores = result[column].values
        target = bucket_for_category.get(length_categories[column])
        if target is not None:
            collected[target].append(scores)

    # average across the subsets of each bucket (axis 0), i.e. one value per row
    for name in bucket_names:
        result[name] = np.round(np.nanmean(collected[name], axis=0), 2)

    # the recomputed overall average is intentionally left out:
    # result["average"] = np.round(np.nanmean(result[bucket_names].values, axis=1), 2)
    return result[["model"] + bucket_names]


def _column_datatypes(frame):
    """Return the Gradio datatypes for a table: markdown for the first column, number for the rest."""
    return ["markdown"] + ["number"] * (len(frame.columns) - 1)


# Leaderboard tables, each sorted best-first on its ranking column.
herm_data = load_all_data(repo_dir_herm, subdir="eval-set").sort_values(by="average", ascending=False)
herm_data_avg = avg_over_herm(herm_data).sort_values(by="average", ascending=False)
herm_data_length = length_bias_check(herm_data).sort_values(by="Terse Bias", ascending=False)
prefs_data = load_all_data(repo_dir_herm, subdir="pref-sets").sort_values(by="average", ascending=False)
# prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)

# Column datatypes handed to the matching gr.Dataframe components.
col_types_herm = _column_datatypes(herm_data)
col_types_herm_avg = _column_datatypes(herm_data_avg)
cols_herm_data_length = _column_datatypes(herm_data_length)
col_types_prefs = _column_datatypes(prefs_data)
# col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)

# Evaluation dataset ("filtered" split) that backs the random-sample viewer.
# NOTE(review): newer `datasets` releases reportedly replace `use_auth_token`
# with `token` — confirm against the pinned version before changing it.
eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")
def random_sample(r: gr.Request, subset):
    """
    Return one randomly chosen row of `eval_set`, rendered as markdown.

    Args:
        r: request object; unused in the body.
            NOTE(review): presumably injected by Gradio because of the type
            annotation — confirm before removing the parameter.
        subset: None or an empty selection to sample from the whole dataset,
            otherwise a subset name or a list of names; only rows whose
            "subset" field is among them are eligible.

    Returns:
        A markdown string with one "**key**:" section per field of the sampled
        row, or a short notice when no row matches the selection.
    """
    if not subset:
        candidates = eval_set
    else:
        # a single name is accepted as well as a list of names
        wanted = [subset] if isinstance(subset, str) else list(subset)
        candidates = eval_set.filter(lambda row: row["subset"] in wanted)

    total = len(candidates)
    if total == 0:
        # previously this case crashed inside np.random.randint
        return "No samples available for the selected subset(s)."

    # np.random.randint excludes its upper bound, so `total` (not `total - 1`)
    # is required for the last row to be reachable and for one-row datasets to work.
    sample = candidates[int(np.random.randint(0, total))]

    return "\n\n".join(f"**{key}**:\n\n{value}" for key, value in sample.items())

# Distinct values of the "subset" field; these populate the viewer's dropdown.
subsets = eval_set.unique("subset")

# UI definition. Components are created in the order and nesting in which they
# should appear, so the statement order below is significant.
with gr.Blocks() as app:
    # Header text first, then one tab per leaderboard view, the "About" text and the dataset viewer.
    with gr.Row():
        gr.Markdown(TOP_TEXT)
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # NOTE: the name `herm_table` is rebound by each of the three tables
        # below, so after this block it refers to the length-bias table only.
        with gr.TabItem("HERM Eval Set - Overview"):
            # per-section averages produced by avg_over_herm()
            with gr.Row():
                herm_table = gr.Dataframe(
                    herm_data_avg.values,
                    datatype=col_types_herm_avg,
                    headers=herm_data_avg.columns.tolist(),
                    elem_id="herm_dataframe_avg",
                    height=1000,
                )
        with gr.TabItem("HERM Eval Set - Detailed"):
            # the full eval-set table as loaded by load_all_data()
            with gr.Row():
                herm_table = gr.Dataframe(
                    herm_data.values,
                    datatype=col_types_herm,
                    headers=herm_data.columns.tolist(),
                    elem_id="herm_dataframe",
                    height=1000,
                )
        with gr.TabItem("HERM Eval Set - Length Bias"):
            # the three buckets produced by length_bias_check()
            with gr.Row():
                herm_table = gr.Dataframe(
                    herm_data_length.values,
                    datatype=cols_herm_data_length,
                    headers=herm_data_length.columns.tolist(),
                    elem_id="herm_dataframe_length",
                    height=1000,
                )
        with gr.TabItem("Known Pref. Sets"):
            with gr.Row():
                PREF_SET_TEXT = """
                For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets).
                """
                gr.Markdown(PREF_SET_TEXT)
            with gr.Row():
                pref_sets_table = gr.Dataframe(
                    prefs_data.values,
                    datatype=col_types_prefs,
                    headers=prefs_data.columns.tolist(),
                    elem_id="prefs_dataframe",
                    height=1000,
                )

        with gr.TabItem("About"):
            with gr.Row():
                gr.Markdown(ABOUT_TEXT)

        with gr.TabItem("Dataset Viewer"):
            with gr.Row():
                # loads one sample
                gr.Markdown("## Random Dataset Sample Viewer")
                # multiselect dropdown; no subset is preselected (value=None)
                subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
                button = gr.Button("Show Random Sample")

            with gr.Row():
                # placeholder text until the first sample is requested
                sample_display = gr.Markdown("{sampled data loads here}")

            # clicking the button renders a fresh random sample into sample_display
            button.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])
        # removed plot because not pretty enough
        # with gr.TabItem("Model Correlation"):
        #     with gr.Row():
        #         plot = plot_avg_correlation(herm_data_avg, prefs_data)
        #         gr.Plot(plot)
                
# Load data when app starts, TODO make this used somewhere...
# def load_data_on_start():
#     data_herm = load_all_data(repo_dir_herm)
#     herm_table.update(data_herm)

#     data_herm_avg = avg_over_herm(repo_dir_herm)
#     herm_table.update(data_herm_avg)

#     data_prefs = load_all_data(repo_dir_prefs)
#     pref_sets_table.update(data_prefs)

# Restart the Space on a fixed schedule from a background thread.
# NOTE(review): presumably so that a restart re-runs this script and pulls
# fresh results — confirm.
RESTART_INTERVAL_SECONDS = 3 * 60 * 60  # restarted every 3h

scheduler = BackgroundScheduler()
scheduler.add_job(
    restart_space,
    trigger="interval",
    seconds=RESTART_INTERVAL_SECONDS,
)
scheduler.start()


# Enable request queuing, then start serving the app.
app.queue().launch()