import json
from dataclasses import dataclass, field, fields
from functools import cached_property
from pathlib import Path
from typing import Literal
import gradio as gr
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from pandas import DataFrame
from pandas.io.formats.style import Styler
from content import CITATION, CREDIT, DISCLAIMER, INTRO_TEXT, TITLE
TASK_METRICS = {
"arc": "acc_norm",
"hellaswag": "acc_norm",
"mmlu": "acc_norm",
"truthfulqa": "mc2",
}
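# The metric per task is the key read from each results JSON in collect_results().
# Illustrative shape of such a file (scores are made up):
# {"results": {"arc_nl": {"acc_norm": 0.43}, "truthfulqa_nl": {"mc2": 0.39}, ...}}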
MODEL_TYPE_EMOJIS = {
"pretrained": "🟢",
"fine-tuned": "🔶",
"instruction-tuned": "⭕",
"RL-tuned": "🟦",
}
NOT_GIVEN_SYMBOL = "❔"
@dataclass
class Result:
model_name: str
short_name: str
    model_type: Literal["pretrained", "fine-tuned", "instruction-tuned", "RL-tuned", "not-given"]
    dutch_coverage: Literal["none", "pretrained", "fine-tuned", "not-given"]
num_parameters: int
    arc: float = field(default=np.nan)
    hellaswag: float = field(default=np.nan)
    mmlu: float = field(default=np.nan)
    truthfulqa: float = field(default=np.nan)
    average: float = field(default=np.nan, init=False)
    num_parameters_kmb: str = field(init=False)
def __post_init__(self):
if self.model_type not in ["pretrained", "fine-tuned", "instruction-tuned", "RL-tuned", "not-given"]:
            raise ValueError(
                f"Model type {self.model_type} must be one of 'pretrained', 'fine-tuned',"
                f" 'instruction-tuned', 'RL-tuned', 'not-given'"
            )
if self.dutch_coverage not in ["none", "pretrained", "fine-tuned", "not-given"]:
            raise ValueError(
                f"Dutch coverage {self.dutch_coverage} must be one of 'none', 'pretrained',"
                f" 'fine-tuned', 'not-given'"
            )
field_names = {f.name for f in fields(self)}
for task_name in TASK_METRICS:
if task_name not in field_names:
raise ValueError(f"Task name {task_name} not found in Result class fields so cannot create DataFrame")
        if any(np.isnan(getattr(self, task_name)) for task_name in TASK_METRICS):
            self.average = np.nan
        else:
            self.average = sum(getattr(self, task_name) for task_name in TASK_METRICS) / len(TASK_METRICS)
self.num_parameters_kmb = convert_number_to_kmb(self.num_parameters)
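# Illustrative construction (all values are made up):
#   Result(model_name="org/model-7b", short_name="model-7b", model_type="pretrained",
#          dutch_coverage="pretrained", num_parameters=7_000_000_000,
#          arc=0.40, hellaswag=0.55, mmlu=0.35, truthfulqa=0.42)
# __post_init__ then derives `average` and `num_parameters_kmb` ("7.0B").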
@dataclass
class ResultSet:
results: list[Result]
column_names: dict[str, str] = field(default_factory=dict)
column_types: dict[str, str] = field(default_factory=dict)
def __post_init__(self):
if not self.column_names:
# Order will be the order of the columns in the DataFrame
self.column_names = {
"short_name": "Model",
"model_type": "T",
"dutch_coverage": "🇳🇱",
"num_parameters": "Size",
"average": "Avg.",
"arc": "ARC (25-shot)",
"hellaswag": "HellaSwag (10-shot)",
"mmlu": "MMLU (5-shot)",
"truthfulqa": "TruthfulQA (0-shot)",
}
self.column_types = {
"Model": "markdown",
"T": "str",
"🇳🇱": "str",
"Size": "str",
"Avg.": "number",
"ARC (25-shot)": "number",
"HellaSwag (10-shot)": "number",
"MMLU (5-shot)": "number",
"TruthfulQA (0-shot)": "number",
}
        for type_col_name in self.column_types:
            if type_col_name not in set(self.column_names.values()):
                raise ValueError(
                    "Column names specified in column_types must be values in column_names."
                    f" {type_col_name} not found."
                )
if "average" not in self.column_names:
raise ValueError("Column names must contain 'average' column name")
field_names = [f.name for f in fields(Result)]
for column_name in self.column_names:
if column_name not in field_names:
raise ValueError(f"Column name {column_name} not found in Result class so cannot create DataFrame")
@cached_property
def df(self) -> DataFrame:
data = [
{col_name: getattr(result, attr) for attr, col_name in self.column_names.items()}
for result in self.results
]
df = pd.DataFrame(data)
df = df.sort_values(by=self.column_names["average"], ascending=False)
return df
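    # styled_df is the display variant of df: model type as emoji, unknown coverage as
    # the "not given" symbol, parameter counts in K/M/B notation, column maxima in bold.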
@cached_property
def styled_df(self) -> Styler:
        def display_value(result: Result, attr: str):
            # Show the model type as its emoji and unknown Dutch coverage as the "not given" symbol
            if attr == "model_type":
                return MODEL_TYPE_EMOJIS.get(result.model_type, NOT_GIVEN_SYMBOL)
            if attr == "dutch_coverage":
                return result.dutch_coverage if result.dutch_coverage != "not-given" else NOT_GIVEN_SYMBOL
            return getattr(result, attr)

        data = [
            {col_name: display_value(result, attr) for attr, col_name in self.column_names.items()}
            for result in self.results
        ]
df = pd.DataFrame(data)
df = df.sort_values(by=self.column_names["average"], ascending=False)
number_cols = [col for attr, col in self.column_names.items() if attr in TASK_METRICS or attr == "average"]
styler = df.style.format("{:.4f}", subset=number_cols, na_rep="")
def highlight_max(col):
return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)
styler = styler.apply(highlight_max, axis=0, subset=number_cols)
num_params_col = self.column_names["num_parameters"]
styler = styler.format(convert_number_to_kmb, subset=num_params_col)
styler.set_caption("Leaderboard on Dutch benchmarks.")
styler = styler.hide()
return styler
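    # latex_df applies the same styling to the plain df but rounds to two decimals; it is
    # exported below via Styler.to_latex(convert_css=True).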
@cached_property
def latex_df(self) -> Styler:
number_cols = [col for attr, col in self.column_names.items() if attr in TASK_METRICS or attr == "average"]
styler = self.df.style.format("{:.2f}", subset=number_cols, na_rep="")
def highlight_max(col):
return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)
styler = styler.apply(highlight_max, axis=0, subset=number_cols)
num_params_col = self.column_names["num_parameters"]
styler = styler.format(convert_number_to_kmb, subset=num_params_col)
styler.set_caption("Leaderboard on Dutch benchmarks.")
styler = styler.hide()
return styler
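    # viz_checkboxes builds the model selector for the radar plot, pre-selecting the
    # three models with the highest average score.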
@cached_property
def viz_checkboxes(self):
model_col_name = self.column_names["short_name"]
avg_col = self.column_names["average"]
top3_models = self.df.sort_values(by=avg_col, ascending=False)[model_col_name].tolist()[:3]
return gr.CheckboxGroup(self.df[model_col_name].tolist(), label="Models", value=top3_models)
def plot(self, model_names: list[str]):
if not model_names:
return None
# Only get task columns and model name
task_columns = [col for attr, col in self.column_names.items() if attr in TASK_METRICS or attr == "short_name"]
df = self.df[task_columns]
# Rename the columns to the task names
reversed_col_names = {v: k for k, v in self.column_names.items() if v != "Model"}
df = df.rename(columns=reversed_col_names)
# Only keep the selected models
df = df[df["Model"].isin(model_names)]
# Melt the dataframe to long format
df = df.melt(id_vars=["Model"], var_name="Task", value_name="Score").sort_values(by="Task")
# Populate figure
fig = go.Figure()
for model_name in model_names:
model_df = df[df["Model"] == model_name]
scores = model_df["Score"].tolist()
tasks = model_df["Task"].tolist()
# Repeat the first point at the end to close the lines
# Cf. https://community.plotly.com/t/closing-line-for-radar-cart-and-popup-window-on-chart-radar/47711/4
scores.append(scores[0])
tasks.append(tasks[0])
fig.add_trace(go.Scatterpolar(r=scores, theta=tasks, name=model_name))
fig.update_layout(
title="Model performance on Dutch benchmarks",
)
return fig
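# Illustrative standalone use of ResultSet (model names are made up):
#   rs = collect_results()
#   rs.df                      # plain scores, sorted by average
#   rs.styled_df               # display version shown in the leaderboard table
#   fig = rs.plot(["model-a", "model-b"])  # radar chart for the selected models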
def convert_number_to_kmb(number: int) -> str:
"""
Converts a number to a string with K, M or B suffix
:param number: the number to convert
:return: a string with the number and a suffix, e.g. "7B", rounded to one decimal
"""
if number >= 1_000_000_000:
return f"{round(number / 1_000_000_000, 1)}B"
elif number >= 1_000_000:
return f"{round(number / 1_000_000, 1)}M"
elif number >= 1_000:
return f"{round(number / 1_000, 1)}K"
else:
return str(number)
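# e.g. convert_number_to_kmb(7_241_000_000) -> "7.2B", convert_number_to_kmb(1_300) -> "1.3K"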
def collect_results() -> ResultSet:
    """
    Collects results from the evals folder and returns them as a ResultSet
    :return: a ResultSet with one Result per model, containing one score per benchmark plus their average
    """
evals_dir = Path(__file__).parent.joinpath("evals")
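    # The evals/ folder is expected to contain models.json (written by generate_overview_json.py)
    # plus one results JSON per evaluated model; everything in a result file's stem after the
    # second underscore is taken as the model's short name.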
pf_overview = evals_dir.joinpath("models.json")
if not pf_overview.exists():
raise ValueError(
f"Overview file {pf_overview} not found. Make sure to generate it first with `generate_overview_json.py`."
)
model_info = json.loads(pf_overview.read_text(encoding="utf-8"))
model_results = {}
for pfin in evals_dir.rglob("*.json"):
data = json.loads(pfin.read_text(encoding="utf-8"))
if "results" not in data:
continue
task_results = data["results"]
short_name = pfin.stem.split("_", 2)[2].lower()
if short_name not in model_info:
raise KeyError(
f"Model {short_name} not found in overview file {pf_overview.name}. This means that a results JSON"
f" file exists that has not yet been processed. First run the `generate_overview_json.py` script."
)
if short_name not in model_results:
model_results[short_name] = {
"short_name": short_name,
"model_name": model_info[short_name]["model_name"],
"model_type": model_info[short_name]["model_type"],
"dutch_coverage": model_info[short_name]["dutch_coverage"],
"num_parameters": model_info[short_name]["num_parameters"],
}
for task_name, task_result in task_results.items():
task_name = task_name.rsplit("_", 1)[0]
metric = TASK_METRICS[task_name]
model_results[short_name][task_name] = task_result[metric]
    return ResultSet([Result(**res) for res in model_results.values()])
with gr.Blocks() as demo:
gr.HTML(TITLE)
gr.Markdown(INTRO_TEXT)
    gr.Markdown(
        "## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!"
        " All models have been benchmarked in 8-bit. Empty values indicate that those benchmarks are still"
        " pending."
    )
results = collect_results()
gr.components.Dataframe(
results.styled_df,
headers=list(results.df.columns),
datatype=[results.column_types[col] for col in results.df.columns], # To ensure same order as headers
interactive=False,
elem_id="leaderboard-table",
)
with gr.Row():
with gr.Column():
            modeltypes_str = "\n".join([f"- {emoji}: {modeltype}" for modeltype, emoji in MODEL_TYPE_EMOJIS.items()])
            gr.Markdown(f"Model types:\n{modeltypes_str}")
with gr.Column():
            gr.Markdown(
                f"Language coverage ({results.column_names['dutch_coverage']}):"
                "\n- `none`: no explicit/deliberate Dutch coverage,"
                "\n- `pretrained`: pretrained on Dutch data,"
                "\n- `fine-tuned`: fine-tuned on Dutch data"
            )
with gr.Column():
            metrics_str = "\n".join([f"- {task}: `{metric}`" for task, metric in TASK_METRICS.items()])
            gr.Markdown(f"Reported metrics:\n{metrics_str}")
gr.Markdown("## LaTeX")
gr.Code(results.latex_df.to_latex(convert_css=True))
gr.Markdown("## Visualization")
with gr.Row():
with gr.Column():
buttons = results.viz_checkboxes
with gr.Column(scale=2):
plot = gr.Plot(container=True)
buttons.change(results.plot, inputs=buttons, outputs=[plot])
demo.load(results.plot, inputs=buttons, outputs=[plot])
gr.Markdown(DISCLAIMER, elem_classes="markdown-text")
gr.Markdown(CREDIT, elem_classes="markdown-text")
gr.Markdown(CITATION, elem_classes="markdown-text")
if __name__ == "__main__":
demo.launch()