import json from dataclasses import dataclass, field, fields from functools import cached_property from pathlib import Path from typing import Literal import gradio as gr import numpy as np import pandas as pd import plotly.graph_objects as go from pandas import DataFrame from pandas.io.formats.style import Styler from content import * TASK_METRICS = { "arc": "acc_norm", "hellaswag": "acc_norm", "mmlu": "acc_norm", "truthfulqa": "mc2", } MODEL_TYPE_EMOJIS = { "pretrained": "🟢", "fine-tuned": "🔶", "instruction-tuned": "⭕", "RL-tuned": "🟦", } NOT_GIVEN_SYMBOL = "❔" @dataclass class Result: model_name: str short_name: str model_type: Literal["pretrained", "fine-tuned", "instruction-tuned", "RL-tuned"] dutch_coverage: Literal["none", "pretrained", "fine-tuned"] num_parameters: int arc: float = field(default=np.nan) average: float = field(default=np.nan, init=False) hellaswag: float = field(default=np.nan) mmlu: float = field(default=np.nan) truthfulqa: float = field(default=np.nan) num_parameters_kmb: str = field(init=False) def __post_init__(self): if self.model_type not in ["pretrained", "fine-tuned", "instruction-tuned", "RL-tuned", "not-given"]: raise ValueError( f"Model type {self.model_type} must be one of 'pretrained', 'fine-tuned'," f" 'instruction-tuned', 'RL-tuned', 'not-given" ) if self.dutch_coverage not in ["none", "pretrained", "fine-tuned", "not-given"]: raise ValueError( f"Dutch coverage {self.dutch_coverage} must be one of 'none', 'pretrained', 'fine-tuned', 'not-given" ) field_names = {f.name for f in fields(self)} for task_name in TASK_METRICS: if task_name not in field_names: raise ValueError(f"Task name {task_name} not found in Result class fields so cannot create DataFrame") if any([np.isnan(getattr(self, task_name)) for task_name in TASK_METRICS]): self.average = np.nan else: self.average = sum([getattr(self, task_name) for task_name in TASK_METRICS]) / 4 self.num_parameters_kmb = convert_number_to_kmb(self.num_parameters) @dataclass class ResultSet: results: list[Result] column_names: dict[str, str] = field(default_factory=dict) column_types: dict[str, str] = field(default_factory=dict) def __post_init__(self): if not self.column_names: # Order will be the order of the columns in the DataFrame self.column_names = { "short_name": "Model", "model_type": "T", "dutch_coverage": "🇳🇱", "num_parameters": "Size", "average": "Avg.", "arc": "ARC (25-shot)", "hellaswag": "HellaSwag (10-shot)", "mmlu": "MMLU (5-shot)", "truthfulqa": "TruthfulQA (0-shot)", } self.column_types = { "Model": "markdown", "T": "str", "🇳🇱": "str", "Size": "str", "Avg.": "number", "ARC (25-shot)": "number", "HellaSwag (10-shot)": "number", "MMLU (5-shot)": "number", "TruthfulQA (0-shot)": "number", } for column_type in self.column_types: if column_type not in set(self.column_names.values()): raise ValueError( f"Column names specified in column_types must be values in column_names." f" {column_type} not found." ) if "average" not in self.column_names: raise ValueError("Column names must contain 'average' column name") field_names = [f.name for f in fields(Result)] for column_name in self.column_names: if column_name not in field_names: raise ValueError(f"Column name {column_name} not found in Result class so cannot create DataFrame") @cached_property def df(self) -> DataFrame: data = [ {col_name: getattr(result, attr) for attr, col_name in self.column_names.items()} for result in self.results ] df = pd.DataFrame(data) df = df.sort_values(by=self.column_names["average"], ascending=False) return df @cached_property def styled_df(self) -> Styler: data = [ { col_name: ( f"{result.short_name}" ) if attr == "short_name" else MODEL_TYPE_EMOJIS.get(result.model_type, NOT_GIVEN_SYMBOL) if attr == "model_type" else (result.dutch_coverage if result.dutch_coverage != "not-given" else NOT_GIVEN_SYMBOL) if attr == "dutch_coverage" else getattr(result, attr) for attr, col_name in self.column_names.items() } for result in self.results ] df = pd.DataFrame(data) df = df.sort_values(by=self.column_names["average"], ascending=False) number_cols = [col for attr, col in self.column_names.items() if attr in TASK_METRICS or attr == "average"] styler = df.style.format("{:.4f}", subset=number_cols, na_rep="") def highlight_max(col): return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None) styler = styler.apply(highlight_max, axis=0, subset=number_cols) num_params_col = self.column_names["num_parameters"] styler = styler.format(convert_number_to_kmb, subset=num_params_col) styler.set_caption("Leaderboard on Dutch benchmarks.") styler = styler.hide() return styler @cached_property def latex_df(self) -> Styler: number_cols = [col for attr, col in self.column_names.items() if attr in TASK_METRICS or attr == "average"] styler = self.df.style.format("{:.2f}", subset=number_cols, na_rep="") def highlight_max(col): return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None) styler = styler.apply(highlight_max, axis=0, subset=number_cols) num_params_col = self.column_names["num_parameters"] styler = styler.format(convert_number_to_kmb, subset=num_params_col) styler.set_caption("Leaderboard on Dutch benchmarks.") styler = styler.hide() return styler @cached_property def viz_checkboxes(self): model_col_name = self.column_names["short_name"] avg_col = self.column_names["average"] top3_models = self.df.sort_values(by=avg_col, ascending=False)[model_col_name].tolist()[:3] return gr.CheckboxGroup(self.df[model_col_name].tolist(), label="Models", value=top3_models) def plot(self, model_names: list[str]): if not model_names: return None # Only get task columns and model name task_columns = [col for attr, col in self.column_names.items() if attr in TASK_METRICS or attr == "short_name"] df = self.df[task_columns] # Rename the columns to the task names reversed_col_names = {v: k for k, v in self.column_names.items() if v != "Model"} df = df.rename(columns=reversed_col_names) # Only keep the selected models df = df[df["Model"].isin(model_names)] # Melt the dataframe to long format df = df.melt(id_vars=["Model"], var_name="Task", value_name="Score").sort_values(by="Task") # Populate figure fig = go.Figure() for model_name in model_names: model_df = df[df["Model"] == model_name] scores = model_df["Score"].tolist() tasks = model_df["Task"].tolist() # Repeat the first point at the end to close the lines # Cf. https://community.plotly.com/t/closing-line-for-radar-cart-and-popup-window-on-chart-radar/47711/4 scores.append(scores[0]) tasks.append(tasks[0]) fig.add_trace(go.Scatterpolar(r=scores, theta=tasks, name=model_name)) fig.update_layout( title="Model performance on Dutch benchmarks", ) return fig def convert_number_to_kmb(number: int) -> str: """ Converts a number to a string with K, M or B suffix :param number: the number to convert :return: a string with the number and a suffix, e.g. "7B", rounded to one decimal """ if number >= 1_000_000_000: return f"{round(number / 1_000_000_000, 1)}B" elif number >= 1_000_000: return f"{round(number / 1_000_000, 1)}M" elif number >= 1_000: return f"{round(number / 1_000, 1)}K" else: return str(number) def collect_results() -> ResultSet: """ Collects results from the evals folder and returns a dictionary of results :return: a dictionary of results where the keys are typles of (model_name, language) and the values are dictionaries of the form {benchmark_name: performance_score} """ evals_dir = Path(__file__).parent.joinpath("evals") pf_overview = evals_dir.joinpath("models.json") if not pf_overview.exists(): raise ValueError( f"Overview file {pf_overview} not found. Make sure to generate it first with `generate_overview_json.py`." ) model_info = json.loads(pf_overview.read_text(encoding="utf-8")) model_results = {} for pfin in evals_dir.rglob("*.json"): data = json.loads(pfin.read_text(encoding="utf-8")) if "results" not in data: continue task_results = data["results"] short_name = pfin.stem.split("_", 2)[2].lower() if short_name not in model_info: raise KeyError( f"Model {short_name} not found in overview file {pf_overview.name}. This means that a results JSON" f" file exists that has not yet been processed. First run the `generate_overview_json.py` script." ) if short_name not in model_results: model_results[short_name] = { "short_name": short_name, "model_name": model_info[short_name]["model_name"], "model_type": model_info[short_name]["model_type"], "dutch_coverage": model_info[short_name]["dutch_coverage"], "num_parameters": model_info[short_name]["num_parameters"], } for task_name, task_result in task_results.items(): task_name = task_name.rsplit("_", 1)[0] metric = TASK_METRICS[task_name] model_results[short_name][task_name] = task_result[metric] model_results = ResultSet([Result(**res) for short_name, res in model_results.items()]) return model_results with gr.Blocks() as demo: gr.HTML(TITLE) gr.Markdown(INTRO_TEXT) gr.Markdown( f"## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!" " All models have been benchmarked in 8-bit. `` values indicate that those benchmarks are still" " pending." ) results = collect_results() gr.components.Dataframe( results.styled_df, headers=list(results.df.columns), datatype=[results.column_types[col] for col in results.df.columns], # To ensure same order as headers interactive=False, elem_id="leaderboard-table", ) with gr.Row(): with gr.Column(): modeltypes_str = "
".join([f"- {emoji}: {modeltype}" for modeltype, emoji in MODEL_TYPE_EMOJIS.items()]) gr.Markdown(f"Model types:
{modeltypes_str}") with gr.Column(): gr.Markdown( f"Language coverage ({results.column_names['dutch_coverage']}):" f"
- `none`: no explicit/deliberate Dutch coverage," f"
- `pretrained`: pretrained on Dutch data," f"
- `fine-tuned`: fine-tuned on Dutch data" ) with gr.Column(): metrics_str = "
".join([f"- {task}: `{metric}`" for task, metric in TASK_METRICS.items()]) gr.Markdown(f"Reported metrics:
{metrics_str}") gr.Markdown("## LaTeX") gr.Code(results.latex_df.to_latex(convert_css=True)) gr.Markdown("## Visualization") with gr.Row(): with gr.Column(): buttons = results.viz_checkboxes with gr.Column(scale=2): plot = gr.Plot(container=True) buttons.change(results.plot, inputs=buttons, outputs=[plot]) demo.load(results.plot, inputs=buttons, outputs=[plot]) gr.Markdown(DISCLAIMER, elem_classes="markdown-text") gr.Markdown(CREDIT, elem_classes="markdown-text") gr.Markdown(CITATION, elem_classes="markdown-text") if __name__ == "__main__": demo.launch()