import json
from dataclasses import dataclass, field, fields
from functools import cached_property
from pathlib import Path
from typing import Literal
import gradio as gr
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from pandas import DataFrame
from pandas.io.formats.style import Styler
from content import CITATION, CREDIT, DISCLAIMER, INTRO_TEXT, TITLE
TASK_METRICS = {
"arc": "acc_norm",
"hellaswag": "acc_norm",
"mmlu": "acc_norm",
"truthfulqa": "mc2",
}
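# The metric per task is the key read from each results JSON in collect_results().
# Illustrative shape of such a file (scores are made up):
# {"results": {"arc_nl": {"acc_norm": 0.43}, "truthfulqa_nl": {"mc2": 0.39}, ...}}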
MODEL_TYPE_EMOJIS = {
"pretrained": "🟢",
"fine-tuned": "🔶",
"instruction-tuned": "⭕",
"RL-tuned": "🟦",
}
NOT_GIVEN_SYMBOL = "❔"
@dataclass
class Result:
model_name: str
short_name: str
    model_type: Literal["pretrained", "fine-tuned", "instruction-tuned", "RL-tuned", "not-given"]
    dutch_coverage: Literal["none", "pretrained", "fine-tuned", "not-given"]
num_parameters: int
    arc: float = field(default=np.nan)
    hellaswag: float = field(default=np.nan)
    mmlu: float = field(default=np.nan)
    truthfulqa: float = field(default=np.nan)
    average: float = field(default=np.nan, init=False)
    num_parameters_kmb: str = field(init=False)
def __post_init__(self):
if self.model_type not in ["pretrained", "fine-tuned", "instruction-tuned", "RL-tuned", "not-given"]:
            raise ValueError(
                f"Model type {self.model_type} must be one of 'pretrained', 'fine-tuned',"
                f" 'instruction-tuned', 'RL-tuned', 'not-given'"
            )
if self.dutch_coverage not in ["none", "pretrained", "fine-tuned", "not-given"]:
            raise ValueError(
                f"Dutch coverage {self.dutch_coverage} must be one of 'none', 'pretrained',"
                f" 'fine-tuned', 'not-given'"
            )
field_names = {f.name for f in fields(self)}
for task_name in TASK_METRICS:
if task_name not in field_names:
raise ValueError(f"Task name {task_name} not found in Result class fields so cannot create DataFrame")
        if any(np.isnan(getattr(self, task_name)) for task_name in TASK_METRICS):
            self.average = np.nan
        else:
            self.average = sum(getattr(self, task_name) for task_name in TASK_METRICS) / len(TASK_METRICS)
self.num_parameters_kmb = convert_number_to_kmb(self.num_parameters)
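# Illustrative construction (all values are made up):
#   Result(model_name="org/model-7b", short_name="model-7b", model_type="pretrained",
#          dutch_coverage="pretrained", num_parameters=7_000_000_000,
#          arc=0.40, hellaswag=0.55, mmlu=0.35, truthfulqa=0.42)
# __post_init__ then derives `average` and `num_parameters_kmb` ("7.0B").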
@dataclass
class ResultSet:
results: list[Result]
column_names: dict[str, str] = field(default_factory=dict)
column_types: dict[str, str] = field(default_factory=dict)
def __post_init__(self):
if not self.column_names:
# Order will be the order of the columns in the DataFrame
self.column_names = {
"short_name": "Model",
"model_type": "T",
"dutch_coverage": "🇳🇱",
"num_parameters": "Size",
"average": "Avg.",
"arc": "ARC (25-shot)",
"hellaswag": "HellaSwag (10-shot)",
"mmlu": "MMLU (5-shot)",
"truthfulqa": "TruthfulQA (0-shot)",
}
self.column_types = {
"Model": "markdown",
"T": "str",
"🇳🇱": "str",
"Size": "str",
"Avg.": "number",
"ARC (25-shot)": "number",
"HellaSwag (10-shot)": "number",
"MMLU (5-shot)": "number",
"TruthfulQA (0-shot)": "number",
}
        for type_col_name in self.column_types:
            if type_col_name not in set(self.column_names.values()):
                raise ValueError(
                    "Column names specified in column_types must be values in column_names."
                    f" {type_col_name} not found."
                )
if "average" not in self.column_names:
raise ValueError("Column names must contain 'average' column name")
field_names = [f.name for f in fields(Result)]
for column_name in self.column_names:
if column_name not in field_names:
raise ValueError(f"Column name {column_name} not found in Result class so cannot create DataFrame")
@cached_property
def df(self) -> DataFrame:
data = [
{col_name: getattr(result, attr) for attr, col_name in self.column_names.items()}
for result in self.results
]
df = pd.DataFrame(data)
df = df.sort_values(by=self.column_names["average"], ascending=False)
return df
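    # styled_df is the display variant of df: model type as emoji, unknown coverage as
    # the "not given" symbol, parameter counts in K/M/B notation, column maxima in bold.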
@cached_property
def styled_df(self) -> Styler:
        def display_value(result: Result, attr: str):
            # Show the model type as its emoji and unknown Dutch coverage as the "not given" symbol
            if attr == "model_type":
                return MODEL_TYPE_EMOJIS.get(result.model_type, NOT_GIVEN_SYMBOL)
            if attr == "dutch_coverage":
                return result.dutch_coverage if result.dutch_coverage != "not-given" else NOT_GIVEN_SYMBOL
            return getattr(result, attr)

        data = [
            {col_name: display_value(result, attr) for attr, col_name in self.column_names.items()}
            for result in self.results
        ]
df = pd.DataFrame(data)
df = df.sort_values(by=self.column_names["average"], ascending=False)
number_cols = [col for attr, col in self.column_names.items() if attr in TASK_METRICS or attr == "average"]
styler = df.style.format("{:.4f}", subset=number_cols, na_rep="")
def highlight_max(col):
return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)
styler = styler.apply(highlight_max, axis=0, subset=number_cols)
num_params_col = self.column_names["num_parameters"]
styler = styler.format(convert_number_to_kmb, subset=num_params_col)
styler.set_caption("Leaderboard on Dutch benchmarks.")
styler = styler.hide()
return styler
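    # latex_df applies the same styling to the plain df but rounds to two decimals; it is
    # exported below via Styler.to_latex(convert_css=True).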
@cached_property
def latex_df(self) -> Styler:
number_cols = [col for attr, col in self.column_names.items() if attr in TASK_METRICS or attr == "average"]
styler = self.df.style.format("{:.2f}", subset=number_cols, na_rep="")
def highlight_max(col):
return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)
styler = styler.apply(highlight_max, axis=0, subset=number_cols)
num_params_col = self.column_names["num_parameters"]
styler = styler.format(convert_number_to_kmb, subset=num_params_col)
styler.set_caption("Leaderboard on Dutch benchmarks.")
styler = styler.hide()
return styler
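    # viz_checkboxes builds the model selector for the radar plot, pre-selecting the
    # three models with the highest average score.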
@cached_property
def viz_checkboxes(self):
model_col_name = self.column_names["short_name"]
avg_col = self.column_names["average"]
top3_models = self.df.sort_values(by=avg_col, ascending=False)[model_col_name].tolist()[:3]
return gr.CheckboxGroup(self.df[model_col_name].tolist(), label="Models", value=top3_models)
def plot(self, model_names: list[str]):
if not model_names:
return None
# Only get task columns and model name
task_columns = [col for attr, col in self.column_names.items() if attr in TASK_METRICS or attr == "short_name"]
df = self.df[task_columns]
# Rename the columns to the task names
reversed_col_names = {v: k for k, v in self.column_names.items() if v != "Model"}
df = df.rename(columns=reversed_col_names)
# Only keep the selected models
df = df[df["Model"].isin(model_names)]
# Melt the dataframe to long format
df = df.melt(id_vars=["Model"], var_name="Task", value_name="Score").sort_values(by="Task")
# Populate figure
fig = go.Figure()
for model_name in model_names:
model_df = df[df["Model"] == model_name]
scores = model_df["Score"].tolist()
tasks = model_df["Task"].tolist()
# Repeat the first point at the end to close the lines
# Cf. https://community.plotly.com/t/closing-line-for-radar-cart-and-popup-window-on-chart-radar/47711/4
scores.append(scores[0])
tasks.append(tasks[0])
fig.add_trace(go.Scatterpolar(r=scores, theta=tasks, name=model_name))
fig.update_layout(
title="Model performance on Dutch benchmarks",
)
return fig
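# Illustrative standalone use of ResultSet (model names are made up):
#   rs = collect_results()
#   rs.df                      # plain scores, sorted by average
#   rs.styled_df               # display version shown in the leaderboard table
#   fig = rs.plot(["model-a", "model-b"])  # radar chart for the selected models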
def convert_number_to_kmb(number: int) -> str:
"""
Converts a number to a string with K, M or B suffix
:param number: the number to convert
:return: a string with the number and a suffix, e.g. "7B", rounded to one decimal
"""
if number >= 1_000_000_000:
return f"{round(number / 1_000_000_000, 1)}B"
elif number >= 1_000_000:
return f"{round(number / 1_000_000, 1)}M"
elif number >= 1_000:
return f"{round(number / 1_000, 1)}K"
else:
return str(number)
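# e.g. convert_number_to_kmb(7_241_000_000) -> "7.2B", convert_number_to_kmb(1_300) -> "1.3K"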
def collect_results() -> ResultSet:
    """
    Collects results from the evals folder and returns them as a ResultSet
    :return: a ResultSet with one Result per model, containing one score per benchmark plus their average
    """
evals_dir = Path(__file__).parent.joinpath("evals")
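    # The evals/ folder is expected to contain models.json (written by generate_overview_json.py)
    # plus one results JSON per evaluated model; everything in a result file's stem after the
    # second underscore is taken as the model's short name.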
pf_overview = evals_dir.joinpath("models.json")
if not pf_overview.exists():
raise ValueError(
f"Overview file {pf_overview} not found. Make sure to generate it first with `generate_overview_json.py`."
)
model_info = json.loads(pf_overview.read_text(encoding="utf-8"))
model_results = {}
for pfin in evals_dir.rglob("*.json"):
data = json.loads(pfin.read_text(encoding="utf-8"))
if "results" not in data:
continue
task_results = data["results"]
short_name = pfin.stem.split("_", 2)[2].lower()
if short_name not in model_info:
raise KeyError(
f"Model {short_name} not found in overview file {pf_overview.name}. This means that a results JSON"
f" file exists that has not yet been processed. First run the `generate_overview_json.py` script."
)
if short_name not in model_results:
model_results[short_name] = {
"short_name": short_name,
"model_name": model_info[short_name]["model_name"],
"model_type": model_info[short_name]["model_type"],
"dutch_coverage": model_info[short_name]["dutch_coverage"],
"num_parameters": model_info[short_name]["num_parameters"],
}
for task_name, task_result in task_results.items():
task_name = task_name.rsplit("_", 1)[0]
metric = TASK_METRICS[task_name]
model_results[short_name][task_name] = task_result[metric]
    return ResultSet([Result(**res) for res in model_results.values()])
with gr.Blocks() as demo:
gr.HTML(TITLE)
gr.Markdown(INTRO_TEXT)
    gr.Markdown(
        "## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!"
        " All models have been benchmarked in 8-bit. Empty values indicate that those benchmarks are still"
        " pending."
    )
results = collect_results()
gr.components.Dataframe(
results.styled_df,
headers=list(results.df.columns),
datatype=[results.column_types[col] for col in results.df.columns], # To ensure same order as headers
interactive=False,
elem_id="leaderboard-table",
)
with gr.Row():
with gr.Column():
            modeltypes_str = "\n".join([f"- {emoji}: {modeltype}" for modeltype, emoji in MODEL_TYPE_EMOJIS.items()])
            gr.Markdown(f"Model types:\n{modeltypes_str}")
with gr.Column():
            gr.Markdown(
                f"Language coverage ({results.column_names['dutch_coverage']}):"
                "\n- `none`: no explicit/deliberate Dutch coverage,"
                "\n- `pretrained`: pretrained on Dutch data,"
                "\n- `fine-tuned`: fine-tuned on Dutch data"
            )
with gr.Column():
            metrics_str = "\n".join([f"- {task}: `{metric}`" for task, metric in TASK_METRICS.items()])
            gr.Markdown(f"Reported metrics:\n{metrics_str}")
gr.Markdown("## LaTeX")
gr.Code(results.latex_df.to_latex(convert_css=True))
gr.Markdown("## Visualization")
with gr.Row():
with gr.Column():
buttons = results.viz_checkboxes
with gr.Column(scale=2):
plot = gr.Plot(container=True)
buttons.change(results.plot, inputs=buttons, outputs=[plot])
demo.load(results.plot, inputs=buttons, outputs=[plot])
gr.Markdown(DISCLAIMER, elem_classes="markdown-text")
gr.Markdown(CREDIT, elem_classes="markdown-text")
gr.Markdown(CITATION, elem_classes="markdown-text")
if __name__ == "__main__":
demo.launch()