import json
from collections import defaultdict
from dataclasses import dataclass, field
from functools import cached_property
from pathlib import Path

import numpy as np
import pandas as pd
import gradio as gr
from pandas import DataFrame
from pandas.io.formats.style import Styler

# Provides TITLE, INTRO_TEXT, CREDIT and CITATION.
from content import *

ARC = "arc"
HELLASWAG = "hellaswag"
MMLU = "mmlu"
TRUTHFULQA = "truthfulqa"
BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]
# Metric per benchmark, aligned with BENCHMARKS (TruthfulQA uses mc2 instead of acc_norm).
METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]

MODEL_COL = "Model"
AVERAGE_COL = "Average"
ARC_COL = "ARC (25-shot)"
HELLASWAG_COL = "HellaSwag (10-shot)"
MMLU_COL = "MMLU (5-shot)"
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
TRAIN_TYPE_COL = "Training type"
NUM_PARAMETERS = "Num. parameters"

COLS = [MODEL_COL, TRAIN_TYPE_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
TYPES = ["str", "str", "number", "number", "number", "number", "number"]


@dataclass
class Result:
    train_type: str
    num_parameters: int
    arc: float = field(default=0.0)
    hellaswag: float = field(default=0.0)
    mmlu: float = field(default=0.0)
    truthfulqa: float = field(default=0.0)

    @cached_property
    def num_parameters_kmb(self) -> str:
        return convert_number_to_kmb(self.num_parameters)

    @cached_property
    def average(self) -> float:
        return (self.arc + self.hellaswag + self.mmlu + self.truthfulqa) / 4


def convert_number_to_kmb(number: int) -> str:
    """
    Converts a number to a string with K, M or B suffix
    :param number: the number to convert
    :return: a string with the number and a suffix, e.g. "7B", rounded to one decimal
    """
    if number >= 1_000_000_000:
        return f"{round(number / 1_000_000_000, 1)}B"
    elif number >= 1_000_000:
        return f"{round(number / 1_000_000, 1)}M"
    elif number >= 1_000:
        return f"{round(number / 1_000, 1)}K"
    else:
        return str(number)
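# Illustrative sketch, not taken from this repo: collect_results() below assumes
# lm-evaluation-harness-style JSON files anywhere under evals/. Only the keys read
# below matter; the model name in "model_args" is a hypothetical example:
#
#   {
#     "results": {"arc_nl": {"acc_norm": 0.312}, "truthfulqa_nl": {"mc2": 0.413}},
#     "config": {"model_args": "pretrained=some-org/some-model"}
#   }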
def collect_results() -> dict[tuple[str, str], dict[str, float]]:
    """
    Collects results from the evals folder and returns a dictionary of results
    :return: a dictionary of results where the keys are tuples of (model_name, language) and the values
     are dictionaries of the form {benchmark_name: performance_score}
    """
    performance_dict = defaultdict(dict)
    for pfin in Path("evals").rglob("*.json"):
        data = json.loads(pfin.read_text(encoding="utf-8"))
        if "results" not in data or "config" not in data:
            continue

        results = data["results"]
        config = data["config"]
        if "model_args" not in config:
            continue

        model_args = config["model_args"].split(",")
        pretrained = [x for x in model_args if x.startswith("pretrained=")]
        if len(pretrained) != 1:
            continue
        # Reduce "pretrained=org/model-name" to just "model-name".
        pretrained = pretrained[0].split("=")[1]
        pretrained = pretrained.split("/")[-1]

        for lang_task, perfs in results.items():
            task, lang = lang_task.split("_")
            assert task in BENCHMARKS
            if lang and task:
                metric = METRICS[BENCHMARKS.index(task)]
                p = round(perfs[metric] * 100, 1)
                performance_dict[(pretrained, lang)][task] = p

    return dict(performance_dict)


def build_performance_df(performance_dict: dict[tuple[str, str], dict[str, float]]) -> DataFrame:
    """
    Builds a dataframe from the performance dictionary
    :param performance_dict: a dictionary of results where the keys are tuples of (model_name, language)
     and the values are dictionaries of the form {benchmark_name: performance_score}
    :return: a pd.DataFrame that has as rows the model names and as columns the benchmarks
    """
    data = []
    dutch_training_info = json.loads(
        Path(__file__).parent.joinpath("evals/dutch_models.json").read_text(encoding="utf-8")
    )

    for (pretrained, lang), perfs in performance_dict.items():
        arc_perf = perfs.get(ARC, 0.0)
        hellaswag_perf = perfs.get(HELLASWAG, 0.0)
        mmlu_perf = perfs.get(MMLU, 0.0)
        truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
        training_type = dutch_training_info.get(pretrained, "NA")
        avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
        row = [pretrained, training_type, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf]
        data.append(row)

    df = pd.DataFrame.from_records(data, columns=COLS)
    df = df.sort_values(by=[AVERAGE_COL], ascending=False)

    return df


def style_df(df: DataFrame) -> Styler:
    """
    Styles the dataframe by rounding to two decimals and putting the max value in bold per column
    :param df: the dataframe to style
    :return: the Styler
    """
    styler = df.style.format("{:.2f}", subset=df.columns[2:])

    def highlight_max(col):
        return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)

    # axis=0 applies highlight_max column-wise, bolding the best score per benchmark.
    styler = styler.apply(highlight_max, axis=0, subset=df.columns[2:])
    # Hide the index in the rendered table.
    styler = styler.hide()

    return styler


results = collect_results()
original_df = build_performance_df(results)
styled_df = style_df(original_df)

with gr.Blocks() as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT)

    gr.Markdown("## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!")
    gr.components.Dataframe(
        value=original_df,
        headers=COLS,
        datatype=TYPES,
        elem_id="leaderboard-table",
    )
    gr.Markdown(
        "Training type: PT: pretrained on only/mostly Dutch; FT: **only** finetuned on Dutch;"
        " NA: not specifically pretrained or finetuned on Dutch, but Dutch data may have been a"
        " (small) portion of the training data"
    )

    gr.Markdown("## LaTeX")
    gr.Code(styled_df.to_latex(convert_css=True))

    gr.Markdown(CREDIT, elem_classes="markdown-text")
    gr.Markdown(CITATION, elem_classes="markdown-text")

if __name__ == "__main__":
    demo.launch()
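# Hypothetical usage, assuming this module is saved as app.py: `python app.py` serves the
# leaderboard locally (Gradio defaults to http://127.0.0.1:7860); calling
# demo.launch(share=True) instead would also create a temporary public link.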