Bram Vanroy
committed on
Commit
·
2c801d0
1
Parent(s):
851256b
add training type
Browse files
app.py
CHANGED
@@ -62,18 +62,22 @@ def build_performance_df(performance_dict: dict[tuple[str, str], dict[str, float
|
|
62 |
:return: a pd.DataFrame that has as rows the model names and as columns the benchmarks
|
63 |
"""
|
64 |
data = []
|
|
|
|
|
65 |
for (pretrained, lang), perfs in performance_dict.items():
|
66 |
arc_perf = perfs.get(ARC, 0.0)
|
67 |
hellaswag_perf = perfs.get(HELLASWAG, 0.0)
|
68 |
mmlu_perf = perfs.get(MMLU, 0.0)
|
69 |
truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
|
|
|
70 |
|
71 |
avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
|
72 |
-
row = [pretrained, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf]
|
73 |
data.append(row)
|
74 |
|
75 |
df = pd.DataFrame.from_records(data, columns=COLS)
|
76 |
df = df.sort_values(by=[AVERAGE_COL], ascending=False)
|
|
|
77 |
return df
|
78 |
|
79 |
|
@@ -83,12 +87,12 @@ def style_df(df: DataFrame) -> Styler:
|
|
83 |
:param df: the dataframe to style
|
84 |
:return: the Styler
|
85 |
"""
|
86 |
-
styler = df.style.format("{:.2f}", subset=df.columns[
|
87 |
|
88 |
def highlight_max(col):
|
89 |
return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)
|
90 |
|
91 |
-
styler = styler.apply(highlight_max, axis=1, subset=df.columns[
|
92 |
styler = styler.hide()
|
93 |
return styler
|
94 |
|
@@ -99,8 +103,9 @@ ARC_COL = "ARC (25-shot)"
|
|
99 |
HELLASWAG_COL = "HellaSwag (10-shot)️"
|
100 |
MMLU_COL = "MMLU (5-shot)"
|
101 |
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
|
|
|
102 |
|
103 |
-
COLS = [MODEL_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
|
104 |
TYPES = ["str", "number", "number", "number", "number", "number"]
|
105 |
|
106 |
results = collect_results()
|
@@ -117,6 +122,8 @@ with gr.Blocks() as demo:
|
|
117 |
datatype=TYPES,
|
118 |
elem_id="leaderboard-table",
|
119 |
)
|
|
|
|
|
120 |
|
121 |
gr.Markdown("## LaTeX")
|
122 |
gr.Code(styled_df.to_latex(convert_css=True))
|
|
|
62 |
:return: a pd.DataFrame that has as rows the model names and as columns the benchmarks
|
63 |
"""
|
64 |
data = []
|
65 |
+
dutch_training_info = json.loads(Path(__file__).parent.joinpath("evals/dutch_models.json").read_text(encoding="utf-8"))
|
66 |
+
|
67 |
for (pretrained, lang), perfs in performance_dict.items():
|
68 |
arc_perf = perfs.get(ARC, 0.0)
|
69 |
hellaswag_perf = perfs.get(HELLASWAG, 0.0)
|
70 |
mmlu_perf = perfs.get(MMLU, 0.0)
|
71 |
truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
|
72 |
+
training_type = dutch_training_info.get(pretrained, "NA")
|
73 |
|
74 |
avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
|
75 |
+
row = [pretrained, training_type, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf]
|
76 |
data.append(row)
|
77 |
|
78 |
df = pd.DataFrame.from_records(data, columns=COLS)
|
79 |
df = df.sort_values(by=[AVERAGE_COL], ascending=False)
|
80 |
+
|
81 |
return df
|
82 |
|
83 |
|
|
|
87 |
:param df: the dataframe to style
|
88 |
:return: the Styler
|
89 |
"""
|
90 |
+
styler = df.style.format("{:.2f}", subset=df.columns[2:])
|
91 |
|
92 |
def highlight_max(col):
|
93 |
return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)
|
94 |
|
95 |
+
styler = styler.apply(highlight_max, axis=1, subset=df.columns[2:])
|
96 |
styler = styler.hide()
|
97 |
return styler
|
98 |
|
|
|
103 |
HELLASWAG_COL = "HellaSwag (10-shot)️"
|
104 |
MMLU_COL = "MMLU (5-shot)"
|
105 |
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
|
106 |
+
TRAIN_TYPE_COL = "Training type"
|
107 |
|
108 |
+
COLS = [MODEL_COL, TRAIN_TYPE_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
|
109 |
# One gradio datatype per entry in COLS, in the same order: the model name and
# the training type are text, the average and the four benchmark scores are numeric.
TYPES = ["str", "str", "number", "number", "number", "number", "number"]
|
110 |
|
111 |
results = collect_results()
|
|
|
122 |
datatype=TYPES,
|
123 |
elem_id="leaderboard-table",
|
124 |
)
|
125 |
+
gr.Markdown("Training type: <code>PT</code>: pretrained on only/mostly Dutch; <code>FT</code>: **only** finetuned on"
|
126 |
+
" Dutch; <code>NA</code> not specifically pretrained nor finetuned on Dutch but Dutch data may have been a (small) portion of the training data")
|
127 |
|
128 |
gr.Markdown("## LaTeX")
|
129 |
gr.Code(styled_df.to_latex(convert_css=True))
|