Commit
•
bea7063
1
Parent(s):
22fb9eb
Support more than 2 models
Browse files- app.py +27 -39
- src/details.py +24 -22
- src/results.py +22 -20
app.py
CHANGED
@@ -7,7 +7,7 @@ from src.details import (
|
|
7 |
clear_details,
|
8 |
display_details,
|
9 |
display_loading_message_for_details,
|
10 |
-
|
11 |
update_load_details_component,
|
12 |
update_sample_idx_component,
|
13 |
update_subtasks_component,
|
@@ -20,7 +20,7 @@ from src.results import (
|
|
20 |
display_results,
|
21 |
download_results,
|
22 |
fetch_result_paths,
|
23 |
-
|
24 |
plot_results,
|
25 |
sort_result_paths_per_model,
|
26 |
update_load_results_component,
|
@@ -30,11 +30,11 @@ from src.results import (
|
|
30 |
|
31 |
# if __name__ == "__main__":
|
32 |
result_paths_per_model = sort_result_paths_per_model(fetch_result_paths())
|
33 |
-
|
34 |
|
35 |
-
with gr.Blocks(fill_height=True, fill_width=True
|
36 |
gr.HTML("<h1 style='text-align: center;'>Compare Results of the 🤗 Open LLM Leaderboard</h1>")
|
37 |
-
gr.HTML("<h3 style='text-align: center;'>Select
|
38 |
gr.HTML(
|
39 |
"<p style='text-align: center; color:orange;'>⚠ This demo is a beta version, and we're actively working on it, so you might find some tiny bugs! Please report any issues you have in the Community tab to help us make it better for all.</p>"
|
40 |
)
|
@@ -43,10 +43,7 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
|
|
43 |
"Check out the [documentation](https://huggingface.co/docs/leaderboards/open_llm_leaderboard/about) π to find explanations on the evaluations used, their configuration parameters and details on the input/outputs for the models."
|
44 |
)
|
45 |
with gr.Row():
|
46 |
-
|
47 |
-
model_id_1 = gr.Dropdown(choices=list(result_paths_per_model.keys()), label="Models")
|
48 |
-
with gr.Column():
|
49 |
-
model_id_2 = gr.Dropdown(choices=list(result_paths_per_model.keys()), label="Models")
|
50 |
|
51 |
with gr.Row():
|
52 |
with gr.Tab("Results"):
|
@@ -69,8 +66,7 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
|
|
69 |
results_plot_1 = gr.Plot(visible=True)
|
70 |
results_plot_2 = gr.Plot(visible=True)
|
71 |
results = gr.HTML()
|
72 |
-
|
73 |
-
results_dataframe_2 = gr.Dataframe(visible=False)
|
74 |
download_results_btn = gr.Button("Download")
|
75 |
results_file = gr.File(visible=False)
|
76 |
with gr.Tab("Configs"):
|
@@ -115,12 +111,10 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
|
|
115 |
)
|
116 |
details_show_only_differences = gr.Checkbox(label="Show Only Differences", value=False, info="Options")
|
117 |
details = gr.HTML()
|
118 |
-
|
119 |
-
details_dataframe_2 = gr.Dataframe(visible=False)
|
120 |
-
details_dataframe = gr.DataFrame(visible=False)
|
121 |
|
122 |
gr.on(
|
123 |
-
triggers=[
|
124 |
fn=update_load_results_component,
|
125 |
outputs=[load_results_btn, load_configs_btn],
|
126 |
)
|
@@ -129,9 +123,9 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
|
|
129 |
fn=display_loading_message_for_results,
|
130 |
outputs=[results, configs],
|
131 |
).then(
|
132 |
-
fn=
|
133 |
-
inputs=
|
134 |
-
outputs=
|
135 |
).then(
|
136 |
fn=update_tasks_component,
|
137 |
outputs=[results_task, configs_task],
|
@@ -152,18 +146,17 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
|
|
152 |
# Display results
|
153 |
gr.on(
|
154 |
triggers=[
|
155 |
-
|
156 |
-
results_dataframe_2.change,
|
157 |
results_task.change,
|
158 |
hide_std_errors.change,
|
159 |
show_only_differences.change,
|
160 |
],
|
161 |
fn=display_results,
|
162 |
-
inputs=[results_task, hide_std_errors, show_only_differences
|
163 |
outputs=[results, configs],
|
164 |
).then(
|
165 |
fn=plot_results,
|
166 |
-
inputs=[
|
167 |
outputs=[results_plot_1, results_plot_2],
|
168 |
).then(
|
169 |
fn=clear_results_file,
|
@@ -178,10 +171,8 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
|
|
178 |
triggers=[clear_results_btn.click, clear_configs_btn.click],
|
179 |
fn=clear_results,
|
180 |
outputs=[
|
181 |
-
|
182 |
-
|
183 |
-
results_dataframe_1,
|
184 |
-
results_dataframe_2,
|
185 |
load_results_btn,
|
186 |
load_configs_btn,
|
187 |
results_task,
|
@@ -203,41 +194,38 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
|
|
203 |
outputs=[login_btn, subtask],
|
204 |
)
|
205 |
gr.on(
|
206 |
-
triggers=[
|
207 |
fn=update_load_details_component,
|
208 |
-
inputs=[
|
209 |
outputs=load_details_btn,
|
210 |
)
|
211 |
load_details_btn.click(
|
212 |
fn=display_loading_message_for_details,
|
213 |
outputs=details,
|
214 |
).then(
|
215 |
-
fn=
|
216 |
-
inputs=[
|
217 |
-
outputs=
|
218 |
).then(
|
219 |
fn=update_sample_idx_component,
|
220 |
-
inputs=[
|
221 |
outputs=sample_idx,
|
222 |
)
|
223 |
gr.on(
|
224 |
triggers=[
|
225 |
-
|
226 |
-
details_dataframe_2.change,
|
227 |
sample_idx.change,
|
228 |
details_show_only_differences.change,
|
229 |
],
|
230 |
fn=display_details,
|
231 |
-
inputs=[sample_idx, details_show_only_differences
|
232 |
outputs=details,
|
233 |
)
|
234 |
clear_details_btn.click(
|
235 |
fn=clear_details,
|
236 |
outputs=[
|
237 |
-
|
238 |
-
|
239 |
-
details_dataframe_1,
|
240 |
-
details_dataframe_2,
|
241 |
details_task,
|
242 |
subtask,
|
243 |
load_details_btn,
|
|
|
7 |
clear_details,
|
8 |
display_details,
|
9 |
display_loading_message_for_details,
|
10 |
+
load_details,
|
11 |
update_load_details_component,
|
12 |
update_sample_idx_component,
|
13 |
update_subtasks_component,
|
|
|
20 |
display_results,
|
21 |
download_results,
|
22 |
fetch_result_paths,
|
23 |
+
load_results,
|
24 |
plot_results,
|
25 |
sort_result_paths_per_model,
|
26 |
update_load_results_component,
|
|
|
30 |
|
31 |
# if __name__ == "__main__":
|
32 |
result_paths_per_model = sort_result_paths_per_model(fetch_result_paths())
|
33 |
+
load_results = partial(load_results, result_paths_per_model=result_paths_per_model)
|
34 |
|
35 |
+
with gr.Blocks(fill_height=True, fill_width=True) as demo:
|
36 |
gr.HTML("<h1 style='text-align: center;'>Compare Results of the 🤗 Open LLM Leaderboard</h1>")
|
37 |
+
gr.HTML("<h3 style='text-align: center;'>Select models to load and compare their results</h3>")
|
38 |
gr.HTML(
|
39 |
"<p style='text-align: center; color:orange;'>⚠ This demo is a beta version, and we're actively working on it, so you might find some tiny bugs! Please report any issues you have in the Community tab to help us make it better for all.</p>"
|
40 |
)
|
|
|
43 |
"Check out the [documentation](https://huggingface.co/docs/leaderboards/open_llm_leaderboard/about) π to find explanations on the evaluations used, their configuration parameters and details on the input/outputs for the models."
|
44 |
)
|
45 |
with gr.Row():
|
46 |
+
model_ids = gr.Dropdown(choices=list(result_paths_per_model.keys()), label="Models", multiselect=True)
|
|
|
|
|
|
|
47 |
|
48 |
with gr.Row():
|
49 |
with gr.Tab("Results"):
|
|
|
66 |
results_plot_1 = gr.Plot(visible=True)
|
67 |
results_plot_2 = gr.Plot(visible=True)
|
68 |
results = gr.HTML()
|
69 |
+
results_dataframe = gr.State()
|
|
|
70 |
download_results_btn = gr.Button("Download")
|
71 |
results_file = gr.File(visible=False)
|
72 |
with gr.Tab("Configs"):
|
|
|
111 |
)
|
112 |
details_show_only_differences = gr.Checkbox(label="Show Only Differences", value=False, info="Options")
|
113 |
details = gr.HTML()
|
114 |
+
details_dataframe = gr.State()
|
|
|
|
|
115 |
|
116 |
gr.on(
|
117 |
+
triggers=[model_ids.input],
|
118 |
fn=update_load_results_component,
|
119 |
outputs=[load_results_btn, load_configs_btn],
|
120 |
)
|
|
|
123 |
fn=display_loading_message_for_results,
|
124 |
outputs=[results, configs],
|
125 |
).then(
|
126 |
+
fn=load_results,
|
127 |
+
inputs=model_ids,
|
128 |
+
outputs=results_dataframe,
|
129 |
).then(
|
130 |
fn=update_tasks_component,
|
131 |
outputs=[results_task, configs_task],
|
|
|
146 |
# Display results
|
147 |
gr.on(
|
148 |
triggers=[
|
149 |
+
results_dataframe.change,
|
|
|
150 |
results_task.change,
|
151 |
hide_std_errors.change,
|
152 |
show_only_differences.change,
|
153 |
],
|
154 |
fn=display_results,
|
155 |
+
inputs=[results_dataframe, results_task, hide_std_errors, show_only_differences],
|
156 |
outputs=[results, configs],
|
157 |
).then(
|
158 |
fn=plot_results,
|
159 |
+
inputs=[results_dataframe, results_task],
|
160 |
outputs=[results_plot_1, results_plot_2],
|
161 |
).then(
|
162 |
fn=clear_results_file,
|
|
|
171 |
triggers=[clear_results_btn.click, clear_configs_btn.click],
|
172 |
fn=clear_results,
|
173 |
outputs=[
|
174 |
+
model_ids,
|
175 |
+
results_dataframe,
|
|
|
|
|
176 |
load_results_btn,
|
177 |
load_configs_btn,
|
178 |
results_task,
|
|
|
194 |
outputs=[login_btn, subtask],
|
195 |
)
|
196 |
gr.on(
|
197 |
+
triggers=[model_ids.input, subtask.input, details_task.input],
|
198 |
fn=update_load_details_component,
|
199 |
+
inputs=[model_ids, subtask],
|
200 |
outputs=load_details_btn,
|
201 |
)
|
202 |
load_details_btn.click(
|
203 |
fn=display_loading_message_for_details,
|
204 |
outputs=details,
|
205 |
).then(
|
206 |
+
fn=load_details,
|
207 |
+
inputs=[model_ids, subtask],
|
208 |
+
outputs=details_dataframe,
|
209 |
).then(
|
210 |
fn=update_sample_idx_component,
|
211 |
+
inputs=[details_dataframe],
|
212 |
outputs=sample_idx,
|
213 |
)
|
214 |
gr.on(
|
215 |
triggers=[
|
216 |
+
details_dataframe.change,
|
|
|
217 |
sample_idx.change,
|
218 |
details_show_only_differences.change,
|
219 |
],
|
220 |
fn=display_details,
|
221 |
+
inputs=[details_dataframe, sample_idx, details_show_only_differences],
|
222 |
outputs=details,
|
223 |
)
|
224 |
clear_details_btn.click(
|
225 |
fn=clear_details,
|
226 |
outputs=[
|
227 |
+
model_ids,
|
228 |
+
details_dataframe,
|
|
|
|
|
229 |
details_task,
|
230 |
subtask,
|
231 |
load_details_btn,
|
src/details.py
CHANGED
@@ -32,8 +32,8 @@ def update_subtasks_component(task, profile: gr.OAuthProfile | None):
|
|
32 |
)
|
33 |
|
34 |
|
35 |
-
def update_load_details_component(
|
36 |
-
if
|
37 |
return gr.Button("Load Details", interactive=True)
|
38 |
else:
|
39 |
return gr.Button("Load Details", interactive=False)
|
@@ -56,24 +56,22 @@ async def load_details_dataframe(model_id, subtask):
|
|
56 |
path = max(paths)
|
57 |
data = await load_jsonlines_file(path)
|
58 |
df = pd.json_normalize(data)
|
59 |
-
|
60 |
-
|
61 |
-
df
|
62 |
-
return df
|
63 |
-
# return df.set_index(pd.Index([model_id])).reset_index()
|
64 |
|
65 |
|
66 |
-
async def
|
67 |
-
|
68 |
-
|
|
|
69 |
|
70 |
|
71 |
-
def display_details(sample_idx, show_only_differences
|
72 |
-
|
73 |
-
if not rows:
|
74 |
return
|
75 |
-
|
76 |
-
df =
|
77 |
|
78 |
# Style
|
79 |
# - Option: Show only differences
|
@@ -92,15 +90,21 @@ def display_details(sample_idx, show_only_differences, *dfs):
|
|
92 |
{
|
93 |
"selector": "td",
|
94 |
"props": [("overflow-wrap", "break-word"), ("max-width", "1px")],
|
95 |
-
}
|
|
|
|
|
|
|
|
|
96 |
]
|
97 |
)
|
98 |
.to_html()
|
99 |
)
|
100 |
|
101 |
|
102 |
-
def update_sample_idx_component(
|
103 |
-
|
|
|
|
|
104 |
return gr.Number(
|
105 |
label="Sample Index",
|
106 |
info="Index of the sample to be displayed",
|
@@ -112,11 +116,9 @@ def update_sample_idx_component(*dfs):
|
|
112 |
|
113 |
|
114 |
def clear_details():
|
115 |
-
#
|
116 |
return (
|
117 |
-
|
118 |
-
None,
|
119 |
-
None,
|
120 |
None,
|
121 |
None,
|
122 |
None,
|
|
|
32 |
)
|
33 |
|
34 |
|
35 |
+
def update_load_details_component(model_id, subtask):
|
36 |
+
if model_id and subtask:
|
37 |
return gr.Button("Load Details", interactive=True)
|
38 |
else:
|
39 |
return gr.Button("Load Details", interactive=False)
|
|
|
56 |
path = max(paths)
|
57 |
data = await load_jsonlines_file(path)
|
58 |
df = pd.json_normalize(data)
|
59 |
+
# Keep model_name:
|
60 |
+
df["model_name"] = model_id
|
61 |
+
return df.sort_values("doc_id").set_index("doc_id", drop=False).set_index("model_name", append=True)
|
|
|
|
|
62 |
|
63 |
|
64 |
+
async def load_details(model_ids, subtask):
|
65 |
+
dfs = await asyncio.gather(*[load_details_dataframe(model_id, subtask) for model_id in model_ids])
|
66 |
+
if dfs:
|
67 |
+
return pd.concat(dfs)
|
68 |
|
69 |
|
70 |
+
def display_details(df, sample_idx, show_only_differences):
|
71 |
+
if df is None:
|
|
|
72 |
return
|
73 |
+
df = df.loc[df.index.levels[0][sample_idx]]
|
74 |
+
df = df.T.rename_axis(columns=None)
|
75 |
|
76 |
# Style
|
77 |
# - Option: Show only differences
|
|
|
90 |
{
|
91 |
"selector": "td",
|
92 |
"props": [("overflow-wrap", "break-word"), ("max-width", "1px")],
|
93 |
+
},
|
94 |
+
{
|
95 |
+
"selector": ".col_heading",
|
96 |
+
"props": [("width", f"{100 / len(df.columns)}%")],
|
97 |
+
},
|
98 |
]
|
99 |
)
|
100 |
.to_html()
|
101 |
)
|
102 |
|
103 |
|
104 |
+
def update_sample_idx_component(df):
|
105 |
+
if df is None:
|
106 |
+
return
|
107 |
+
maximum = len(df) - 1
|
108 |
return gr.Number(
|
109 |
label="Sample Index",
|
110 |
info="Index of the sample to be displayed",
|
|
|
116 |
|
117 |
|
118 |
def clear_details():
|
119 |
+
# model_ids, details_dataframe, details_task, subtask, load_details_btn, sample_idx
|
120 |
return (
|
121 |
+
gr.Dropdown(value=[]),
|
|
|
|
|
122 |
None,
|
123 |
None,
|
124 |
None,
|
src/results.py
CHANGED
@@ -42,24 +42,16 @@ async def load_results_dataframe(model_id, result_paths_per_model=None):
|
|
42 |
model_name = result.get("model_name", "Model")
|
43 |
df = pd.json_normalize([data])
|
44 |
# df.columns = df.columns.str.split(".") # .split return a list instead of a tuple
|
45 |
-
return df.set_index(pd.Index([model_name]))
|
46 |
|
47 |
|
48 |
-
async def
|
49 |
-
|
50 |
-
*[load_results_dataframe(model_id, result_paths_per_model) for model_id in model_ids]
|
51 |
-
)
|
52 |
-
return result
|
53 |
-
|
54 |
-
|
55 |
-
def concat_results(dfs):
|
56 |
-
dfs = [df.set_index("index") for df in dfs if "index" in df.columns]
|
57 |
if dfs:
|
58 |
return pd.concat(dfs)
|
59 |
|
60 |
|
61 |
-
def display_results(task, hide_std_errors, show_only_differences
|
62 |
-
df = concat_results(dfs)
|
63 |
if df is None:
|
64 |
return None, None
|
65 |
df = df.T.rename_axis(columns=None)
|
@@ -111,6 +103,19 @@ def display_tab(tab, df, task, hide_std_errors=True, show_only_differences=False
|
|
111 |
# Format index values: remove prefix and suffix
|
112 |
start = len(f"{tab}.leaderboard_") if task == "All" else len(f"{tab}.{task} ")
|
113 |
df.format_index(lambda idx: idx[start:].removesuffix(",none"), axis="index")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
return df.to_html()
|
115 |
|
116 |
|
@@ -127,11 +132,9 @@ def update_tasks_component():
|
|
127 |
|
128 |
|
129 |
def clear_results():
|
130 |
-
#
|
131 |
return (
|
132 |
-
|
133 |
-
None,
|
134 |
-
None,
|
135 |
None,
|
136 |
*(gr.Button("Load", interactive=False),) * 2,
|
137 |
*(
|
@@ -151,8 +154,7 @@ def display_loading_message_for_results():
|
|
151 |
return ("<h3 style='text-align: center;'>Loading...</h3>",) * 2
|
152 |
|
153 |
|
154 |
-
def plot_results(
|
155 |
-
df = concat_results(dfs)
|
156 |
if df is not None:
|
157 |
df = df[
|
158 |
[
|
@@ -190,7 +192,7 @@ def plot_results(task, *dfs):
|
|
190 |
df.T.rename_axis(columns="Model"),
|
191 |
barmode="group",
|
192 |
labels={"index": "Benchmark" if task == "All" else "Subtask", "value": "Score"},
|
193 |
-
color_discrete_sequence=
|
194 |
)
|
195 |
fig_1.update_yaxes(range=[0, 1])
|
196 |
fig_2 = px.line_polar(
|
@@ -200,7 +202,7 @@ def plot_results(task, *dfs):
|
|
200 |
color="Model",
|
201 |
line_close=True,
|
202 |
range_r=[0, 1],
|
203 |
-
color_discrete_sequence=
|
204 |
)
|
205 |
# Avoid bug with radar:
|
206 |
fig_2.update_layout(
|
|
|
42 |
model_name = result.get("model_name", "Model")
|
43 |
df = pd.json_normalize([data])
|
44 |
# df.columns = df.columns.str.split(".") # .split return a list instead of a tuple
|
45 |
+
return df.set_index(pd.Index([model_name]))
|
46 |
|
47 |
|
48 |
+
async def load_results(model_ids, result_paths_per_model=None):
|
49 |
+
dfs = await asyncio.gather(*[load_results_dataframe(model_id, result_paths_per_model) for model_id in model_ids])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
if dfs:
|
51 |
return pd.concat(dfs)
|
52 |
|
53 |
|
54 |
+
def display_results(df, task, hide_std_errors, show_only_differences):
|
|
|
55 |
if df is None:
|
56 |
return None, None
|
57 |
df = df.T.rename_axis(columns=None)
|
|
|
103 |
# Format index values: remove prefix and suffix
|
104 |
start = len(f"{tab}.leaderboard_") if task == "All" else len(f"{tab}.{task} ")
|
105 |
df.format_index(lambda idx: idx[start:].removesuffix(",none"), axis="index")
|
106 |
+
# Fix overflow
|
107 |
+
df.set_table_styles(
|
108 |
+
[
|
109 |
+
{
|
110 |
+
"selector": "td",
|
111 |
+
"props": [("overflow-wrap", "break-word"), ("max-width", "1px")],
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"selector": ".col_heading",
|
115 |
+
"props": [("width", f"{100 / len(df.columns)}%")],
|
116 |
+
},
|
117 |
+
]
|
118 |
+
)
|
119 |
return df.to_html()
|
120 |
|
121 |
|
|
|
132 |
|
133 |
|
134 |
def clear_results():
|
135 |
+
# model_ids, dataframe, load_results_btn, load_configs_btn, results_task, configs_task
|
136 |
return (
|
137 |
+
gr.Dropdown(value=[]),
|
|
|
|
|
138 |
None,
|
139 |
*(gr.Button("Load", interactive=False),) * 2,
|
140 |
*(
|
|
|
154 |
return ("<h3 style='text-align: center;'>Loading...</h3>",) * 2
|
155 |
|
156 |
|
157 |
+
def plot_results(df, task):
|
|
|
158 |
if df is not None:
|
159 |
df = df[
|
160 |
[
|
|
|
192 |
df.T.rename_axis(columns="Model"),
|
193 |
barmode="group",
|
194 |
labels={"index": "Benchmark" if task == "All" else "Subtask", "value": "Score"},
|
195 |
+
color_discrete_sequence=px.colors.qualitative.Safe, # TODO: https://plotly.com/python/discrete-color/
|
196 |
)
|
197 |
fig_1.update_yaxes(range=[0, 1])
|
198 |
fig_2 = px.line_polar(
|
|
|
202 |
color="Model",
|
203 |
line_close=True,
|
204 |
range_r=[0, 1],
|
205 |
+
color_discrete_sequence=px.colors.qualitative.Safe, # TODO: https://plotly.com/python/discrete-color/
|
206 |
)
|
207 |
# Avoid bug with radar:
|
208 |
fig_2.update_layout(
|