albertvillanova HF staff committed on
Commit
bea7063
•
1 Parent(s): 22fb9eb

Support more than 2 models

Browse files
Files changed (3) hide show
  1. app.py +27 -39
  2. src/details.py +24 -22
  3. src/results.py +22 -20
app.py CHANGED
@@ -7,7 +7,7 @@ from src.details import (
7
  clear_details,
8
  display_details,
9
  display_loading_message_for_details,
10
- load_details_dataframes,
11
  update_load_details_component,
12
  update_sample_idx_component,
13
  update_subtasks_component,
@@ -20,7 +20,7 @@ from src.results import (
20
  display_results,
21
  download_results,
22
  fetch_result_paths,
23
- load_results_dataframes,
24
  plot_results,
25
  sort_result_paths_per_model,
26
  update_load_results_component,
@@ -30,11 +30,11 @@ from src.results import (
30
 
31
  # if __name__ == "__main__":
32
  result_paths_per_model = sort_result_paths_per_model(fetch_result_paths())
33
- load_results_dataframes = partial(load_results_dataframes, result_paths_per_model=result_paths_per_model)
34
 
35
- with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}") as demo:
36
  gr.HTML("<h1 style='text-align: center;'>Compare Results of the 🤗 Open LLM Leaderboard</h1>")
37
- gr.HTML("<h3 style='text-align: center;'>Select 2 models to load and compare their results</h3>")
38
  gr.HTML(
39
  "<p style='text-align: center; color:orange;'>⚠ This demo is a beta version, and we're actively working on it, so you might find some tiny bugs! Please report any issues you have in the Community tab to help us make it better for all.</p>"
40
  )
@@ -43,10 +43,7 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
43
  "Check out the [documentation](https://huggingface.co/docs/leaderboards/open_llm_leaderboard/about) 📄 to find explanations on the evaluations used, their configuration parameters and details on the input/outputs for the models."
44
  )
45
  with gr.Row():
46
- with gr.Column():
47
- model_id_1 = gr.Dropdown(choices=list(result_paths_per_model.keys()), label="Models")
48
- with gr.Column():
49
- model_id_2 = gr.Dropdown(choices=list(result_paths_per_model.keys()), label="Models")
50
 
51
  with gr.Row():
52
  with gr.Tab("Results"):
@@ -69,8 +66,7 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
69
  results_plot_1 = gr.Plot(visible=True)
70
  results_plot_2 = gr.Plot(visible=True)
71
  results = gr.HTML()
72
- results_dataframe_1 = gr.Dataframe(visible=False)
73
- results_dataframe_2 = gr.Dataframe(visible=False)
74
  download_results_btn = gr.Button("Download")
75
  results_file = gr.File(visible=False)
76
  with gr.Tab("Configs"):
@@ -115,12 +111,10 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
115
  )
116
  details_show_only_differences = gr.Checkbox(label="Show Only Differences", value=False, info="Options")
117
  details = gr.HTML()
118
- details_dataframe_1 = gr.Dataframe(visible=False)
119
- details_dataframe_2 = gr.Dataframe(visible=False)
120
- details_dataframe = gr.DataFrame(visible=False)
121
 
122
  gr.on(
123
- triggers=[model_id_1.input, model_id_2.input],
124
  fn=update_load_results_component,
125
  outputs=[load_results_btn, load_configs_btn],
126
  )
@@ -129,9 +123,9 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
129
  fn=display_loading_message_for_results,
130
  outputs=[results, configs],
131
  ).then(
132
- fn=load_results_dataframes,
133
- inputs=[model_id_1, model_id_2],
134
- outputs=[results_dataframe_1, results_dataframe_2],
135
  ).then(
136
  fn=update_tasks_component,
137
  outputs=[results_task, configs_task],
@@ -152,18 +146,17 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
152
  # Display results
153
  gr.on(
154
  triggers=[
155
- results_dataframe_1.change,
156
- results_dataframe_2.change,
157
  results_task.change,
158
  hide_std_errors.change,
159
  show_only_differences.change,
160
  ],
161
  fn=display_results,
162
- inputs=[results_task, hide_std_errors, show_only_differences, results_dataframe_1, results_dataframe_2],
163
  outputs=[results, configs],
164
  ).then(
165
  fn=plot_results,
166
- inputs=[results_task, results_dataframe_1, results_dataframe_2], # results,
167
  outputs=[results_plot_1, results_plot_2],
168
  ).then(
169
  fn=clear_results_file,
@@ -178,10 +171,8 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
178
  triggers=[clear_results_btn.click, clear_configs_btn.click],
179
  fn=clear_results,
180
  outputs=[
181
- model_id_1,
182
- model_id_2,
183
- results_dataframe_1,
184
- results_dataframe_2,
185
  load_results_btn,
186
  load_configs_btn,
187
  results_task,
@@ -203,41 +194,38 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
203
  outputs=[login_btn, subtask],
204
  )
205
  gr.on(
206
- triggers=[model_id_1.input, model_id_2.input, subtask.input, details_task.input],
207
  fn=update_load_details_component,
208
- inputs=[model_id_1, model_id_2, subtask],
209
  outputs=load_details_btn,
210
  )
211
  load_details_btn.click(
212
  fn=display_loading_message_for_details,
213
  outputs=details,
214
  ).then(
215
- fn=load_details_dataframes,
216
- inputs=[subtask, model_id_1, model_id_2],
217
- outputs=[details_dataframe_1, details_dataframe_2],
218
  ).then(
219
  fn=update_sample_idx_component,
220
- inputs=[details_dataframe_1, details_dataframe_2],
221
  outputs=sample_idx,
222
  )
223
  gr.on(
224
  triggers=[
225
- details_dataframe_1.change,
226
- details_dataframe_2.change,
227
  sample_idx.change,
228
  details_show_only_differences.change,
229
  ],
230
  fn=display_details,
231
- inputs=[sample_idx, details_show_only_differences, details_dataframe_1, details_dataframe_2],
232
  outputs=details,
233
  )
234
  clear_details_btn.click(
235
  fn=clear_details,
236
  outputs=[
237
- model_id_1,
238
- model_id_2,
239
- details_dataframe_1,
240
- details_dataframe_2,
241
  details_task,
242
  subtask,
243
  load_details_btn,
 
7
  clear_details,
8
  display_details,
9
  display_loading_message_for_details,
10
+ load_details,
11
  update_load_details_component,
12
  update_sample_idx_component,
13
  update_subtasks_component,
 
20
  display_results,
21
  download_results,
22
  fetch_result_paths,
23
+ load_results,
24
  plot_results,
25
  sort_result_paths_per_model,
26
  update_load_results_component,
 
30
 
31
  # if __name__ == "__main__":
32
  result_paths_per_model = sort_result_paths_per_model(fetch_result_paths())
33
+ load_results = partial(load_results, result_paths_per_model=result_paths_per_model)
34
 
35
+ with gr.Blocks(fill_height=True, fill_width=True) as demo:
36
  gr.HTML("<h1 style='text-align: center;'>Compare Results of the 🤗 Open LLM Leaderboard</h1>")
37
+ gr.HTML("<h3 style='text-align: center;'>Select models to load and compare their results</h3>")
38
  gr.HTML(
39
  "<p style='text-align: center; color:orange;'>⚠ This demo is a beta version, and we're actively working on it, so you might find some tiny bugs! Please report any issues you have in the Community tab to help us make it better for all.</p>"
40
  )
 
43
  "Check out the [documentation](https://huggingface.co/docs/leaderboards/open_llm_leaderboard/about) 📄 to find explanations on the evaluations used, their configuration parameters and details on the input/outputs for the models."
44
  )
45
  with gr.Row():
46
+ model_ids = gr.Dropdown(choices=list(result_paths_per_model.keys()), label="Models", multiselect=True)
 
 
 
47
 
48
  with gr.Row():
49
  with gr.Tab("Results"):
 
66
  results_plot_1 = gr.Plot(visible=True)
67
  results_plot_2 = gr.Plot(visible=True)
68
  results = gr.HTML()
69
+ results_dataframe = gr.State()
 
70
  download_results_btn = gr.Button("Download")
71
  results_file = gr.File(visible=False)
72
  with gr.Tab("Configs"):
 
111
  )
112
  details_show_only_differences = gr.Checkbox(label="Show Only Differences", value=False, info="Options")
113
  details = gr.HTML()
114
+ details_dataframe = gr.State()
 
 
115
 
116
  gr.on(
117
+ triggers=[model_ids.input],
118
  fn=update_load_results_component,
119
  outputs=[load_results_btn, load_configs_btn],
120
  )
 
123
  fn=display_loading_message_for_results,
124
  outputs=[results, configs],
125
  ).then(
126
+ fn=load_results,
127
+ inputs=model_ids,
128
+ outputs=results_dataframe,
129
  ).then(
130
  fn=update_tasks_component,
131
  outputs=[results_task, configs_task],
 
146
  # Display results
147
  gr.on(
148
  triggers=[
149
+ results_dataframe.change,
 
150
  results_task.change,
151
  hide_std_errors.change,
152
  show_only_differences.change,
153
  ],
154
  fn=display_results,
155
+ inputs=[results_dataframe, results_task, hide_std_errors, show_only_differences],
156
  outputs=[results, configs],
157
  ).then(
158
  fn=plot_results,
159
+ inputs=[results_dataframe, results_task],
160
  outputs=[results_plot_1, results_plot_2],
161
  ).then(
162
  fn=clear_results_file,
 
171
  triggers=[clear_results_btn.click, clear_configs_btn.click],
172
  fn=clear_results,
173
  outputs=[
174
+ model_ids,
175
+ results_dataframe,
 
 
176
  load_results_btn,
177
  load_configs_btn,
178
  results_task,
 
194
  outputs=[login_btn, subtask],
195
  )
196
  gr.on(
197
+ triggers=[model_ids.input, subtask.input, details_task.input],
198
  fn=update_load_details_component,
199
+ inputs=[model_ids, subtask],
200
  outputs=load_details_btn,
201
  )
202
  load_details_btn.click(
203
  fn=display_loading_message_for_details,
204
  outputs=details,
205
  ).then(
206
+ fn=load_details,
207
+ inputs=[model_ids, subtask],
208
+ outputs=details_dataframe,
209
  ).then(
210
  fn=update_sample_idx_component,
211
+ inputs=[details_dataframe],
212
  outputs=sample_idx,
213
  )
214
  gr.on(
215
  triggers=[
216
+ details_dataframe.change,
 
217
  sample_idx.change,
218
  details_show_only_differences.change,
219
  ],
220
  fn=display_details,
221
+ inputs=[details_dataframe, sample_idx, details_show_only_differences],
222
  outputs=details,
223
  )
224
  clear_details_btn.click(
225
  fn=clear_details,
226
  outputs=[
227
+ model_ids,
228
+ details_dataframe,
 
 
229
  details_task,
230
  subtask,
231
  load_details_btn,
src/details.py CHANGED
@@ -32,8 +32,8 @@ def update_subtasks_component(task, profile: gr.OAuthProfile | None):
32
  )
33
 
34
 
35
- def update_load_details_component(model_id_1, model_id_2, subtask):
36
- if (model_id_1 or model_id_2) and subtask:
37
  return gr.Button("Load Details", interactive=True)
38
  else:
39
  return gr.Button("Load Details", interactive=False)
@@ -56,24 +56,22 @@ async def load_details_dataframe(model_id, subtask):
56
  path = max(paths)
57
  data = await load_jsonlines_file(path)
58
  df = pd.json_normalize(data)
59
- df = df.sort_values(by=["doc_id"])
60
- # df = df.rename_axis("Parameters", axis="columns")
61
- df["model_name"] = model_id # Keep model_name
62
- return df
63
- # return df.set_index(pd.Index([model_id])).reset_index()
64
 
65
 
66
- async def load_details_dataframes(subtask, *model_ids):
67
- result = await asyncio.gather(*[load_details_dataframe(model_id, subtask) for model_id in model_ids])
68
- return result
 
69
 
70
 
71
- def display_details(sample_idx, show_only_differences, *dfs):
72
- rows = [df.iloc[sample_idx] for df in dfs if "model_name" in df.columns and sample_idx < len(df)]
73
- if not rows:
74
  return
75
- # Pop model_name and add it to the column name
76
- df = pd.concat([row.rename(row.pop("model_name")) for row in rows], axis="columns")
77
 
78
  # Style
79
  # - Option: Show only differences
@@ -92,15 +90,21 @@ def display_details(sample_idx, show_only_differences, *dfs):
92
  {
93
  "selector": "td",
94
  "props": [("overflow-wrap", "break-word"), ("max-width", "1px")],
95
- }
 
 
 
 
96
  ]
97
  )
98
  .to_html()
99
  )
100
 
101
 
102
- def update_sample_idx_component(*dfs):
103
- maximum = max([len(df) - 1 for df in dfs])
 
 
104
  return gr.Number(
105
  label="Sample Index",
106
  info="Index of the sample to be displayed",
@@ -112,11 +116,9 @@ def update_sample_idx_component(*dfs):
112
 
113
 
114
  def clear_details():
115
- # model_id_1, model_id_2, details_dataframe_1, details_dataframe_2, details_task, subtask, load_details_btn, sample_idx
116
  return (
117
- None,
118
- None,
119
- None,
120
  None,
121
  None,
122
  None,
 
32
  )
33
 
34
 
35
+ def update_load_details_component(model_id, subtask):
36
+ if model_id and subtask:
37
  return gr.Button("Load Details", interactive=True)
38
  else:
39
  return gr.Button("Load Details", interactive=False)
 
56
  path = max(paths)
57
  data = await load_jsonlines_file(path)
58
  df = pd.json_normalize(data)
59
+ # Keep model_name:
60
+ df["model_name"] = model_id
61
+ return df.sort_values("doc_id").set_index("doc_id", drop=False).set_index("model_name", append=True)
 
 
62
 
63
 
64
+ async def load_details(model_ids, subtask):
65
+ dfs = await asyncio.gather(*[load_details_dataframe(model_id, subtask) for model_id in model_ids])
66
+ if dfs:
67
+ return pd.concat(dfs)
68
 
69
 
70
+ def display_details(df, sample_idx, show_only_differences):
71
+ if df is None:
 
72
  return
73
+ df = df.loc[df.index.levels[0][sample_idx]]
74
+ df = df.T.rename_axis(columns=None)
75
 
76
  # Style
77
  # - Option: Show only differences
 
90
  {
91
  "selector": "td",
92
  "props": [("overflow-wrap", "break-word"), ("max-width", "1px")],
93
+ },
94
+ {
95
+ "selector": ".col_heading",
96
+ "props": [("width", f"{100 / len(df.columns)}%")],
97
+ },
98
  ]
99
  )
100
  .to_html()
101
  )
102
 
103
 
104
+ def update_sample_idx_component(df):
105
+ if df is None:
106
+ return
107
+ maximum = len(df) - 1
108
  return gr.Number(
109
  label="Sample Index",
110
  info="Index of the sample to be displayed",
 
116
 
117
 
118
  def clear_details():
119
+ # model_ids, details_dataframe, details_task, subtask, load_details_btn, sample_idx
120
  return (
121
+ gr.Dropdown(value=[]),
 
 
122
  None,
123
  None,
124
  None,
src/results.py CHANGED
@@ -42,24 +42,16 @@ async def load_results_dataframe(model_id, result_paths_per_model=None):
42
  model_name = result.get("model_name", "Model")
43
  df = pd.json_normalize([data])
44
  # df.columns = df.columns.str.split(".") # .split return a list instead of a tuple
45
- return df.set_index(pd.Index([model_name])).reset_index()
46
 
47
 
48
- async def load_results_dataframes(*model_ids, result_paths_per_model=None):
49
- result = await asyncio.gather(
50
- *[load_results_dataframe(model_id, result_paths_per_model) for model_id in model_ids]
51
- )
52
- return result
53
-
54
-
55
- def concat_results(dfs):
56
- dfs = [df.set_index("index") for df in dfs if "index" in df.columns]
57
  if dfs:
58
  return pd.concat(dfs)
59
 
60
 
61
- def display_results(task, hide_std_errors, show_only_differences, *dfs):
62
- df = concat_results(dfs)
63
  if df is None:
64
  return None, None
65
  df = df.T.rename_axis(columns=None)
@@ -111,6 +103,19 @@ def display_tab(tab, df, task, hide_std_errors=True, show_only_differences=False
111
  # Format index values: remove prefix and suffix
112
  start = len(f"{tab}.leaderboard_") if task == "All" else len(f"{tab}.{task} ")
113
  df.format_index(lambda idx: idx[start:].removesuffix(",none"), axis="index")
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  return df.to_html()
115
 
116
 
@@ -127,11 +132,9 @@ def update_tasks_component():
127
 
128
 
129
  def clear_results():
130
- # model_id_1, model_id_2, dataframe_1, dataframe_2, load_results_btn, load_configs_btn, results_task, configs_task
131
  return (
132
- None,
133
- None,
134
- None,
135
  None,
136
  *(gr.Button("Load", interactive=False),) * 2,
137
  *(
@@ -151,8 +154,7 @@ def display_loading_message_for_results():
151
  return ("<h3 style='text-align: center;'>Loading...</h3>",) * 2
152
 
153
 
154
- def plot_results(task, *dfs):
155
- df = concat_results(dfs)
156
  if df is not None:
157
  df = df[
158
  [
@@ -190,7 +192,7 @@ def plot_results(task, *dfs):
190
  df.T.rename_axis(columns="Model"),
191
  barmode="group",
192
  labels={"index": "Benchmark" if task == "All" else "Subtask", "value": "Score"},
193
- color_discrete_sequence=["#FF9D00", "#32343D"],
194
  )
195
  fig_1.update_yaxes(range=[0, 1])
196
  fig_2 = px.line_polar(
@@ -200,7 +202,7 @@ def plot_results(task, *dfs):
200
  color="Model",
201
  line_close=True,
202
  range_r=[0, 1],
203
- color_discrete_sequence=["#FF9D00", "#32343D"],
204
  )
205
  # Avoid bug with radar:
206
  fig_2.update_layout(
 
42
  model_name = result.get("model_name", "Model")
43
  df = pd.json_normalize([data])
44
  # df.columns = df.columns.str.split(".") # .split return a list instead of a tuple
45
+ return df.set_index(pd.Index([model_name]))
46
 
47
 
48
+ async def load_results(model_ids, result_paths_per_model=None):
49
+ dfs = await asyncio.gather(*[load_results_dataframe(model_id, result_paths_per_model) for model_id in model_ids])
 
 
 
 
 
 
 
50
  if dfs:
51
  return pd.concat(dfs)
52
 
53
 
54
+ def display_results(df, task, hide_std_errors, show_only_differences):
 
55
  if df is None:
56
  return None, None
57
  df = df.T.rename_axis(columns=None)
 
103
  # Format index values: remove prefix and suffix
104
  start = len(f"{tab}.leaderboard_") if task == "All" else len(f"{tab}.{task} ")
105
  df.format_index(lambda idx: idx[start:].removesuffix(",none"), axis="index")
106
+ # Fix overflow
107
+ df.set_table_styles(
108
+ [
109
+ {
110
+ "selector": "td",
111
+ "props": [("overflow-wrap", "break-word"), ("max-width", "1px")],
112
+ },
113
+ {
114
+ "selector": ".col_heading",
115
+ "props": [("width", f"{100 / len(df.columns)}%")],
116
+ },
117
+ ]
118
+ )
119
  return df.to_html()
120
 
121
 
 
132
 
133
 
134
  def clear_results():
135
+ # model_ids, dataframe, load_results_btn, load_configs_btn, results_task, configs_task
136
  return (
137
+ gr.Dropdown(value=[]),
 
 
138
  None,
139
  *(gr.Button("Load", interactive=False),) * 2,
140
  *(
 
154
  return ("<h3 style='text-align: center;'>Loading...</h3>",) * 2
155
 
156
 
157
+ def plot_results(df, task):
 
158
  if df is not None:
159
  df = df[
160
  [
 
192
  df.T.rename_axis(columns="Model"),
193
  barmode="group",
194
  labels={"index": "Benchmark" if task == "All" else "Subtask", "value": "Score"},
195
+ color_discrete_sequence=px.colors.qualitative.Safe, # TODO: https://plotly.com/python/discrete-color/
196
  )
197
  fig_1.update_yaxes(range=[0, 1])
198
  fig_2 = px.line_polar(
 
202
  color="Model",
203
  line_close=True,
204
  range_r=[0, 1],
205
+ color_discrete_sequence=px.colors.qualitative.Safe, # TODO: https://plotly.com/python/discrete-color/
206
  )
207
  # Avoid bug with radar:
208
  fig_2.update_layout(