Spaces: AIR-Bench

refactor-leaderboard-0605
#16 by nan - opened
Files changed (3):
  1. app.py +54 -212
  2. src/display/gradio_formatting.py +92 -0
  3. src/display/utils.py +1 -1
app.py CHANGED
@@ -14,14 +14,14 @@ from src.display.css_html_js import custom_css
 from src.display.utils import COL_NAME_IS_ANONYMOUS, COL_NAME_REVISION, COL_NAME_TIMESTAMP
 from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
 from src.read_evals import get_raw_eval_results, get_leaderboard_df
-from src.utils import update_table, update_metric, update_table_long_doc, upload_file, get_default_cols, submit_results, clear_reranking_selections
-
+from src.utils import update_metric, upload_file, get_default_cols, submit_results
+from src.display.gradio_formatting import get_version_dropdown, get_search_bar, get_reranking_dropdown, get_noreranker_button, get_metric_dropdown, get_domain_dropdown, get_language_dropdown, get_anonymous_checkbox, get_revision_and_ts_checkbox, get_leaderboard_table
+from src.display.gradio_listener import set_listeners
 
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
 
-
 try:
     snapshot_download(
         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
@@ -54,6 +54,9 @@ shown_columns_long_doc, types_long_doc = get_default_cols(
 leaderboard_df_long_doc = leaderboard_df_long_doc[~leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][shown_columns_long_doc]
 leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
 
+# select reranking model
+reranking_models = sorted(list(frozenset([eval_result.reranking_model for eval_result in raw_data])))
+
 
 def update_metric_qa(
     metric: str,
@@ -66,7 +69,6 @@ def update_metric_qa(
 ):
     return update_metric(raw_data, 'qa', metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
 
-
 def update_metric_long_doc(
     metric: str,
     domains: list,
@@ -90,124 +92,47 @@ with demo:
                 with gr.Column():
                     # search retrieval models
                     with gr.Row():
-                        selected_version = gr.Dropdown(
-                            choices=["AIR-Bench_24.04",],
-                            value="AIR-Bench_24.04",
-                            label="Select the version of AIR-Bench",
-                            interactive = True
-                        )
+                        selected_version = get_version_dropdown()
                     with gr.Row():
-                        search_bar = gr.Textbox(
-                            placeholder=" 🔍 Search for retrieval methods (separate multiple queries with `;`) and press ENTER...",
-                            show_label=False,
-                            elem_id="search-bar",
-                            info="Search the retrieval methods"
-                        )
-                    # select reranking model
-                    reranking_models = sorted(list(frozenset([eval_result.reranking_model for eval_result in raw_data])))
+                        search_bar = get_search_bar()
                     with gr.Row():
-                        selected_rerankings = gr.Dropdown(
-                            choices=reranking_models,
-                            # value=reranking_models,
-                            label="Select the reranking models",
-                            elem_id="reranking-select",
-                            interactive=True,
-                            multiselect=True
-                        )
+                        selected_rerankings = get_reranking_dropdown(reranking_models)
                     with gr.Row():
-                        select_noreranker_only_btn = gr.Button(
-                            value="Only show results without ranking models",
-                        )
+                        select_noreranker_only_btn = get_noreranker_button()
 
                 with gr.Column(min_width=320):
                     # select the metric
-                    selected_metric = gr.Dropdown(
-                        choices=METRIC_LIST,
-                        value=DEFAULT_METRIC,
-                        label="Select the metric",
-                        interactive=True,
-                        elem_id="metric-select",
-                    )
+                    selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC)
                     # select domain
                     with gr.Row():
-                        selected_domains = gr.CheckboxGroup(
-                            choices=DOMAIN_COLS_QA,
-                            value=DOMAIN_COLS_QA,
-                            label="Select the domains",
-                            elem_id="domain-column-select",
-                            interactive=True,
-                        )
+                        selected_domains = get_domain_dropdown(DOMAIN_COLS_QA, DOMAIN_COLS_QA)
                     # select language
                     with gr.Row():
-                        selected_langs = gr.Dropdown(
-                            choices=LANG_COLS_QA,
-                            value=LANG_COLS_QA,
-                            label="Select the languages",
-                            elem_id="language-column-select",
-                            multiselect=True,
-                            interactive=True
-                        )
+                        selected_langs = get_language_dropdown(LANG_COLS_QA, LANG_COLS_QA)
                     with gr.Row():
-                        show_anonymous = gr.Checkbox(
-                            label="Show anonymous submissions",
-                            value=False,
-                            info="The anonymous submissions might have invalid model information."
-                        )
+                        show_anonymous = get_anonymous_checkbox()
                     with gr.Row():
-                        show_revision_and_timestamp = gr.Checkbox(
-                            label="Show submission details",
-                            value=False,
-                            info="Show the revision and timestamp information of submissions"
-                        )
 
-            leaderboard_table = gr.components.Dataframe(
-                value=leaderboard_df_qa,
-                datatype=types_qa,
-                elem_id="leaderboard-table",
-                interactive=False,
-                visible=True,
-            )
+                        show_revision_and_timestamp = get_revision_and_ts_checkbox()
 
+
+            leaderboard_table = get_leaderboard_table(leaderboard_df_qa, types_qa)
 
             # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=original_df_qa,
-                datatype=types_qa,
-                visible=False,
-            )
+            hidden_leaderboard_table_for_search = get_leaderboard_table(original_df_qa, types_qa, visible=False)
 
-            # Set search_bar listener
-            search_bar.submit(
-                update_table,
-                [
-                    hidden_leaderboard_table_for_search,
-                    selected_domains,
-                    selected_langs,
-                    selected_rerankings,
-                    search_bar,
-                    show_anonymous,
-                ],
+            set_listeners(
+                "qa",
                 leaderboard_table,
+                hidden_leaderboard_table_for_search,
+                search_bar,
+                select_noreranker_only_btn,
+                selected_domains,
+                selected_langs,
+                selected_rerankings,
+                show_anonymous,
+                show_revision_and_timestamp,
             )
 
-            # Set column-wise listener
-            for selector in [
-                selected_domains, selected_langs, show_anonymous, show_revision_and_timestamp, selected_rerankings
-            ]:
-                selector.change(
-                    update_table,
-                    [
-                        hidden_leaderboard_table_for_search,
-                        selected_domains,
-                        selected_langs,
-                        selected_rerankings,
-                        search_bar,
-                        show_anonymous,
-                        show_revision_and_timestamp
-                    ],
-                    leaderboard_table,
-                    queue=True,
-                )
-
             # set metric listener
             selected_metric.change(
                 update_metric_qa,
@@ -223,135 +148,57 @@ with demo:
                 queue=True
             )
 
-            select_noreranker_only_btn.click(
-                clear_reranking_selections,
-                outputs=selected_rerankings
-            )
-
         with gr.TabItem("Long Doc", elem_id="long-doc-benchmark-tab-table", id=1):
             with gr.Row():
                 with gr.Column():
                     with gr.Row():
-                        selected_version = gr.Dropdown(
-                            choices=["AIR-Bench_24.04",],
-                            value="AIR-Bench_24.04",
-                            label="Select the version of AIR-Bench",
-                            interactive=True
-                        )
+                        selected_version = get_version_dropdown()
                    with gr.Row():
-                        search_bar = gr.Textbox(
-                            info="Search the retrieval methods",
-                            placeholder=" 🔍 Search for retrieval methods (separate multiple queries with `;`)"
-                                        " and press ENTER...",
-                            show_label=False,
-                            elem_id="search-bar-long-doc",
-                        )
+                        search_bar = get_search_bar()
                     # select reranking model
-                    reranking_models = list(frozenset([eval_result.reranking_model for eval_result in raw_data]))
                     with gr.Row():
-                        selected_rerankings = gr.Dropdown(
-                            choices=reranking_models,
-                            # value=reranking_models,
-                            label="Select the reranking models",
-                            elem_id="reranking-select-long-doc",
-                            interactive=True,
-                            multiselect=True,
-                        )
+                        selected_rerankings = get_reranking_dropdown(reranking_models)
                     with gr.Row():
-                        select_noreranker_only_btn = gr.Button(
-                            value="Only show results without ranking models",
-                        )
+                        select_noreranker_only_btn = get_noreranker_button()
                 with gr.Column(min_width=320):
                     # select the metric
                     with gr.Row():
-                        selected_metric = gr.Dropdown(
-                            choices=METRIC_LIST,
-                            value=DEFAULT_METRIC,
-                            label="Select the metric",
-                            interactive=True,
-                            elem_id="metric-select-long-doc",
-                        )
+                        selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC)
                     # select domain
                     with gr.Row():
-                        selected_domains = gr.CheckboxGroup(
-                            choices=DOMAIN_COLS_LONG_DOC,
-                            value=DOMAIN_COLS_LONG_DOC,
-                            label="Select the domains",
-                            elem_id="domain-column-select-long-doc",
-                            interactive=True,
-                        )
+                        selected_domains = get_domain_dropdown(DOMAIN_COLS_LONG_DOC, DOMAIN_COLS_LONG_DOC)
                     # select language
                     with gr.Row():
-                        selected_langs = gr.Dropdown(
-                            choices=LANG_COLS_LONG_DOC,
-                            value=LANG_COLS_LONG_DOC,
-                            label="Select the languages",
-                            elem_id="language-column-select-long-doc",
-                            multiselect=True,
-                            interactive=True
+                        selected_langs = get_language_dropdown(
+                            LANG_COLS_LONG_DOC, LANG_COLS_LONG_DOC
                         )
                     with gr.Row():
-                        show_anonymous = gr.Checkbox(
-                            label="Show anonymous submissions",
-                            value=False,
-                            info="The anonymous submissions might have invalid model information."
-                        )
+                        show_anonymous = get_anonymous_checkbox()
                     with gr.Row():
-                        show_revision_and_timestamp = gr.Checkbox(
-                            label="Show submission details",
-                            value=False,
-                            info="Show the revision and timestamp information of submissions"
-                        )
+                        show_revision_and_timestamp = get_revision_and_ts_checkbox()
 
-            leaderboard_table_long_doc = gr.components.Dataframe(
-                value=leaderboard_df_long_doc,
-                datatype=types_long_doc,
-                elem_id="leaderboard-table-long-doc",
-                interactive=False,
-                visible=True,
+            leaderboard_table = get_leaderboard_table(
+                leaderboard_df_long_doc, types_long_doc
             )
 
             # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=original_df_long_doc,
-                datatype=types_long_doc,
-                visible=False,
+            hidden_leaderboard_table_for_search = get_leaderboard_table(
+                original_df_long_doc, types_long_doc, visible=False
            )
 
-            # Set search_bar listener
-            search_bar.submit(
-                update_table_long_doc,
-                [
-                    hidden_leaderboard_table_for_search,
-                    selected_domains,
-                    selected_langs,
-                    selected_rerankings,
-                    search_bar,
-                    show_anonymous,
-                    show_revision_and_timestamp
-                ],
-                leaderboard_table_long_doc,
+            set_listeners(
+                "long-doc",
+                leaderboard_table,
+                hidden_leaderboard_table_for_search,
+                search_bar,
+                select_noreranker_only_btn,
+                selected_domains,
+                selected_langs,
+                selected_rerankings,
+                show_anonymous,
+                show_revision_and_timestamp,
            )
 
-            # Set column-wise listener
-            for selector in [
-                selected_domains, selected_langs, show_anonymous, show_revision_and_timestamp, selected_rerankings
-            ]:
-                selector.change(
-                    update_table_long_doc,
-                    [
-                        hidden_leaderboard_table_for_search,
-                        selected_domains,
-                        selected_langs,
-                        selected_rerankings,
-                        search_bar,
-                        show_anonymous,
-                        show_revision_and_timestamp
-                    ],
-                    leaderboard_table_long_doc,
-                    queue=True,
-                )
-
             # set metric listener
             selected_metric.change(
                 update_metric_long_doc,
@@ -364,15 +211,10 @@ with demo:
                     show_anonymous,
                     show_revision_and_timestamp
                 ],
-                leaderboard_table_long_doc,
+                leaderboard_table,
                 queue=True
            )
 
-            select_noreranker_only_btn.click(
-                clear_reranking_selections,
-                outputs=selected_rerankings
-            )
-
         with gr.TabItem("🚀Submit here!", elem_id="submit-tab-table", id=2):
             with gr.Column():
                 with gr.Row():
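
Note: the refactored app.py imports set_listeners from src.display.gradio_listener, but that module is not among the three files changed here. A minimal sketch of what it presumably contains, reconstructed from the inline listener wiring deleted above — the function name and argument order come from the two call sites; the body itself is an assumption:

# src/display/gradio_listener.py -- hypothetical reconstruction, not part of this diff
from src.utils import update_table, update_table_long_doc, clear_reranking_selections


def set_listeners(
    task,                      # "qa" or "long-doc", as passed at the call sites
    displayed_leaderboard,     # visible leaderboard table
    hidden_leaderboard,        # hidden copy used as the search source
    search_bar,
    select_noreranker_only_btn,
    selected_domains,
    selected_langs,
    selected_rerankings,
    show_anonymous,
    show_revision_and_timestamp,
):
    # pick the table-update function the deleted inline code used for this tab
    if task == "qa":
        update_table_func = update_table
    elif task == "long-doc":
        update_table_func = update_table_long_doc
    else:
        raise NotImplementedError(f"Unknown task: {task}")

    inputs = [
        hidden_leaderboard,
        selected_domains,
        selected_langs,
        selected_rerankings,
        search_bar,
        show_anonymous,
        show_revision_and_timestamp,
    ]

    # search-bar listener: refresh the table when the user presses ENTER
    search_bar.submit(update_table_func, inputs, displayed_leaderboard)

    # column-wise listeners: refresh the table whenever a selector changes
    for selector in [
        selected_domains, selected_langs, show_anonymous,
        show_revision_and_timestamp, selected_rerankings,
    ]:
        selector.change(update_table_func, inputs, displayed_leaderboard, queue=True)

    # the "no reranker" button clears the reranking selections
    select_noreranker_only_btn.click(clear_reranking_selections, outputs=selected_rerankings)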
src/display/gradio_formatting.py ADDED
@@ -0,0 +1,92 @@
+import gradio as gr
+
+
+def get_version_dropdown():
+    return gr.Dropdown(
+        choices=["AIR-Bench_24.04"],
+        value="AIR-Bench_24.04",
+        label="Select the version of AIR-Bench",
+        interactive=True
+    )
+
+
+def get_search_bar():
+    return gr.Textbox(
+        placeholder=" 🔍 Search for retrieval methods (separate multiple queries with `;`) and press ENTER...",
+        show_label=False,
+        # elem_id="search-bar",
+        info="Search the retrieval methods"
+    )
+
+
+def get_reranking_dropdown(model_list):
+    return gr.Dropdown(
+        choices=model_list,
+        label="Select the reranking models",
+        # elem_id="reranking-select",
+        interactive=True,
+        multiselect=True
+    )
+
+
+def get_noreranker_button():
+    return gr.Button(
+        value="Only show results without ranking models",
+    )
+
+
+def get_metric_dropdown(metric_list, default_metrics):
+    return gr.Dropdown(
+        choices=metric_list,
+        value=default_metrics,
+        label="Select the metric",
+        interactive=True,
+        # elem_id="metric-select-long-doc",
+    )
+
+
+def get_domain_dropdown(domain_list, default_domains):
+    return gr.CheckboxGroup(
+        choices=domain_list,
+        value=default_domains,
+        label="Select the domains",
+        # elem_id="domain-column-select",
+        interactive=True,
+    )
+
+
+def get_language_dropdown(language_list, default_languages):
+    return gr.Dropdown(
+        choices=language_list,
+        value=default_languages,
+        label="Select the languages",
+        # elem_id="language-column-select",
+        multiselect=True,
+        interactive=True
+    )
+
+
+def get_anonymous_checkbox():
+    return gr.Checkbox(
+        label="Show anonymous submissions",
+        value=False,
+        info="The anonymous submissions might have invalid model information."
+    )
+
+
+def get_revision_and_ts_checkbox():
+    return gr.Checkbox(
+        label="Show submission details",
+        value=False,
+        info="Show the revision and timestamp information of submissions"
+    )
+
+
+def get_leaderboard_table(df, datatype, visible=True):
+    return gr.components.Dataframe(
+        value=df,
+        datatype=datatype,
+        elem_id="leaderboard-table",
+        interactive=False,
+        visible=visible,
+    )
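
For reference, the new factory helpers can be exercised in isolation with a quick script like the following (hypothetical example, not part of this PR; the metric, domain, and column values are made up):

import gradio as gr
import pandas as pd

from src.display.gradio_formatting import get_metric_dropdown, get_domain_dropdown, get_leaderboard_table

# components attach to the page when created inside a Blocks context
with gr.Blocks() as demo:
    selected_metric = get_metric_dropdown(["ndcg_at_10", "recall_at_10"], "ndcg_at_10")
    selected_domains = get_domain_dropdown(["wiki", "web", "news"], ["wiki", "web", "news"])
    table = get_leaderboard_table(pd.DataFrame({"Retrieval Method": ["bm25"]}), ["str"])

demo.launch()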
src/display/utils.py CHANGED
@@ -90,4 +90,4 @@ COLS_LITE = [c.name for c in fields(AutoEvalColumnQA) if c.displayed_by_default
 
 QA_BENCHMARK_COLS = [t.value.col_name for t in BenchmarksQA]
 
-LONG_DOC_BENCHMARK_COLS = [t.value.col_name for t in BenchmarksLongDoc]
\ No newline at end of file
+LONG_DOC_BENCHMARK_COLS = [t.value.col_name for t in BenchmarksLongDoc]