Alex Jude KlaudiaTH committed on
Commit
6f17dc5
·
1 Parent(s): a200cc8

New leaderboard design (#19)

Browse files

* New Leaderboard Design: New design skeleton
* New Leaderboard Design: Removed unnecessary updates

---------

Co-authored-by: KlaudiaTH <KlaudiaTH@users.noreply.github.com>

Files changed (3) hide show
  1. app.py +132 -69
  2. core.py +0 -50
  3. style.py +94 -0
app.py CHANGED
@@ -14,8 +14,12 @@ with demo:
14
 
15
  selected_tab = gr.State(value=0)
16
 
17
- with gr.Column():
18
- with gr.Row():
 
 
 
 
19
  with gr.Column():
20
  with gr.Row():
21
  search_bar = gr.Textbox(
@@ -24,7 +28,6 @@ with demo:
24
  show_label=True,
25
  elem_id="search-bar",
26
  )
27
-
28
  model_types = gr.CheckboxGroup(
29
  label="Select model type",
30
  choices=[
@@ -36,6 +39,7 @@ with demo:
36
  ],
37
  value=list(T_SYMBOLS.values()),
38
  )
 
39
  with gr.Row():
40
  langs_bar = gr.CheckboxGroup(
41
  choices=[(LANG_SYMBOLS.get(l, l), l) for l in core.languages_list],
@@ -52,101 +56,160 @@ with demo:
52
  size="sm",
53
  scale=1,
54
  )
55
- select = gr.Button(value="Select all languages", size="sm", scale=1)
56
-
57
- def update_bar(selected_tab):
58
- if selected_tab in [0, 1]:
59
- choices = [(LANG_SYMBOLS.get(l, l), l) for l in core.languages_list]
60
- value = core.languages_list
61
- else:
62
- raise ValueError
63
- langs_bar = gr.CheckboxGroup(
64
- choices=choices,
65
- value=value,
66
- label="Select languages to average over",
67
- elem_id="column-select",
68
- interactive=True,
69
- )
70
- return langs_bar
71
-
72
- select.click(update_bar, inputs=[selected_tab], outputs=langs_bar)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  with gr.Row():
75
- shown_tasks = gr.CheckboxGroup(
76
- choices=[],
77
- value=[],
78
- label="Select tasks to show",
79
  elem_id="column-select",
80
  interactive=True,
81
- scale=50,
82
- )
83
- fewshot = gr.Radio(
84
- choices=[("0-Shot", False), ("Few-shot", True)],
85
- value=True,
86
- label="Select evaluation type",
87
- scale=29,
88
  )
89
- clear = gr.ClearButton(shown_tasks, value="Deselect all tasks", size="sm", scale=21)
90
-
91
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
92
- with gr.TabItem("🏅 LLM accuracy benchmark", elem_id="llm-benchmark-tab-table-acc", id=0) as acc:
93
- leaderboard_table = gr.Dataframe()
94
- with gr.TabItem(
95
- "🌐 LLM translation benchmark",
96
- elem_id="llm-benchmark-tab-table-misc",
97
- id=1,
98
- ) as misc:
99
- leaderboard_table_misc = gr.Dataframe()
100
-
101
- demo.load(
102
- core.update_task_groups_and_fewshot,
103
- [gr.State(value=0), model_types, langs_bar, fewshot],
104
- [shown_tasks, fewshot, selected_tab, model_types, langs_bar],
105
- )
106
- fewshot.change(
107
- core.update_task_groups_and_fewshot,
108
- [selected_tab, model_types, langs_bar, fewshot],
109
- [shown_tasks, fewshot, selected_tab, model_types, langs_bar],
110
- )
111
- acc.select(
112
- core.update_task_groups_and_fewshot,
113
- inputs=[gr.State(value=0), model_types, langs_bar, fewshot],
114
- outputs=[shown_tasks, fewshot, selected_tab, model_types, langs_bar],
115
- )
116
- misc.select(
117
- core.update_task_groups_and_fewshot,
118
- inputs=[gr.State(value=1), model_types, langs_bar, fewshot],
119
- outputs=[shown_tasks, fewshot, selected_tab, model_types, langs_bar],
120
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  for comp, fn in [
122
  (search_bar, "submit"),
123
  (langs_bar, "change"),
124
  (shown_tasks, "change"),
125
- (fewshot, "change"),
126
  (model_types, "change"),
127
  ]:
128
  getattr(comp, fn)(
129
  core.update_df,
130
- [shown_tasks, search_bar, langs_bar, model_types, fewshot],
131
  leaderboard_table,
132
  )
 
 
 
 
 
 
 
133
  getattr(comp, fn)(
134
  core.update_df,
135
- [shown_tasks, search_bar, langs_bar, model_types, fewshot],
136
  leaderboard_table_misc,
137
  )
138
 
139
  gr.Blocks.load(
140
  block=demo,
141
  fn=core.update_df,
142
- inputs=[shown_tasks, search_bar, langs_bar, model_types, fewshot],
143
  outputs=leaderboard_table,
144
  )
145
 
146
  gr.Blocks.load(
147
  block=demo,
148
  fn=core.update_df,
149
- inputs=[shown_tasks, search_bar, langs_bar, model_types, fewshot],
150
  outputs=leaderboard_table_misc,
151
  )
152
 
 
14
 
15
  selected_tab = gr.State(value=0)
16
 
17
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
18
+ with gr.TabItem(
19
+ "🏅 LLM accuracy benchmark",
20
+ elem_id="llm-benchmark-tab-table-acc",
21
+ id=0,
22
+ ) as acc:
23
  with gr.Column():
24
  with gr.Row():
25
  search_bar = gr.Textbox(
 
28
  show_label=True,
29
  elem_id="search-bar",
30
  )
 
31
  model_types = gr.CheckboxGroup(
32
  label="Select model type",
33
  choices=[
 
39
  ],
40
  value=list(T_SYMBOLS.values()),
41
  )
42
+
43
  with gr.Row():
44
  langs_bar = gr.CheckboxGroup(
45
  choices=[(LANG_SYMBOLS.get(l, l), l) for l in core.languages_list],
 
56
  size="sm",
57
  scale=1,
58
  )
59
+ select = gr.Button(
60
+ value="Select all languages",
61
+ size="sm",
62
+ scale=1,
63
+ )
64
+ select.click(
65
+ lambda: gr.CheckboxGroup(value=core.languages_list),
66
+ inputs=[],
67
+ outputs=langs_bar,
68
+ )
69
+
70
+ with gr.Row():
71
+ shown_tasks = gr.CheckboxGroup(
72
+ choices=core.get_available_task_groups(core.get_selected_task_type(0), True),
73
+ value=core.get_available_task_groups(core.get_selected_task_type(0), True),
74
+ label="Select tasks to show",
75
+ elem_id="column-select",
76
+ interactive=True,
77
+ scale=50,
78
+ )
79
+ clear = gr.ClearButton(
80
+ shown_tasks,
81
+ value="Deselect all tasks",
82
+ size="sm",
83
+ scale=1,
84
+ )
85
+ select = gr.Button(
86
+ value="Select all tasks",
87
+ size="sm",
88
+ scale=1,
89
+ )
90
+ select.click(
91
+ lambda: gr.CheckboxGroup(value=core.get_available_task_groups(core.get_selected_task_type(0), True)),
92
+ inputs=[],
93
+ outputs=shown_tasks,
94
+ )
95
+ leaderboard_table = gr.Dataframe()
96
+
97
+ with gr.TabItem(
98
+ "🌐 LLM translation benchmark",
99
+ elem_id="llm-benchmark-tab-table-misc",
100
+ id=1,
101
+ ) as misc:
102
+ with gr.Column():
103
+ with gr.Row():
104
+ search_bar_misc = gr.Textbox(
105
+ label="Search models",
106
+ placeholder=" 🔍 Separate multiple queries with ';' and press ENTER...",
107
+ show_label=True,
108
+ elem_id="search-bar",
109
+ )
110
+
111
+ model_types_misc = gr.CheckboxGroup(
112
+ label="Select model type",
113
+ choices=[
114
+ (
115
+ f"Pretrained {T_SYMBOLS['pretrained']}",
116
+ T_SYMBOLS["pretrained"],
117
+ ),
118
+ (f"Chat {T_SYMBOLS['chat']}", T_SYMBOLS["chat"]),
119
+ ],
120
+ value=list(T_SYMBOLS.values()),
121
+ )
122
 
123
  with gr.Row():
124
+ langs_bar_misc = gr.CheckboxGroup(
125
+ choices=[(LANG_SYMBOLS.get(l, l), l) for l in core.languages_list],
126
+ value=core.languages_list,
127
+ label="Select languages to average over",
128
  elem_id="column-select",
129
  interactive=True,
130
+ scale=6,
 
 
 
 
 
 
131
  )
132
+ with gr.Column(scale=1):
133
+ clear_misc = gr.ClearButton(
134
+ langs_bar_misc,
135
+ value="Deselect all languages",
136
+ size="sm",
137
+ scale=1,
138
+ )
139
+ select_misc = gr.Button(
140
+ value="Select all languages",
141
+ size="sm",
142
+ scale=1,
143
+ )
144
+ select_misc.click(
145
+ lambda: gr.CheckboxGroup(value=core.languages_list),
146
+ inputs=[],
147
+ outputs=langs_bar_misc,
148
+ )
149
+
150
+ with gr.Row():
151
+ shown_tasks_misc = gr.CheckboxGroup(
152
+ choices=core.get_available_task_groups(core.get_selected_task_type(1), False),
153
+ value=core.get_available_task_groups(core.get_selected_task_type(1), False),
154
+ label="Select tasks to show",
155
+ elem_id="column-select",
156
+ interactive=True,
157
+ scale=50,
158
+ )
159
+ clear_tasks_misc = gr.ClearButton(
160
+ shown_tasks_misc,
161
+ value="Deselect all tasks",
162
+ size="sm",
163
+ scale=1,
164
+ )
165
+ select_all_tasks_misc = gr.Button(
166
+ value="Select all tasks",
167
+ size="sm",
168
+ scale=1,
169
+ )
170
+ select_all_tasks_misc.click(
171
+ lambda: gr.CheckboxGroup(value=core.get_available_task_groups(core.get_selected_task_type(1), False)),
172
+ inputs=[],
173
+ outputs=shown_tasks_misc,
174
+ )
175
+
176
+ leaderboard_table_misc = gr.Dataframe()
177
+
178
  for comp, fn in [
179
  (search_bar, "submit"),
180
  (langs_bar, "change"),
181
  (shown_tasks, "change"),
 
182
  (model_types, "change"),
183
  ]:
184
  getattr(comp, fn)(
185
  core.update_df,
186
+ [shown_tasks, search_bar, langs_bar, model_types, gr.State(value=True)],
187
  leaderboard_table,
188
  )
189
+
190
+ for comp, fn in [
191
+ (search_bar_misc, "submit"),
192
+ (langs_bar_misc, "change"),
193
+ (shown_tasks_misc, "change"),
194
+ (model_types_misc, "change"),
195
+ ]:
196
  getattr(comp, fn)(
197
  core.update_df,
198
+ [shown_tasks_misc, search_bar_misc, langs_bar_misc, model_types_misc, gr.State(value=False)],
199
  leaderboard_table_misc,
200
  )
201
 
202
  gr.Blocks.load(
203
  block=demo,
204
  fn=core.update_df,
205
+ inputs=[shown_tasks, search_bar, langs_bar, model_types, gr.State(value=True)],
206
  outputs=leaderboard_table,
207
  )
208
 
209
  gr.Blocks.load(
210
  block=demo,
211
  fn=core.update_df,
212
+ inputs=[shown_tasks_misc, search_bar_misc, langs_bar_misc, model_types_misc, gr.State(value=False)],
213
  outputs=leaderboard_table_misc,
214
  )
215
 
core.py CHANGED
@@ -1,7 +1,6 @@
1
  import itertools
2
  import os
3
 
4
- import gradio as gr
5
  import numpy as np
6
  import pandas as pd
7
  from datasets import load_dataset
@@ -114,7 +113,6 @@ def update_df(
114
 
115
  # aggregate results over languages per task
116
  df = aggregate_langs(df, tasks, langs)
117
-
118
  df = df.sort_values(by="Average", ascending=False)
119
 
120
  # filter models by search bar and model type
@@ -127,54 +125,6 @@ def update_df(
127
  return sort_cols(df, fewshot)
128
 
129
 
130
- def update_task_groups_and_fewshot(
131
- current_selected_tab: int,
132
- model_types,
133
- langs_bar,
134
- is_fewshot_current: bool = False,
135
- ):
136
- selected_task_type = get_selected_task_type(current_selected_tab)
137
- available_tasks = get_available_task_groups(selected_task_type, is_fewshot_current)
138
- new_selected_tasks = available_tasks.copy()
139
-
140
- tasks_checkbox_group_update = gr.CheckboxGroup(
141
- choices=available_tasks,
142
- value=new_selected_tasks,
143
- )
144
-
145
- if current_selected_tab == 0:
146
- is_fewshot_new = is_fewshot_current
147
- fewshot_available = True
148
- elif current_selected_tab == 1:
149
- is_fewshot_new = False
150
- fewshot_available = False
151
-
152
- fewshot_radio_update = gr.Radio(
153
- value=is_fewshot_new,
154
- interactive=fewshot_available,
155
- )
156
-
157
- model_types = gr.CheckboxGroup(
158
- label="Select model type",
159
- choices=[
160
- (
161
- f"Pretrained {T_SYMBOLS['pretrained']}",
162
- T_SYMBOLS["pretrained"],
163
- ),
164
- (f"Chat {T_SYMBOLS['chat']}", T_SYMBOLS["chat"]),
165
- ],
166
- value=list(T_SYMBOLS.values()),
167
- interactive=True,
168
- )
169
- langs_bar = gr.CheckboxGroup(
170
- choices=[(LANG_SYMBOLS.get(l, l), l) for l in languages_list],
171
- value=languages_list,
172
- interactive=True,
173
- )
174
-
175
- return [tasks_checkbox_group_update, fewshot_radio_update, current_selected_tab, model_types, langs_bar]
176
-
177
-
178
  def get_selected_task_type(task_type_id):
179
  task_types = {0: "accuracy", 1: "misc"}
180
  selected_task_type = task_types[task_type_id]
 
1
  import itertools
2
  import os
3
 
 
4
  import numpy as np
5
  import pandas as pd
6
  from datasets import load_dataset
 
113
 
114
  # aggregate results over languages per task
115
  df = aggregate_langs(df, tasks, langs)
 
116
  df = df.sort_values(by="Average", ascending=False)
117
 
118
  # filter models by search bar and model type
 
125
  return sort_cols(df, fewshot)
126
 
127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  def get_selected_task_type(task_type_id):
129
  task_types = {0: "accuracy", 1: "misc"}
130
  selected_task_type = task_types[task_type_id]
style.py CHANGED
@@ -11,6 +11,100 @@ CSS = """
11
  }
12
  """
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  T_SYMBOLS = {"pretrained": "🟢", "chat": "💬"}
15
 
16
  LANG_SYMBOLS = {
 
11
  }
12
  """
13
 
14
+ OPEN_LLM_LEADERBOARD_CSS = """
15
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
16
+ table td:first-child,
17
+ table th:first-child {
18
+ max-width: 400px;
19
+ overflow: auto;
20
+ white-space: nowrap;
21
+ }
22
+ /* Full width space */
23
+ .gradio-container {
24
+ max-width: 95% !important;
25
+ }
26
+ /* Text style and margins */
27
+ .markdown-text {
28
+ font-size: 16px !important;
29
+ }
30
+ #models-to-add-text {
31
+ font-size: 18px !important;
32
+ }
33
+ #citation-button span {
34
+ font-size: 16px !important;
35
+ }
36
+ #citation-button textarea {
37
+ font-size: 16px !important;
38
+ }
39
+ #citation-button > label > button {
40
+ margin: 6px;
41
+ transform: scale(1.3);
42
+ }
43
+ #search-bar-table-box > div:first-child {
44
+ background: none;
45
+ border: none;
46
+ }
47
+ #search-bar {
48
+ padding: 0px;
49
+ }
50
+ .tab-buttons button {
51
+ font-size: 20px;
52
+ }
53
+ /* Filters style */
54
+ #filter_type {
55
+ border: 0;
56
+ padding-left: 0;
57
+ padding-top: 0;
58
+ }
59
+ #filter_type label {
60
+ display: flex;
61
+ }
62
+ #filter_type label > span {
63
+ margin-top: var(--spacing-lg);
64
+ margin-right: 0.5em;
65
+ }
66
+ #filter_type label > .wrap {
67
+ width: 103px;
68
+ }
69
+ #filter_type label > .wrap .wrap-inner {
70
+ padding: 2px;
71
+ }
72
+ #filter_type label > .wrap .wrap-inner input {
73
+ width: 1px;
74
+ }
75
+ #filter-columns-type {
76
+ border: 0;
77
+ padding: 0.5;
78
+ }
79
+ #filter-columns-size {
80
+ border: 0;
81
+ padding: 0.5;
82
+ }
83
+ #box-filter > .form {
84
+ border: 0;
85
+ }
86
+ /* Header styles */
87
+ #header-title {
88
+ text-align: left;
89
+ display: inline-block;
90
+ }
91
+ #header-row {
92
+ display: flex;
93
+ justify-content: space-between;
94
+ align-items: center;
95
+ }
96
+ #header-row .gradio-html {
97
+ flex-grow: 1;
98
+ }
99
+ #oauth-button {
100
+ height: auto;
101
+ min-width: max-content;
102
+ white-space: nowrap;
103
+ padding: 10px 20px;
104
+ border-radius: 4px;
105
+ }
106
+ """
107
+
108
  T_SYMBOLS = {"pretrained": "🟢", "chat": "💬"}
109
 
110
  LANG_SYMBOLS = {