albertvillanova HF staff committed on
Commit
611a3ed
·
verified ·
1 Parent(s): e3edf6d
Files changed (5) hide show
  1. app.py +43 -14
  2. src/constants.py +1 -1
  3. src/details.py +9 -4
  4. src/hub.py +2 -1
  5. src/results.py +18 -8
app.py CHANGED
@@ -3,12 +3,27 @@ from functools import partial
3
  import gradio as gr
4
 
5
  import src.constants as constants
6
- from src.details import update_subtasks_component, update_load_details_component, load_details_dataframes, \
7
- display_details, update_sample_idx_component, clear_details, update_task_description_component, \
8
- display_loading_message_for_details
9
- from src.results import update_load_results_component, \
10
- load_results_dataframes, display_results, update_tasks_component, clear_results, \
11
- sort_result_paths_per_model, fetch_result_paths, display_loading_message_for_results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  # if __name__ == "__main__":
14
  result_paths_per_model = sort_result_paths_per_model(fetch_result_paths())
@@ -67,7 +82,7 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
67
  configs = gr.HTML()
68
  with gr.Tab("Details"):
69
  details_task = gr.Radio(
70
- list(value for value in constants.TASKS.values() if value[1] != "leaderboard_gpqa"),
71
  label="Tasks",
72
  info="Evaluation tasks to be loaded",
73
  interactive=True,
@@ -84,11 +99,7 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
84
  load_details_btn = gr.Button("Load Details", interactive=False)
85
  clear_details_btn = gr.Button("Clear Details")
86
  sample_idx = gr.Number(
87
- label="Sample Index",
88
- info="Index of the sample to be displayed",
89
- value=0,
90
- minimum=0,
91
- visible=False
92
  )
93
  details = gr.HTML()
94
  details_dataframe_1 = gr.Dataframe(visible=False)
@@ -135,7 +146,16 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
135
  gr.on(
136
  triggers=[clear_results_btn.click, clear_configs_btn.click],
137
  fn=clear_results,
138
- outputs=[model_id_1, model_id_2, dataframe_1, dataframe_2, load_results_btn, load_configs_btn, results_task, configs_task],
 
 
 
 
 
 
 
 
 
139
  )
140
 
141
  # DETAILS:
@@ -174,7 +194,16 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
174
  )
175
  clear_details_btn.click(
176
  fn=clear_details,
177
- outputs=[model_id_1, model_id_2, details_dataframe_1, details_dataframe_2, details_task, subtask, load_details_btn, sample_idx],
 
 
 
 
 
 
 
 
 
178
  )
179
 
180
  demo.launch()
 
3
  import gradio as gr
4
 
5
  import src.constants as constants
6
+ from src.details import (
7
+ clear_details,
8
+ display_details,
9
+ display_loading_message_for_details,
10
+ load_details_dataframes,
11
+ update_load_details_component,
12
+ update_sample_idx_component,
13
+ update_subtasks_component,
14
+ update_task_description_component,
15
+ )
16
+ from src.results import (
17
+ clear_results,
18
+ display_loading_message_for_results,
19
+ display_results,
20
+ fetch_result_paths,
21
+ load_results_dataframes,
22
+ sort_result_paths_per_model,
23
+ update_load_results_component,
24
+ update_tasks_component,
25
+ )
26
+
27
 
28
  # if __name__ == "__main__":
29
  result_paths_per_model = sort_result_paths_per_model(fetch_result_paths())
 
82
  configs = gr.HTML()
83
  with gr.Tab("Details"):
84
  details_task = gr.Radio(
85
+ [value for value in constants.TASKS.values() if value[1] != "leaderboard_gpqa"],
86
  label="Tasks",
87
  info="Evaluation tasks to be loaded",
88
  interactive=True,
 
99
  load_details_btn = gr.Button("Load Details", interactive=False)
100
  clear_details_btn = gr.Button("Clear Details")
101
  sample_idx = gr.Number(
102
+ label="Sample Index", info="Index of the sample to be displayed", value=0, minimum=0, visible=False
 
 
 
 
103
  )
104
  details = gr.HTML()
105
  details_dataframe_1 = gr.Dataframe(visible=False)
 
146
  gr.on(
147
  triggers=[clear_results_btn.click, clear_configs_btn.click],
148
  fn=clear_results,
149
+ outputs=[
150
+ model_id_1,
151
+ model_id_2,
152
+ dataframe_1,
153
+ dataframe_2,
154
+ load_results_btn,
155
+ load_configs_btn,
156
+ results_task,
157
+ configs_task,
158
+ ],
159
  )
160
 
161
  # DETAILS:
 
194
  )
195
  clear_details_btn.click(
196
  fn=clear_details,
197
+ outputs=[
198
+ model_id_1,
199
+ model_id_2,
200
+ details_dataframe_1,
201
+ details_dataframe_2,
202
+ details_task,
203
+ subtask,
204
+ load_details_btn,
205
+ sample_idx,
206
+ ],
207
  )
208
 
209
  demo.launch()
src/constants.py CHANGED
@@ -70,4 +70,4 @@ TASK_DESCRIPTIONS = {
70
  "leaderboard_math": "MATH is a compilation of high-school level competition problems gathered from several sources, formatted consistently using Latex for equations and Asymptote for figures. Generations must fit a very specific output format. We keep only level 5 MATH questions and call it MATH Lvl 5.",
71
  "leaderboard_mmlu_pro": "MMLU-Pro is a refined version of the MMLU dataset, which has been a standard for multiple-choice knowledge assessment. Recent research identified issues with the original MMLU, such as noisy data (some unanswerable questions) and decreasing difficulty due to advances in model capabilities and increased data contamination. MMLU-Pro addresses these issues by presenting models with 10 choices instead of 4, requiring reasoning on more questions, and undergoing expert review to reduce noise. As a result, MMLU-Pro is of higher quality and currently more challenging than the original.",
72
  "leaderboard_musr": "MuSR is a new dataset consisting of algorithmically generated complex problems, each around 1,000 words in length. The problems include murder mysteries, object placement questions, and team allocation optimizations. Solving these problems requires models to integrate reasoning with long-range context parsing. Few models achieve better than random performance on this dataset.",
73
- }
 
70
  "leaderboard_math": "MATH is a compilation of high-school level competition problems gathered from several sources, formatted consistently using Latex for equations and Asymptote for figures. Generations must fit a very specific output format. We keep only level 5 MATH questions and call it MATH Lvl 5.",
71
  "leaderboard_mmlu_pro": "MMLU-Pro is a refined version of the MMLU dataset, which has been a standard for multiple-choice knowledge assessment. Recent research identified issues with the original MMLU, such as noisy data (some unanswerable questions) and decreasing difficulty due to advances in model capabilities and increased data contamination. MMLU-Pro addresses these issues by presenting models with 10 choices instead of 4, requiring reasoning on more questions, and undergoing expert review to reduce noise. As a result, MMLU-Pro is of higher quality and currently more challenging than the original.",
72
  "leaderboard_musr": "MuSR is a new dataset consisting of algorithmically generated complex problems, each around 1,000 words in length. The problems include murder mysteries, object placement questions, and team allocation optimizations. Solving these problems requires models to integrate reasoning with long-range context parsing. Few models achieve better than random performance on this dataset.",
73
+ }
src/details.py CHANGED
@@ -67,6 +67,7 @@ def display_details(sample_idx, *dfs):
67
  return
68
  # Pop model_name and add it to the column name
69
  df = pd.concat([row.rename(row.pop("model_name")) for row in rows], axis="columns")
 
70
  # Wrap long strings to avoid overflow; e.g. URLs in "doc.Websites visited_NEV_2"
71
  def wrap(row):
72
  try:
@@ -78,8 +79,7 @@ def display_details(sample_idx, *dfs):
78
  df = df.apply(wrap, axis=1)
79
  # Style
80
  return (
81
- df.style
82
- .format(escape="html", na_rep="")
83
  # .hide(axis="index")
84
  .to_html()
85
  )
@@ -100,9 +100,14 @@ def update_sample_idx_component(*dfs):
100
  def clear_details():
101
  # model_id_1, model_id_2, details_dataframe_1, details_dataframe_2, details_task, subtask, load_details_btn, sample_idx
102
  return (
103
- None, None, None, None, None, None,
 
 
 
 
 
104
  gr.Button("Load Details", interactive=False),
105
- gr.Number(label="Sample Index", info="Index of the sample to be displayed", value=0, minimum=0,visible=False),
106
  )
107
 
108
 
 
67
  return
68
  # Pop model_name and add it to the column name
69
  df = pd.concat([row.rename(row.pop("model_name")) for row in rows], axis="columns")
70
+
71
  # Wrap long strings to avoid overflow; e.g. URLs in "doc.Websites visited_NEV_2"
72
  def wrap(row):
73
  try:
 
79
  df = df.apply(wrap, axis=1)
80
  # Style
81
  return (
82
+ df.style.format(escape="html", na_rep="")
 
83
  # .hide(axis="index")
84
  .to_html()
85
  )
 
100
  def clear_details():
101
  # model_id_1, model_id_2, details_dataframe_1, details_dataframe_2, details_task, subtask, load_details_btn, sample_idx
102
  return (
103
+ None,
104
+ None,
105
+ None,
106
+ None,
107
+ None,
108
+ None,
109
  gr.Button("Load Details", interactive=False),
110
+ gr.Number(label="Sample Index", info="Index of the sample to be displayed", value=0, minimum=0, visible=False),
111
  )
112
 
113
 
src/hub.py CHANGED
@@ -1,8 +1,9 @@
 
 
1
  import httpx
2
  from huggingface_hub import hf_hub_url
3
  from huggingface_hub.utils import build_hf_headers
4
 
5
- import json
6
 
7
  client = httpx.AsyncClient()
8
 
 
1
+ import json
2
+
3
  import httpx
4
  from huggingface_hub import hf_hub_url
5
  from huggingface_hub.utils import build_hf_headers
6
 
 
7
 
8
  client = httpx.AsyncClient()
9
 
src/results.py CHANGED
@@ -20,13 +20,13 @@ def sort_result_paths_per_model(paths):
20
 
21
  d = defaultdict(list)
22
  for path in paths:
23
- model_id, _ = path[len(constants.RESULTS_DATASET_ID) + 1:].rsplit("/", 1)
24
  d[model_id].append(path)
25
  return {model_id: sorted(paths) for model_id, paths in d.items()}
26
 
27
 
28
  def update_load_results_component():
29
- return (gr.Button("Load", interactive=True), ) * 2
30
 
31
 
32
  async def load_results_dataframe(model_id, result_paths_per_model=None):
@@ -45,7 +45,9 @@ async def load_results_dataframe(model_id, result_paths_per_model=None):
45
 
46
 
47
  async def load_results_dataframes(*model_ids, result_paths_per_model=None):
48
- result = await asyncio.gather(*[load_results_dataframe(model_id, result_paths_per_model) for model_id in model_ids])
 
 
49
  return result
50
 
51
 
@@ -68,7 +70,11 @@ def display_tab(tab, df, task):
68
  not row.startswith(f"{tab}.")
69
  or row.startswith(f"{tab}.leaderboard.")
70
  or row.endswith(".alias")
71
- or (not row.startswith(f"{tab}.{task}") if task != "All" else row.startswith(f"{tab}.leaderboard_arc_challenge"))
 
 
 
 
72
  )
73
  ],
74
  axis="index",
@@ -94,8 +100,11 @@ def update_tasks_component():
94
  def clear_results():
95
  # model_id_1, model_id_2, dataframe_1, dataframe_2, load_results_btn, load_configs_btn, results_task, configs_task
96
  return (
97
- None, None, None, None,
98
- *(gr.Button("Load", interactive=False), ) * 2,
 
 
 
99
  *(
100
  gr.Radio(
101
  ["All"] + list(constants.TASKS.values()),
@@ -104,7 +113,8 @@ def clear_results():
104
  value="All",
105
  visible=False,
106
  ),
107
- ) * 2,
 
108
  )
109
 
110
 
@@ -116,4 +126,4 @@ def highlight_min_max(s):
116
 
117
 
118
  def display_loading_message_for_results():
119
- return ("<h3 style='text-align: center;'>Loading...</h3>", ) * 2
 
20
 
21
  d = defaultdict(list)
22
  for path in paths:
23
+ model_id, _ = path[len(constants.RESULTS_DATASET_ID) + 1 :].rsplit("/", 1)
24
  d[model_id].append(path)
25
  return {model_id: sorted(paths) for model_id, paths in d.items()}
26
 
27
 
28
  def update_load_results_component():
29
+ return (gr.Button("Load", interactive=True),) * 2
30
 
31
 
32
  async def load_results_dataframe(model_id, result_paths_per_model=None):
 
45
 
46
 
47
  async def load_results_dataframes(*model_ids, result_paths_per_model=None):
48
+ result = await asyncio.gather(
49
+ *[load_results_dataframe(model_id, result_paths_per_model) for model_id in model_ids]
50
+ )
51
  return result
52
 
53
 
 
70
  not row.startswith(f"{tab}.")
71
  or row.startswith(f"{tab}.leaderboard.")
72
  or row.endswith(".alias")
73
+ or (
74
+ not row.startswith(f"{tab}.{task}")
75
+ if task != "All"
76
+ else row.startswith(f"{tab}.leaderboard_arc_challenge")
77
+ )
78
  )
79
  ],
80
  axis="index",
 
100
  def clear_results():
101
  # model_id_1, model_id_2, dataframe_1, dataframe_2, load_results_btn, load_configs_btn, results_task, configs_task
102
  return (
103
+ None,
104
+ None,
105
+ None,
106
+ None,
107
+ *(gr.Button("Load", interactive=False),) * 2,
108
  *(
109
  gr.Radio(
110
  ["All"] + list(constants.TASKS.values()),
 
113
  value="All",
114
  visible=False,
115
  ),
116
+ )
117
+ * 2,
118
  )
119
 
120
 
 
126
 
127
 
128
  def display_loading_message_for_results():
129
+ return ("<h3 style='text-align: center;'>Loading...</h3>",) * 2