Spaces:
Sleeping
Sleeping
Add description of Tasks
Browse files
- app.py +30 -1
- src/constants.py +10 -11
- src/details.py +10 -1
app.py
CHANGED
@@ -4,7 +4,7 @@ import gradio as gr
|
|
4 |
|
5 |
from src.constants import SUBTASKS, TASKS
|
6 |
from src.details import update_subtasks_component, update_load_details_component, load_details_dataframes, \
|
7 |
-
display_details, update_sample_idx_component, clear_details
|
8 |
from src.results import update_load_results_component, \
|
9 |
load_results_dataframes, display_results, update_tasks_component, clear_results, \
|
10 |
sort_result_paths_per_model, fetch_result_paths
|
@@ -40,6 +40,11 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
|
|
40 |
value="All",
|
41 |
visible=False,
|
42 |
)
|
|
|
|
|
|
|
|
|
|
|
43 |
results = gr.HTML()
|
44 |
with gr.Tab("Configs"):
|
45 |
load_configs_btn = gr.Button("Load", interactive=False)
|
@@ -51,6 +56,11 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
|
|
51 |
value="All",
|
52 |
visible=False,
|
53 |
)
|
|
|
|
|
|
|
|
|
|
|
54 |
configs = gr.HTML()
|
55 |
with gr.Tab("Details"):
|
56 |
details_task = gr.Radio(
|
@@ -59,6 +69,10 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
|
|
59 |
info="Evaluation tasks to be loaded",
|
60 |
interactive=True,
|
61 |
)
|
|
|
|
|
|
|
|
|
62 |
subtask = gr.Radio(
|
63 |
SUBTASKS.get(details_task.value),
|
64 |
label="Subtasks",
|
@@ -95,6 +109,17 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
|
|
95 |
# Synchronize the results_task and configs_task radio buttons
|
96 |
results_task.input(fn=lambda task: task, inputs=results_task, outputs=configs_task)
|
97 |
configs_task.input(fn=lambda task: task, inputs=configs_task, outputs=results_task)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
gr.on(
|
99 |
triggers=[dataframe_1.change, dataframe_2.change, results_task.change],
|
100 |
fn=display_results,
|
@@ -108,6 +133,10 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
|
|
108 |
)
|
109 |
|
110 |
details_task.change(
|
|
|
|
|
|
|
|
|
111 |
fn=update_subtasks_component,
|
112 |
inputs=details_task,
|
113 |
outputs=subtask,
|
|
|
4 |
|
5 |
from src.constants import SUBTASKS, TASKS
|
6 |
from src.details import update_subtasks_component, update_load_details_component, load_details_dataframes, \
|
7 |
+
display_details, update_sample_idx_component, clear_details, update_task_description_component
|
8 |
from src.results import update_load_results_component, \
|
9 |
load_results_dataframes, display_results, update_tasks_component, clear_results, \
|
10 |
sort_result_paths_per_model, fetch_result_paths
|
|
|
40 |
value="All",
|
41 |
visible=False,
|
42 |
)
|
43 |
+
results_task_description = gr.Textbox(
|
44 |
+
label="Task Description",
|
45 |
+
lines=3,
|
46 |
+
visible=False,
|
47 |
+
)
|
48 |
results = gr.HTML()
|
49 |
with gr.Tab("Configs"):
|
50 |
load_configs_btn = gr.Button("Load", interactive=False)
|
|
|
56 |
value="All",
|
57 |
visible=False,
|
58 |
)
|
59 |
+
configs_task_description = gr.Textbox(
|
60 |
+
label="Task Description",
|
61 |
+
lines=3,
|
62 |
+
visible=False,
|
63 |
+
)
|
64 |
configs = gr.HTML()
|
65 |
with gr.Tab("Details"):
|
66 |
details_task = gr.Radio(
|
|
|
69 |
info="Evaluation tasks to be loaded",
|
70 |
interactive=True,
|
71 |
)
|
72 |
+
details_task_description = gr.Textbox(
|
73 |
+
label="Task Description",
|
74 |
+
lines=3,
|
75 |
+
)
|
76 |
subtask = gr.Radio(
|
77 |
SUBTASKS.get(details_task.value),
|
78 |
label="Subtasks",
|
|
|
109 |
# Synchronize the results_task and configs_task radio buttons
|
110 |
results_task.input(fn=lambda task: task, inputs=results_task, outputs=configs_task)
|
111 |
configs_task.input(fn=lambda task: task, inputs=configs_task, outputs=results_task)
|
112 |
+
# Update task descriptions
|
113 |
+
results_task.change(
|
114 |
+
fn=update_task_description_component,
|
115 |
+
inputs=results_task,
|
116 |
+
outputs=results_task_description,
|
117 |
+
).then(
|
118 |
+
fn=update_task_description_component,
|
119 |
+
inputs=results_task,
|
120 |
+
outputs=configs_task_description,
|
121 |
+
)
|
122 |
+
# Display results
|
123 |
gr.on(
|
124 |
triggers=[dataframe_1.change, dataframe_2.change, results_task.change],
|
125 |
fn=display_results,
|
|
|
133 |
)
|
134 |
|
135 |
details_task.change(
|
136 |
+
fn=update_task_description_component,
|
137 |
+
inputs=details_task,
|
138 |
+
outputs=details_task_description,
|
139 |
+
).then(
|
140 |
fn=update_subtasks_component,
|
141 |
inputs=details_task,
|
142 |
outputs=subtask,
|
src/constants.py
CHANGED
@@ -1,15 +1,4 @@
|
|
1 |
RESULTS_DATASET_ID = "datasets/open-llm-leaderboard/results"
|
2 |
-
# EXCLUDED_KEYS = {
|
3 |
-
# "pretty_env_info",
|
4 |
-
# "chat_template",
|
5 |
-
# "group_subtasks",
|
6 |
-
# }
|
7 |
-
# EXCLUDED_RESULTS_KEYS = {
|
8 |
-
# "leaderboard",
|
9 |
-
# }
|
10 |
-
# EXCLUDED_RESULTS_LEADERBOARDS_KEYS = {
|
11 |
-
# "alias",
|
12 |
-
# }
|
13 |
|
14 |
DETAILS_DATASET_ID = "datasets/open-llm-leaderboard/{model_name_sanitized}-details"
|
15 |
DETAILS_FILENAME = "samples_{subtask}_*.json"
|
@@ -72,3 +61,13 @@ SUBTASKS = {
|
|
72 |
"leaderboard_musr_team_allocation",
|
73 |
],
|
74 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
RESULTS_DATASET_ID = "datasets/open-llm-leaderboard/results"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
DETAILS_DATASET_ID = "datasets/open-llm-leaderboard/{model_name_sanitized}-details"
|
4 |
DETAILS_FILENAME = "samples_{subtask}_*.json"
|
|
|
61 |
"leaderboard_musr_team_allocation",
|
62 |
],
|
63 |
}
|
64 |
+
|
65 |
+
TASK_DESCRIPTIONS = {
|
66 |
+
"leaderboard_bbh": "BBH is a subset of 23 challenging tasks from the BigBench dataset to evaluate language models. The tasks use objective metrics, are highly difficult, and have sufficient sample sizes for statistical significance. They include multistep arithmetic, algorithmic reasoning (e.g., boolean expressions, SVG shapes), language understanding (e.g., sarcasm detection, name disambiguation), and world knowledge. BBH performance correlates well with human preferences, providing valuable insights into model capabilities.",
|
67 |
+
"leaderboard_gpqa": "GPQA is a highly challenging knowledge dataset with questions crafted by PhD-level domain experts in fields like biology, physics, and chemistry. These questions are designed to be difficult for laypersons but relatively easy for experts. The dataset has undergone multiple rounds of validation to ensure both difficulty and factual accuracy. Access to GPQA is restricted through gating mechanisms to minimize the risk of data contamination. Consequently, we do not provide plain text examples from this dataset, as requested by the authors.",
|
68 |
+
"leaderboard_ifeval": "IFEval is a dataset designed to test a model’s ability to follow explicit instructions, such as “include keyword x” or “use format y.” The focus is on the model’s adherence to formatting instructions rather than the content generated, allowing for the use of strict and rigorous metrics.",
|
69 |
+
# "leaderboard_math_hard": "MATH is a compilation of high-school level competition problems gathered from several sources, formatted consistently using Latex for equations and Asymptote for figures. Generations must fit a very specific output format. We keep only level 5 MATH questions and call it MATH Lvl 5.",
|
70 |
+
"leaderboard_math": "MATH is a compilation of high-school level competition problems gathered from several sources, formatted consistently using Latex for equations and Asymptote for figures. Generations must fit a very specific output format. We keep only level 5 MATH questions and call it MATH Lvl 5.",
|
71 |
+
"leaderboard_mmlu_pro": "MMLU-Pro is a refined version of the MMLU dataset, which has been a standard for multiple-choice knowledge assessment. Recent research identified issues with the original MMLU, such as noisy data (some unanswerable questions) and decreasing difficulty due to advances in model capabilities and increased data contamination. MMLU-Pro addresses these issues by presenting models with 10 choices instead of 4, requiring reasoning on more questions, and undergoing expert review to reduce noise. As a result, MMLU-Pro is of higher quality and currently more challenging than the original.",
|
72 |
+
"leaderboard_musr": "MuSR is a new dataset consisting of algorithmically generated complex problems, each around 1,000 words in length. The problems include murder mysteries, object placement questions, and team allocation optimizations. Solving these problems requires models to integrate reasoning with long-range context parsing. Few models achieve better than random performance on this dataset.",
|
73 |
+
}
|
src/details.py
CHANGED
@@ -4,10 +4,19 @@ import gradio as gr
|
|
4 |
import pandas as pd
|
5 |
from huggingface_hub import HfFileSystem
|
6 |
|
7 |
-
from src.constants import SUBTASKS, DETAILS_DATASET_ID, DETAILS_FILENAME
|
8 |
from src.hub import load_details_file
|
9 |
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
def update_subtasks_component(task):
|
12 |
return gr.Radio(
|
13 |
SUBTASKS.get(task),
|
|
|
4 |
import pandas as pd
|
5 |
from huggingface_hub import HfFileSystem
|
6 |
|
7 |
+
from src.constants import SUBTASKS, DETAILS_DATASET_ID, DETAILS_FILENAME, TASK_DESCRIPTIONS
|
8 |
from src.hub import load_details_file
|
9 |
|
10 |
|
11 |
+
def update_task_description_component(task):
|
12 |
+
return gr.Textbox(
|
13 |
+
TASK_DESCRIPTIONS.get(task),
|
14 |
+
label="Task Description",
|
15 |
+
lines=3,
|
16 |
+
visible=True,
|
17 |
+
)
|
18 |
+
|
19 |
+
|
20 |
def update_subtasks_component(task):
|
21 |
return gr.Radio(
|
22 |
SUBTASKS.get(task),
|