modify paper names and paths to datasets
- README.md +5 -4
- _header.md +1 -2
- app.py +36 -36
- constants.py +31 -32
- eval_utils.py +1 -1
README.md CHANGED
@@ -10,12 +10,12 @@ pinned: true
 fullWidth: true
 hf_oauth: true
 api: false
-tags:
+tags:
 - leaderboard
-datasets:
+datasets:
 - allenai/ZebraLogicBench
--
-models:
+- WildEval/ZebraLogic
+models:
 - Qwen/Qwen2-72B-Instruct
 - Qwen/Qwen1.5-72B-Chat
 - Qwen/Qwen1.5-7B-Chat
@@ -58,3 +58,4 @@ models:
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 Paper: arxiv.org/abs/2406.04770
+Paper: arxiv.org/abs/2502.01100
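The frontmatter now lists a second dataset repo alongside allenai/ZebraLogicBench. Below is a minimal sketch of pulling both with the `datasets` library; the `grid_mode` config and `test` split are copied from the `load_dataset` call added in eval_utils.py further down, while using the same config name for the allenai repo is an assumption, and either repo may be gated and require requesting access on the Hub first.

```python
# Sketch only: load the two dataset repos referenced in the README frontmatter.
# "grid_mode" / "test" mirror the eval_utils.py change in this commit; the same
# config name for allenai/ZebraLogicBench is an assumption. Gated repos need
# prior `huggingface-cli login` and approved access.
from datasets import load_dataset

public_data = load_dataset("allenai/ZebraLogicBench", "grid_mode", split="test")
private_data = load_dataset("WildEval/ZebraLogic", "grid_mode", split="test")

print(len(public_data), public_data.column_names)
```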
_header.md CHANGED
@@ -1,6 +1,5 @@
 <br/>
 
-# 🦓 ZebraLogic:
+# 🦓 ZebraLogic: On the Scaling Limits of LLMs for Logical Reasoning
 <!-- [📑 FnF Paper](https://arxiv.org/abs/2305.18654) | -->
 [📰 Blog](https://huggingface.co/blog/yuchenlin/zebra-logic) [💻 GitHub](https://github.com/WildEval/ZeroEval) | [🤗 HuggingFace](https://huggingface.co/collections/allenai/zebra-logic-bench-6697137cbaad0b91e635e7b0) | [🐦 X](https://twitter.com/billyuchenlin/) | [💬 Discussion](https://huggingface.co/spaces/allenai/ZebraLogicBench-Leaderboard/discussions) | Updated: **{LAST_UPDATED}**
-
app.py CHANGED
@@ -12,16 +12,16 @@ import pandas as pd
 from pathlib import Path
 import json
 from constants import *
-from datetime import datetime, timezone
+from datetime import datetime, timezone
 # from datasets import Dataset, load_dataset, concatenate_datasets
-import os, uuid
+import os, uuid
 from utils_display import model_info
 from constants import column_names, LEADERBOARD_REMARKS, DEFAULT_K, LEADERBOARD_REMARKS_MAIN
 import pytz
 from data_utils import post_processing, get_random_item
 
 # get the last updated time from the elo_ranks.all.jsonl file
-LAST_UPDATED = None
+LAST_UPDATED = None
 # with open("_intro.md", "r") as f:
 # INTRO_MD = f.read()
 INTRO_MD = ""
@@ -33,11 +33,11 @@ with open("_header.md", "r") as f:
 
 with open("_metrics.md", "r") as f:
 METRICS_MD = f.read()
-
-raw_data = None
-original_df = None
+
+raw_data = None
+original_df = None
 # available_models = [] # to be filled in later
-available_models = list(model_info.keys())
+available_models = list(model_info.keys())
 
 def df_filters(mode_selection_radio, show_open_source_model_only):
 global original_df
@@ -59,19 +59,19 @@ def _gstr(text):
 
 def _tab_leaderboard():
 global original_df, available_models
-# with gr.TabItem("📊 Main", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
+# with gr.TabItem("📊 Main", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
 if True:
-default_main_df = original_df.copy()
+default_main_df = original_df.copy()
 # default_main_df.insert(0, "", range(1, 1 + len(default_main_df)))
-# default_main_df_no_task = default_main_df.copy()
+# default_main_df_no_task = default_main_df.copy()
 default_mode = "greedy"
 default_main_df = df_filters(default_mode, False)
-with gr.Row():
-with gr.Column(scale=5):
+with gr.Row():
+with gr.Column(scale=5):
 mode_selection_radio = gr.Radio(["greedy", "all"], show_label=False, elem_id="rank-column-radio", value=default_mode)
 # with gr.Row():
 # with gr.Column(scale=2):
-
+
 leaderboard_table = gr.components.Dataframe(
 value=default_main_df,
 datatype= ["number", "markdown", "markdown", "number"],
@@ -83,7 +83,7 @@ def _tab_leaderboard():
 column_widths=[50, 260, 100, 100, 120, 120, 100,100,110,100],
 wrap=True
 # min_width=60,
-)
+)
 # checkbox_show_task_categorized.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
 # show_open_source_model_only.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
 # rank_column_radio.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
@@ -121,14 +121,14 @@ def _tab_explore():
 # greedy_or_sample = gr.Radio(["greedy", "sampling"], show_label=False, elem_id="greedy-or-sample", value="greedy", interactive=True)
 gr.Markdown("### 🚀 Click below to sample a puzzle. ⬇️ ")
 explore_button = gr.Button("🦓 Sample a Zebra Puzzle!", elem_id="explore-button")
-
+
 puzzle_md = gr.Markdown("### 🦓 Puzzle: \n\nTo be loaded", elem_id="puzzle-md", elem_classes="box_md")
 model_reasoning_md = gr.Markdown("### 🤖 Reasoning: \n\nTo be loaded", elem_id="model-reasoning-md", elem_classes="box_md")
 model_prediction_md = gr.Markdown("### 💬 Answer: \n\nTo be loaded", elem_id="model-prediction-md", elem_classes="box_md")
 turht_solution_md = gr.Markdown("### ✅ Truth Solution: \n\nTo be loaded", elem_id="truth-solution-md", elem_classes="box_md")
 model_eval_md = gr.Markdown("### 🆚 Evaluation: \n\nTo be loaded", elem_id="model-eval-md", elem_classes="box_md")
-explore_button.click(fn=sample_explore_item,
-inputs=[model_selection, size_H_selection, size_W_selection],
+explore_button.click(fn=sample_explore_item,
+inputs=[model_selection, size_H_selection, size_W_selection],
 outputs=[puzzle_md, model_reasoning_md, model_prediction_md, model_eval_md, turht_solution_md])
 
 
@@ -136,8 +136,8 @@ def _tab_explore():
 def _tab_submit():
 markdown_text = """
 Please create an issue on our [Github](https://github.com/WildEval/ZeroEval/) repository to talk about your model. Then, we can test it for you and report the results here on the Leaderboard.
-If you would like to do local testing, please read our code [here](https://github.com/WildEval/ZeroEval/blob/main/src/evaluation/zebra_grid_eval.py)
-and apply for the access for the [private dataset](https://huggingface.co/datasets/
+If you would like to do local testing, please read our code [here](https://github.com/WildEval/ZeroEval/blob/main/src/evaluation/zebra_grid_eval.py)
+and apply for the access for the [private dataset](https://huggingface.co/datasets/WildEval/ZebraLogic) that contains the truth solutions.
 """
 
 gr.Markdown("## 🚀 Submit Your Results\n\n" + markdown_text, elem_classes="markdown-text")
@@ -149,33 +149,33 @@ def build_demo():
 
 with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
 gr.HTML(BANNER, elem_id="banner")
-# convert LAST_UPDATED to the PDT time
+# convert LAST_UPDATED to the PDT time
 LAST_UPDATED = datetime.now(pytz.timezone('US/Pacific')).strftime("%Y-%m-%d %H:%M:%S")
 header_md_text = HEADER_MD.replace("{LAST_UPDATED}", str(LAST_UPDATED))
-gr.Markdown(header_md_text, elem_classes="markdown-text")
+gr.Markdown(header_md_text, elem_classes="markdown-text")
 
-with gr.Tabs(elem_classes="tab-buttons") as tabs:
+with gr.Tabs(elem_classes="tab-buttons") as tabs:
 with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
-_tab_leaderboard()
+_tab_leaderboard()
 with gr.TabItem("🔍 Explore", elem_id="od-benchmark-tab-table", id=1):
 _tab_explore()
 with gr.TabItem("🚀 Submit Your Results", elem_id="od-benchmark-tab-table", id=3):
-_tab_submit()
+_tab_submit()
 
 with gr.TabItem("📮 About Us", elem_id="od-benchmark-tab-table", id=4):
 gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
-
+
 with gr.Row():
 with gr.Accordion("📙 Citation", open=False, elem_classes="accordion-label"):
 gr.Textbox(
-value=CITATION_TEXT,
+value=CITATION_TEXT,
 lines=7,
 label="Copy the BibTeX snippet to cite this source",
 elem_id="citation-button",
 show_copy_button=True)
 # ).style(show_copy_button=True)
 
-return demo
+return demo
 
 
 
@@ -184,11 +184,11 @@ def data_load(result_file):
 print(f"Loading {result_file}")
 column_names_main = column_names.copy()
 # column_names_main.update({})
-main_ordered_columns = ORDERED_COLUMN_NAMES
-# filter the data with Total Puzzles == 1000
-
-click_url = True
-# read json file from the result_file
+main_ordered_columns = ORDERED_COLUMN_NAMES
+# filter the data with Total Puzzles == 1000
+
+click_url = True
+# read json file from the result_file
 with open(result_file, "r") as f:
 raw_data = json.load(f)
 # floatify the data, if possible
@@ -201,16 +201,16 @@ def data_load(result_file):
 original_df = pd.DataFrame(raw_data)
 original_df = original_df[original_df["Total Puzzles"] == 1000]
 original_df = post_processing(original_df, column_names_main, ordered_columns=main_ordered_columns, click_url=click_url, rank_column=RANKING_COLUMN)
-# print(original_df.columns)
-
+# print(original_df.columns)
+
 
 if __name__ == "__main__":
 parser = argparse.ArgumentParser()
 parser.add_argument("--share", action="store_true")
 parser.add_argument("--result_file", help="Path to results table", default="ZeroEval-main/result_dirs/zebra-grid.summary.json")
-
+
 args = parser.parse_args()
-data_load(args.result_file)
+data_load(args.result_file)
 print(original_df)
 demo = build_demo()
 demo.launch(share=args.share, height=3000, width="100%")
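For context on the `data_load` hunks above: the function reads a summary JSON produced by ZeroEval, keeps only complete runs (`Total Puzzles == 1000`), and ranks rows by `Puzzle Acc`. Below is a rough standalone sketch of that flow; `data_utils.post_processing` (not part of this commit) is replaced by a plain column reorder and sort, so treat it as an illustration rather than the app's exact behavior.

```python
import json
import pandas as pd

# Display order taken from ORDERED_COLUMN_NAMES in constants.py.
ORDERED_COLUMN_NAMES = [
    "Model", "Mode", "Puzzle Acc", "Easy Puzzle Acc",
    "Hard Puzzle Acc", "Cell Acc", "No answer",
]

def load_summary(result_file: str) -> pd.DataFrame:
    """Rough equivalent of app.py's data_load(): keep complete runs and rank
    by puzzle accuracy. The real app also builds clickable model links via
    data_utils.post_processing, which is omitted here."""
    with open(result_file, "r") as f:
        raw_data = json.load(f)  # expected to be a list of per-model dicts
    df = pd.DataFrame(raw_data)
    df = df[df["Total Puzzles"] == 1000]
    if "Puzzle Acc" in df.columns:
        df = df.sort_values("Puzzle Acc", ascending=False)
    cols = [c for c in ORDERED_COLUMN_NAMES if c in df.columns]
    return df[cols].reset_index(drop=True)

# Example, using the default path from app.py's argument parser:
# print(load_summary("ZeroEval-main/result_dirs/zebra-grid.summary.json"))
```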
constants.py CHANGED
@@ -8,15 +8,15 @@ banner_url = "https://github.com/WildEval/ZeroEval/blob/main/docs/zebra/zebra_ba
 BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 70vw; min-width: 300px; max-width: 1000px;border: 3px solid gray; border-color: gray black;"> </div>'
 
 # TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"
-
+
 
 CITATION_TEXT = """
 
-@
-title={ZebraLogic:
-author={Bill Yuchen Lin and Ronan Le Bras and Peter Clark and Yejin Choi},
-
-
+@article{zebralogic2025,
+title={ZebraLogic: On the Scaling Limits of LLMs for Logical Reasoning},
+author={Bill Yuchen Lin and Ronan Le Bras and Kyle Richardson and Ashish Sabharwal and Radha Poovendran and Peter Clark and Yejin Choi},
+year={2025},
+url={https://arxiv.org/abs/2502.01100},
 }
 
 
@@ -27,15 +27,15 @@ CITATION_TEXT = """
 volume={36},
 year={2024}
 }
-
+
 """
 
 # make column_names as an ordered dict
-
+
 
 
 column_names = OrderedDict({
-"Model": "Model",
+"Model": "Model",
 "Mode": "Mode",
 "Puzzle Acc": "Puzzle Acc",
 "Cell Acc": "Cell Acc",
@@ -48,29 +48,29 @@ column_names = OrderedDict({
 
 
 
-LEADERBOARD_REMARKS = """**WB Reward**: for each comparison (A vs B), a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; when there is a **Tie**, the reward is **0**.
+LEADERBOARD_REMARKS = """**WB Reward**: for each comparison (A vs B), a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; when there is a **Tie**, the reward is **0**.
 """
 
 # **WB Reward**: for each pairwise comparison, a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; 0 for a **Tie**.
-# The baseline models are GPT4-Turbo, Haiku, and Llama2-70B, and Mix is the average of the three.
+# The baseline models are GPT4-Turbo, Haiku, and Llama2-70B, and Mix is the average of the three.
 # **WB Score** individually scores each model based on checklists.
 # Evaluator is GPT-4-Turbo.
-LEADERBOARD_REMARKS_MAIN = """
+LEADERBOARD_REMARKS_MAIN = """
 """
-
+
 RANKING_COLUMN = "Puzzle Acc"
 
 ORDERED_COLUMN_NAMES = [
-"Model",
+"Model",
 "Mode",
 "Puzzle Acc",
 "Easy Puzzle Acc",
 "Hard Puzzle Acc",
 "Cell Acc",
-"No answer",
+"No answer",
 ]
 
-
+
 js_light = """
 function refresh() {
 const url = new URL(window.location);
@@ -110,15 +110,15 @@ function refresh() {
 
 js_code = """
 function scroll_top() {
-console.log("Hello from Gradio!");
+console.log("Hello from Gradio!");
 const bubbles = document.querySelectorAll('.bubble-wrap');
 bubbles.forEach((bubble, index) => {
 setTimeout(() => {
 bubble.scrollTop = 0;
 }, index * 100); // Delay of 100ms between each iteration
 });
-
-}
+
+}
 """
 
 
@@ -126,7 +126,7 @@ TASK_TYPE_STR = "**Tasks**: Info seeking (**InfoSek**), Creative Writing (**CrtW
 
 css = """
 
-
+
 
 code {
 font-size: large;
@@ -179,17 +179,17 @@ td {
 .chat-common{
 height: auto;
 max-height: 400px;
-min-height: 100px;
+min-height: 100px;
 }
 .chat-specific{
 height: auto;
 max-height: 600px;
-min-height: 200px;
+min-height: 200px;
 }
 #od-benchmark-tab-table-button{
 font-size: 15pt;
 font-weight: bold;
-}
+}
 
 .btn_boderline{
 border: 1px solid #000000;
@@ -197,7 +197,7 @@ td {
 padding: 5px;
 margin: 5px;
 font-size: 15pt;
-font-weight: bold;
+font-weight: bold;
 }
 
 .btn_boderline_next{
@@ -206,7 +206,7 @@ td {
 padding: 5px;
 margin: 5px;
 font-size: 15pt;
-font-weight: bold;
+font-weight: bold;
 }
 
 .btn_boderline_gray{
@@ -215,7 +215,7 @@ td {
 padding: 5px;
 margin: 5px;
 font-size: 15pt;
-font-weight: italic;
+font-weight: italic;
 }
 .btn_boderline_selected{
 border: 2px solid purple;
@@ -224,12 +224,12 @@ td {
 padding: 5px;
 margin: 5px;
 font-size: 15pt;
-font-weight: bold;
+font-weight: bold;
 }
 .accordion-label button span{
 font-size: 14pt;
 font-weight: bold;
-}
+}
 
 #show-task-categorized span{
 font-size: 13pt;
@@ -269,7 +269,7 @@ button.selected[role="tab"][aria-selected="true"] {
 .plotly-plot{
 height: auto;
 max-height: 600px;
-min-height: 600px;
+min-height: 600px;
 }
 
 #length-margin-radio{
@@ -279,12 +279,12 @@ button.selected[role="tab"][aria-selected="true"] {
 }
 
 #show-task-categorized{
-font-size: 12pt;
+font-size: 12pt;
 font-decoration: bold;
 }
 
 #show-open-source-models{
-font-size: 12pt;
+font-size: 12pt;
 font-decoration: bold;
 }
 
@@ -296,4 +296,3 @@ button.selected[role="tab"][aria-selected="true"] {
 margin: 5px;
 }
 """
-
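Most of the constants.py hunk is cosmetic (citation text plus CSS/JS strings), but the column constants drive how the leaderboard table is built. The sketch below illustrates how a rename map plus a display order and ranking column are typically applied to a results DataFrame; the real logic lives in `data_utils.post_processing`, which this commit does not touch, so this is an assumption about its behavior rather than a copy of it.

```python
from collections import OrderedDict
import pandas as pd

# Subset of the constants defined above.
column_names = OrderedDict({
    "Model": "Model",
    "Mode": "Mode",
    "Puzzle Acc": "Puzzle Acc",
    "Cell Acc": "Cell Acc",
})
ORDERED_COLUMN_NAMES = ["Model", "Mode", "Puzzle Acc", "Cell Acc"]
RANKING_COLUMN = "Puzzle Acc"

def apply_column_constants(df: pd.DataFrame) -> pd.DataFrame:
    """Illustrative stand-in for data_utils.post_processing: rename raw
    columns, keep only the display columns, and sort by the ranking column."""
    df = df.rename(columns=column_names)
    df = df[[c for c in ORDERED_COLUMN_NAMES if c in df.columns]]
    return df.sort_values(RANKING_COLUMN, ascending=False).reset_index(drop=True)
```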
eval_utils.py CHANGED
@@ -8,7 +8,7 @@ private_solutions = {}
 
 def load_private_solutions():
 global private_solutions
-private_zebra_data = load_dataset("
+private_zebra_data = load_dataset("WildEval/ZebraLogic", "grid_mode", split="test")
 for item in private_zebra_data:
 private_solutions[item["id"]] = item["solution"]
 return
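The eval_utils.py change points the private-solution lookup at the WildEval/ZebraLogic repo. A short usage sketch follows; the `puzzle_accuracy` helper is hypothetical (the official scoring lives in ZeroEval's zebra_grid_eval.py) and simply treats a puzzle as solved when the predicted solution matches the stored one exactly.

```python
from datasets import load_dataset

private_solutions = {}

def load_private_solutions():
    # Same call as eval_utils.py above: map puzzle id -> ground-truth solution.
    global private_solutions
    private_zebra_data = load_dataset("WildEval/ZebraLogic", "grid_mode", split="test")
    for item in private_zebra_data:
        private_solutions[item["id"]] = item["solution"]

def puzzle_accuracy(predictions: dict) -> float:
    """Hypothetical scorer: fraction of puzzles whose predicted solution is an
    exact match with the private truth. The real metric breakdown (cell acc,
    easy/hard splits) is computed in ZeroEval's zebra_grid_eval.py."""
    solved = sum(
        1 for pid, pred in predictions.items()
        if private_solutions.get(pid) == pred
    )
    return solved / max(len(predictions), 1)

# Usage sketch (ids and solution format are placeholders):
# load_private_solutions()
# print(puzzle_accuracy({"some-puzzle-id": some_predicted_solution}))
```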