yuchenlin committed
Commit b2c3610 · Parent: 9afc022

modify paper names and paths to datasets

Files changed (5):
  1. README.md +5 -4
  2. _header.md +1 -2
  3. app.py +36 -36
  4. constants.py +31 -32
  5. eval_utils.py +1 -1
README.md CHANGED
@@ -10,12 +10,12 @@ pinned: true
 fullWidth: true
 hf_oauth: true
 api: false
-tags:
+tags:
 - leaderboard
-datasets:
+datasets:
 - allenai/ZebraLogicBench
-- allenai/ZebraLogicBench-private
-models:
+- WildEval/ZebraLogic
+models:
 - Qwen/Qwen2-72B-Instruct
 - Qwen/Qwen1.5-72B-Chat
 - Qwen/Qwen1.5-7B-Chat
@@ -58,3 +58,4 @@ models:
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 Paper: arxiv.org/abs/2406.04770
+Paper: arxiv.org/abs/2502.01100
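Note: the `datasets:` entries in the front-matter above are the Hub repositories this Space reads; with this commit the gold solutions come from `WildEval/ZebraLogic` rather than `allenai/ZebraLogicBench-private`. A minimal sketch for checking that the new path resolves locally, assuming the `grid_mode` configuration and `test` split that `eval_utils.py` uses (the repo may be gated, so a logged-in Hugging Face token may be required):

```python
# Sketch only: confirm the dataset path referenced in the updated front-matter loads.
# Assumes the "grid_mode" config and "test" split used by eval_utils.py in this repo.
from datasets import load_dataset

ds = load_dataset("WildEval/ZebraLogic", "grid_mode", split="test")
print(len(ds), ds.column_names)  # eval_utils.py relies on the "id" and "solution" fields
```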
_header.md CHANGED
@@ -1,6 +1,5 @@
 <br/>
 
-# 🦓 ZebraLogic: Benchmarking the Logical Reasoning Ability of Language Models
+# 🦓 ZebraLogic: On the Scaling Limits of LLMs for Logical Reasoning
 <!-- [📑 FnF Paper](https://arxiv.org/abs/2305.18654) | -->
 [📰 Blog](https://huggingface.co/blog/yuchenlin/zebra-logic) [💻 GitHub](https://github.com/WildEval/ZeroEval) | [🤗 HuggingFace](https://huggingface.co/collections/allenai/zebra-logic-bench-6697137cbaad0b91e635e7b0) | [🐦 X](https://twitter.com/billyuchenlin/) | [💬 Discussion](https://huggingface.co/spaces/allenai/ZebraLogicBench-Leaderboard/discussions) | Updated: **{LAST_UPDATED}**
-
app.py CHANGED
@@ -12,16 +12,16 @@ import pandas as pd
 from pathlib import Path
 import json
 from constants import *
-from datetime import datetime, timezone
+from datetime import datetime, timezone
 # from datasets import Dataset, load_dataset, concatenate_datasets
-import os, uuid
+import os, uuid
 from utils_display import model_info
 from constants import column_names, LEADERBOARD_REMARKS, DEFAULT_K, LEADERBOARD_REMARKS_MAIN
 import pytz
 from data_utils import post_processing, get_random_item
 
 # get the last updated time from the elo_ranks.all.jsonl file
-LAST_UPDATED = None
+LAST_UPDATED = None
 # with open("_intro.md", "r") as f:
 #     INTRO_MD = f.read()
 INTRO_MD = ""
@@ -33,11 +33,11 @@ with open("_header.md", "r") as f:
 
 with open("_metrics.md", "r") as f:
     METRICS_MD = f.read()
-
-raw_data = None
-original_df = None
+
+raw_data = None
+original_df = None
 # available_models = [] # to be filled in later
-available_models = list(model_info.keys())
+available_models = list(model_info.keys())
 
 def df_filters(mode_selection_radio, show_open_source_model_only):
     global original_df
@@ -59,19 +59,19 @@ def _gstr(text):
 
 def _tab_leaderboard():
     global original_df, available_models
-    # with gr.TabItem("📊 Main", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
+    # with gr.TabItem("📊 Main", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
     if True:
-        default_main_df = original_df.copy()
+        default_main_df = original_df.copy()
         # default_main_df.insert(0, "", range(1, 1 + len(default_main_df)))
-        # default_main_df_no_task = default_main_df.copy()
+        # default_main_df_no_task = default_main_df.copy()
         default_mode = "greedy"
         default_main_df = df_filters(default_mode, False)
-        with gr.Row():
-            with gr.Column(scale=5):
+        with gr.Row():
+            with gr.Column(scale=5):
                 mode_selection_radio = gr.Radio(["greedy", "all"], show_label=False, elem_id="rank-column-radio", value=default_mode)
         # with gr.Row():
         # with gr.Column(scale=2):
-
+
         leaderboard_table = gr.components.Dataframe(
             value=default_main_df,
             datatype= ["number", "markdown", "markdown", "number"],
@@ -83,7 +83,7 @@ def _tab_leaderboard():
             column_widths=[50, 260, 100, 100, 120, 120, 100,100,110,100],
             wrap=True
             # min_width=60,
-        )
+        )
         # checkbox_show_task_categorized.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
         # show_open_source_model_only.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
         # rank_column_radio.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
@@ -121,14 +121,14 @@ def _tab_explore():
     # greedy_or_sample = gr.Radio(["greedy", "sampling"], show_label=False, elem_id="greedy-or-sample", value="greedy", interactive=True)
     gr.Markdown("### 🚀 Click below to sample a puzzle. ⬇️ ")
     explore_button = gr.Button("🦓 Sample a Zebra Puzzle!", elem_id="explore-button")
-
+
     puzzle_md = gr.Markdown("### 🦓 Puzzle: \n\nTo be loaded", elem_id="puzzle-md", elem_classes="box_md")
     model_reasoning_md = gr.Markdown("### 🤖 Reasoning: \n\nTo be loaded", elem_id="model-reasoning-md", elem_classes="box_md")
     model_prediction_md = gr.Markdown("### 💬 Answer: \n\nTo be loaded", elem_id="model-prediction-md", elem_classes="box_md")
     turht_solution_md = gr.Markdown("### ✅ Truth Solution: \n\nTo be loaded", elem_id="truth-solution-md", elem_classes="box_md")
     model_eval_md = gr.Markdown("### 🆚 Evaluation: \n\nTo be loaded", elem_id="model-eval-md", elem_classes="box_md")
-    explore_button.click(fn=sample_explore_item,
-                         inputs=[model_selection, size_H_selection, size_W_selection],
+    explore_button.click(fn=sample_explore_item,
+                         inputs=[model_selection, size_H_selection, size_W_selection],
                          outputs=[puzzle_md, model_reasoning_md, model_prediction_md, model_eval_md, turht_solution_md])
 
 
@@ -136,8 +136,8 @@ def _tab_explore():
 def _tab_submit():
     markdown_text = """
 Please create an issue on our [Github](https://github.com/WildEval/ZeroEval/) repository to talk about your model. Then, we can test it for you and report the results here on the Leaderboard.
-If you would like to do local testing, please read our code [here](https://github.com/WildEval/ZeroEval/blob/main/src/evaluation/zebra_grid_eval.py)
-and apply for the access for the [private dataset](https://huggingface.co/datasets/allenai/ZebraLogicBench-private) that contains the truth solutions.
+If you would like to do local testing, please read our code [here](https://github.com/WildEval/ZeroEval/blob/main/src/evaluation/zebra_grid_eval.py)
+and apply for the access for the [private dataset](https://huggingface.co/datasets/WildEval/ZebraLogic) that contains the truth solutions.
     """
 
     gr.Markdown("## 🚀 Submit Your Results\n\n" + markdown_text, elem_classes="markdown-text")
@@ -149,33 +149,33 @@ def build_demo():
 
     with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
         gr.HTML(BANNER, elem_id="banner")
-        # convert LAST_UPDATED to the PDT time
+        # convert LAST_UPDATED to the PDT time
         LAST_UPDATED = datetime.now(pytz.timezone('US/Pacific')).strftime("%Y-%m-%d %H:%M:%S")
         header_md_text = HEADER_MD.replace("{LAST_UPDATED}", str(LAST_UPDATED))
-        gr.Markdown(header_md_text, elem_classes="markdown-text")
+        gr.Markdown(header_md_text, elem_classes="markdown-text")
 
-        with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.Tabs(elem_classes="tab-buttons") as tabs:
             with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
-                _tab_leaderboard()
+                _tab_leaderboard()
             with gr.TabItem("🔍 Explore", elem_id="od-benchmark-tab-table", id=1):
                 _tab_explore()
             with gr.TabItem("🚀 Submit Your Results", elem_id="od-benchmark-tab-table", id=3):
-                _tab_submit()
+                _tab_submit()
 
             with gr.TabItem("📮 About Us", elem_id="od-benchmark-tab-table", id=4):
                 gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
-
+
         with gr.Row():
             with gr.Accordion("📙 Citation", open=False, elem_classes="accordion-label"):
                 gr.Textbox(
-                    value=CITATION_TEXT,
+                    value=CITATION_TEXT,
                     lines=7,
                     label="Copy the BibTeX snippet to cite this source",
                     elem_id="citation-button",
                     show_copy_button=True)
                 # ).style(show_copy_button=True)
 
-    return demo
+    return demo
 
 
 
@@ -184,11 +184,11 @@ def data_load(result_file):
     print(f"Loading {result_file}")
     column_names_main = column_names.copy()
     # column_names_main.update({})
-    main_ordered_columns = ORDERED_COLUMN_NAMES
-    # filter the data with Total Puzzles == 1000
-
-    click_url = True
-    # read json file from the result_file
+    main_ordered_columns = ORDERED_COLUMN_NAMES
+    # filter the data with Total Puzzles == 1000
+
+    click_url = True
+    # read json file from the result_file
     with open(result_file, "r") as f:
         raw_data = json.load(f)
         # floatify the data, if possible
@@ -201,16 +201,16 @@
     original_df = pd.DataFrame(raw_data)
     original_df = original_df[original_df["Total Puzzles"] == 1000]
     original_df = post_processing(original_df, column_names_main, ordered_columns=main_ordered_columns, click_url=click_url, rank_column=RANKING_COLUMN)
-    # print(original_df.columns)
-
+    # print(original_df.columns)
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--share", action="store_true")
    parser.add_argument("--result_file", help="Path to results table", default="ZeroEval-main/result_dirs/zebra-grid.summary.json")
-
+
     args = parser.parse_args()
-    data_load(args.result_file)
+    data_load(args.result_file)
     print(original_df)
     demo = build_demo()
     demo.launch(share=args.share, height=3000, width="100%")
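Note: the Submit-tab text in the hunk above points to ZeroEval's `zebra_grid_eval.py` for local testing. As a rough outline only (not the actual evaluation script), scoring against the solutions dataset referenced by this commit could look like the sketch below; the exact-match definition of puzzle accuracy is an assumption here, and `predictions` is a hypothetical `{puzzle_id: solution}` mapping produced by your own inference run:

```python
# Rough local-testing outline based on the Submit-tab instructions; the authoritative
# logic lives in ZeroEval/src/evaluation/zebra_grid_eval.py. Assumptions noted inline.
from datasets import load_dataset

def puzzle_accuracy(predictions: dict) -> float:
    """predictions: hypothetical {puzzle_id: solution} mapping from your own model run."""
    gold = load_dataset("WildEval/ZebraLogic", "grid_mode", split="test")
    solutions = {item["id"]: item["solution"] for item in gold}  # fields used by eval_utils.py
    # Assumption: a puzzle counts as solved only if its whole solution matches exactly.
    solved = sum(1 for pid, pred in predictions.items() if solutions.get(pid) == pred)
    return solved / len(solutions)

print(puzzle_accuracy({}))  # 0.0 with no predictions; the leaderboard keeps runs with 1000 puzzles
```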
constants.py CHANGED
@@ -8,15 +8,15 @@ banner_url = "https://github.com/WildEval/ZeroEval/blob/main/docs/zebra/zebra_ba
 BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 70vw; min-width: 300px; max-width: 1000px;border: 3px solid gray; border-color: gray black;"> </div>'
 
 # TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"
-
+
 
 CITATION_TEXT = """
 
-@misc{zebralogic2024,
-title={ZebraLogic: Benchmarking the Logical Reasoning Ability of Language Models},
-author={Bill Yuchen Lin and Ronan Le Bras and Peter Clark and Yejin Choi},
-url={https://huggingface.co/spaces/allenai/ZebraLogic},
-year={2024}
+@article{zebralogic2025,
+title={ZebraLogic: On the Scaling Limits of LLMs for Logical Reasoning},
+author={Bill Yuchen Lin and Ronan Le Bras and Kyle Richardson and Ashish Sabharwal and Radha Poovendran and Peter Clark and Yejin Choi},
+year={2025},
+url={https://arxiv.org/abs/2502.01100},
 }
 
 
@@ -27,15 +27,15 @@ CITATION_TEXT = """
 volume={36},
 year={2024}
 }
-
+
 """
 
 # make column_names as an ordered dict
-
+
 
 
 column_names = OrderedDict({
-    "Model": "Model",
+    "Model": "Model",
     "Mode": "Mode",
     "Puzzle Acc": "Puzzle Acc",
     "Cell Acc": "Cell Acc",
@@ -48,29 +48,29 @@ column_names = OrderedDict({
 
 
 
-LEADERBOARD_REMARKS = """**WB Reward**: for each comparison (A vs B), a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; when there is a **Tie**, the reward is **0**.
+LEADERBOARD_REMARKS = """**WB Reward**: for each comparison (A vs B), a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; when there is a **Tie**, the reward is **0**.
 """
 
 # **WB Reward**: for each pairwise comparison, a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; 0 for a **Tie**.
-# The baseline models are GPT4-Turbo, Haiku, and Llama2-70B, and Mix is the average of the three.
+# The baseline models are GPT4-Turbo, Haiku, and Llama2-70B, and Mix is the average of the three.
 # **WB Score** individually scores each model based on checklists.
 # Evaluator is GPT-4-Turbo.
-LEADERBOARD_REMARKS_MAIN = """
+LEADERBOARD_REMARKS_MAIN = """
 """
-
+
 RANKING_COLUMN = "Puzzle Acc"
 
 ORDERED_COLUMN_NAMES = [
-    "Model",
+    "Model",
     "Mode",
     "Puzzle Acc",
     "Easy Puzzle Acc",
     "Hard Puzzle Acc",
     "Cell Acc",
-    "No answer",
+    "No answer",
 ]
 
-
+
 js_light = """
 function refresh() {
     const url = new URL(window.location);
@@ -110,15 +110,15 @@ function refresh() {
 
 js_code = """
 function scroll_top() {
-    console.log("Hello from Gradio!");
+    console.log("Hello from Gradio!");
     const bubbles = document.querySelectorAll('.bubble-wrap');
     bubbles.forEach((bubble, index) => {
         setTimeout(() => {
             bubble.scrollTop = 0;
         }, index * 100); // Delay of 100ms between each iteration
     });
-
-}
+
+}
 """
 
 
@@ -126,7 +126,7 @@ TASK_TYPE_STR = "**Tasks**: Info seeking (**InfoSek**), Creative Writing (**CrtW
 
 css = """
 
-
+
 
 code {
     font-size: large;
@@ -179,17 +179,17 @@ td {
 .chat-common{
     height: auto;
     max-height: 400px;
-    min-height: 100px;
+    min-height: 100px;
 }
 .chat-specific{
     height: auto;
     max-height: 600px;
-    min-height: 200px;
+    min-height: 200px;
 }
 #od-benchmark-tab-table-button{
     font-size: 15pt;
     font-weight: bold;
-}
+}
 
 .btn_boderline{
     border: 1px solid #000000;
@@ -197,7 +197,7 @@ td {
     padding: 5px;
     margin: 5px;
     font-size: 15pt;
-    font-weight: bold;
+    font-weight: bold;
 }
 
 .btn_boderline_next{
@@ -206,7 +206,7 @@ td {
     padding: 5px;
     margin: 5px;
     font-size: 15pt;
-    font-weight: bold;
+    font-weight: bold;
 }
 
 .btn_boderline_gray{
@@ -215,7 +215,7 @@ td {
     padding: 5px;
     margin: 5px;
     font-size: 15pt;
-    font-weight: italic;
+    font-weight: italic;
 }
 .btn_boderline_selected{
     border: 2px solid purple;
@@ -224,12 +224,12 @@ td {
     padding: 5px;
     margin: 5px;
     font-size: 15pt;
-    font-weight: bold;
+    font-weight: bold;
 }
 .accordion-label button span{
     font-size: 14pt;
     font-weight: bold;
-}
+}
 
 #show-task-categorized span{
     font-size: 13pt;
@@ -269,7 +269,7 @@ button.selected[role="tab"][aria-selected="true"] {
 .plotly-plot{
     height: auto;
     max-height: 600px;
-    min-height: 600px;
+    min-height: 600px;
 }
 
 #length-margin-radio{
@@ -279,12 +279,12 @@ button.selected[role="tab"][aria-selected="true"] {
 }
 
 #show-task-categorized{
-    font-size: 12pt;
+    font-size: 12pt;
     font-decoration: bold;
 }
 
 #show-open-source-models{
-    font-size: 12pt;
+    font-size: 12pt;
     font-decoration: bold;
 }
 
@@ -296,4 +296,3 @@ button.selected[role="tab"][aria-selected="true"] {
     margin: 5px;
 }
 """
-
eval_utils.py CHANGED
@@ -8,7 +8,7 @@ private_solutions = {}
 
 def load_private_solutions():
     global private_solutions
-    private_zebra_data = load_dataset("allenai/ZebraLogicBench-private", "grid_mode", split="test")
+    private_zebra_data = load_dataset("WildEval/ZebraLogic", "grid_mode", split="test")
     for item in private_zebra_data:
         private_solutions[item["id"]] = item["solution"]
     return
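Note: `load_private_solutions()` only fills the module-level `private_solutions` cache keyed by puzzle id; callers elsewhere in the repo read from that dict. A small usage sketch (the calling code here is illustrative, not taken from the repository):

```python
# Illustrative usage of the updated loader; not code from this repository.
import eval_utils

eval_utils.load_private_solutions()                      # populates eval_utils.private_solutions
first_id = next(iter(eval_utils.private_solutions))      # pick an arbitrary puzzle id
print(first_id, eval_utils.private_solutions[first_id])  # its ground-truth solution
```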