modify paper names and paths to datasets
- README.md +5 -4
- _header.md +1 -2
- app.py +36 -36
- constants.py +31 -32
- eval_utils.py +1 -1
README.md CHANGED
@@ -10,12 +10,12 @@ pinned: true
 fullWidth: true
 hf_oauth: true
 api: false
-tags:
+tags:
 - leaderboard
-datasets:
+datasets:
 - allenai/ZebraLogicBench
--
-models:
+- WildEval/ZebraLogic
+models:
 - Qwen/Qwen2-72B-Instruct
 - Qwen/Qwen1.5-72B-Chat
 - Qwen/Qwen1.5-7B-Chat
@@ -58,3 +58,4 @@ models:
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 Paper: arxiv.org/abs/2406.04770
+Paper: arxiv.org/abs/2502.01100
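The frontmatter now lists a second dataset repo alongside allenai/ZebraLogicBench. Below is a minimal sketch of pulling both with the `datasets` library; the `grid_mode` config and `test` split are copied from the `load_dataset` call added in eval_utils.py further down, while using the same config name for the allenai repo is an assumption, and either repo may be gated and require requesting access on the Hub first.

```python
# Sketch only: load the two dataset repos referenced in the README frontmatter.
# "grid_mode" / "test" mirror the eval_utils.py change in this commit; the same
# config name for allenai/ZebraLogicBench is an assumption. Gated repos need
# prior `huggingface-cli login` and approved access.
from datasets import load_dataset

public_data = load_dataset("allenai/ZebraLogicBench", "grid_mode", split="test")
private_data = load_dataset("WildEval/ZebraLogic", "grid_mode", split="test")

print(len(public_data), public_data.column_names)
```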
_header.md CHANGED
@@ -1,6 +1,5 @@
 <br/>
 
-# 🦓 ZebraLogic:
+# 🦓 ZebraLogic: On the Scaling Limits of LLMs for Logical Reasoning
 <!-- [📑 FnF Paper](https://arxiv.org/abs/2305.18654) | -->
 [📰 Blog](https://huggingface.co/blog/yuchenlin/zebra-logic) [💻 GitHub](https://github.com/WildEval/ZeroEval) | [🤗 HuggingFace](https://huggingface.co/collections/allenai/zebra-logic-bench-6697137cbaad0b91e635e7b0) | [🐦 X](https://twitter.com/billyuchenlin/) | [💬 Discussion](https://huggingface.co/spaces/allenai/ZebraLogicBench-Leaderboard/discussions) | Updated: **{LAST_UPDATED}**
-
app.py CHANGED
@@ -12,16 +12,16 @@ import pandas as pd
 from pathlib import Path
 import json
 from constants import *
-from datetime import datetime, timezone
+from datetime import datetime, timezone
 # from datasets import Dataset, load_dataset, concatenate_datasets
-import os, uuid
+import os, uuid
 from utils_display import model_info
 from constants import column_names, LEADERBOARD_REMARKS, DEFAULT_K, LEADERBOARD_REMARKS_MAIN
 import pytz
 from data_utils import post_processing, get_random_item
 
 # get the last updated time from the elo_ranks.all.jsonl file
-LAST_UPDATED = None
+LAST_UPDATED = None
 # with open("_intro.md", "r") as f:
 # INTRO_MD = f.read()
 INTRO_MD = ""
@@ -33,11 +33,11 @@ with open("_header.md", "r") as f:
 
 with open("_metrics.md", "r") as f:
 METRICS_MD = f.read()
-
-raw_data = None
-original_df = None
+
+raw_data = None
+original_df = None
 # available_models = [] # to be filled in later
-available_models = list(model_info.keys())
+available_models = list(model_info.keys())
 
 def df_filters(mode_selection_radio, show_open_source_model_only):
 global original_df
@@ -59,19 +59,19 @@ def _gstr(text):
 
 def _tab_leaderboard():
 global original_df, available_models
-# with gr.TabItem("📊 Main", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
+# with gr.TabItem("📊 Main", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
 if True:
-default_main_df = original_df.copy()
+default_main_df = original_df.copy()
 # default_main_df.insert(0, "", range(1, 1 + len(default_main_df)))
-# default_main_df_no_task = default_main_df.copy()
+# default_main_df_no_task = default_main_df.copy()
 default_mode = "greedy"
 default_main_df = df_filters(default_mode, False)
-with gr.Row():
-with gr.Column(scale=5):
+with gr.Row():
+with gr.Column(scale=5):
 mode_selection_radio = gr.Radio(["greedy", "all"], show_label=False, elem_id="rank-column-radio", value=default_mode)
 # with gr.Row():
 # with gr.Column(scale=2):
-
+
 leaderboard_table = gr.components.Dataframe(
 value=default_main_df,
 datatype= ["number", "markdown", "markdown", "number"],
@@ -83,7 +83,7 @@ def _tab_leaderboard():
 column_widths=[50, 260, 100, 100, 120, 120, 100,100,110,100],
 wrap=True
 # min_width=60,
-)
+)
 # checkbox_show_task_categorized.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
 # show_open_source_model_only.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
 # rank_column_radio.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
@@ -121,14 +121,14 @@ def _tab_explore():
 # greedy_or_sample = gr.Radio(["greedy", "sampling"], show_label=False, elem_id="greedy-or-sample", value="greedy", interactive=True)
 gr.Markdown("### 🚀 Click below to sample a puzzle. ⬇️ ")
 explore_button = gr.Button("🦓 Sample a Zebra Puzzle!", elem_id="explore-button")
-
+
 puzzle_md = gr.Markdown("### 🦓 Puzzle: \n\nTo be loaded", elem_id="puzzle-md", elem_classes="box_md")
 model_reasoning_md = gr.Markdown("### 🤖 Reasoning: \n\nTo be loaded", elem_id="model-reasoning-md", elem_classes="box_md")
 model_prediction_md = gr.Markdown("### 💬 Answer: \n\nTo be loaded", elem_id="model-prediction-md", elem_classes="box_md")
 turht_solution_md = gr.Markdown("### ✅ Truth Solution: \n\nTo be loaded", elem_id="truth-solution-md", elem_classes="box_md")
 model_eval_md = gr.Markdown("### 🆚 Evaluation: \n\nTo be loaded", elem_id="model-eval-md", elem_classes="box_md")
-explore_button.click(fn=sample_explore_item,
-inputs=[model_selection, size_H_selection, size_W_selection],
+explore_button.click(fn=sample_explore_item,
+inputs=[model_selection, size_H_selection, size_W_selection],
 outputs=[puzzle_md, model_reasoning_md, model_prediction_md, model_eval_md, turht_solution_md])
 
 
@@ -136,8 +136,8 @@ def _tab_explore():
 def _tab_submit():
 markdown_text = """
 Please create an issue on our [Github](https://github.com/WildEval/ZeroEval/) repository to talk about your model. Then, we can test it for you and report the results here on the Leaderboard.
-If you would like to do local testing, please read our code [here](https://github.com/WildEval/ZeroEval/blob/main/src/evaluation/zebra_grid_eval.py)
-and apply for the access for the [private dataset](https://huggingface.co/datasets/
+If you would like to do local testing, please read our code [here](https://github.com/WildEval/ZeroEval/blob/main/src/evaluation/zebra_grid_eval.py)
+and apply for the access for the [private dataset](https://huggingface.co/datasets/WildEval/ZebraLogic) that contains the truth solutions.
 """
 
 gr.Markdown("## 🚀 Submit Your Results\n\n" + markdown_text, elem_classes="markdown-text")
@@ -149,33 +149,33 @@ def build_demo():
 
 with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
 gr.HTML(BANNER, elem_id="banner")
-# convert LAST_UPDATED to the PDT time
+# convert LAST_UPDATED to the PDT time
 LAST_UPDATED = datetime.now(pytz.timezone('US/Pacific')).strftime("%Y-%m-%d %H:%M:%S")
 header_md_text = HEADER_MD.replace("{LAST_UPDATED}", str(LAST_UPDATED))
-gr.Markdown(header_md_text, elem_classes="markdown-text")
+gr.Markdown(header_md_text, elem_classes="markdown-text")
 
-with gr.Tabs(elem_classes="tab-buttons") as tabs:
+with gr.Tabs(elem_classes="tab-buttons") as tabs:
 with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
-_tab_leaderboard()
+_tab_leaderboard()
 with gr.TabItem("🔍 Explore", elem_id="od-benchmark-tab-table", id=1):
 _tab_explore()
 with gr.TabItem("🚀 Submit Your Results", elem_id="od-benchmark-tab-table", id=3):
-_tab_submit()
+_tab_submit()
 
 with gr.TabItem("📮 About Us", elem_id="od-benchmark-tab-table", id=4):
 gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
-
+
 with gr.Row():
 with gr.Accordion("📙 Citation", open=False, elem_classes="accordion-label"):
 gr.Textbox(
-value=CITATION_TEXT,
+value=CITATION_TEXT,
 lines=7,
 label="Copy the BibTeX snippet to cite this source",
 elem_id="citation-button",
 show_copy_button=True)
 # ).style(show_copy_button=True)
 
-return demo
+return demo
 
 
 
@@ -184,11 +184,11 @@ def data_load(result_file):
 print(f"Loading {result_file}")
 column_names_main = column_names.copy()
 # column_names_main.update({})
-main_ordered_columns = ORDERED_COLUMN_NAMES
-# filter the data with Total Puzzles == 1000
-
-click_url = True
-# read json file from the result_file
+main_ordered_columns = ORDERED_COLUMN_NAMES
+# filter the data with Total Puzzles == 1000
+
+click_url = True
+# read json file from the result_file
 with open(result_file, "r") as f:
 raw_data = json.load(f)
 # floatify the data, if possible
@@ -201,16 +201,16 @@ def data_load(result_file):
 original_df = pd.DataFrame(raw_data)
 original_df = original_df[original_df["Total Puzzles"] == 1000]
 original_df = post_processing(original_df, column_names_main, ordered_columns=main_ordered_columns, click_url=click_url, rank_column=RANKING_COLUMN)
-# print(original_df.columns)
-
+# print(original_df.columns)
+
 
 if __name__ == "__main__":
 parser = argparse.ArgumentParser()
 parser.add_argument("--share", action="store_true")
 parser.add_argument("--result_file", help="Path to results table", default="ZeroEval-main/result_dirs/zebra-grid.summary.json")
-
+
 args = parser.parse_args()
-data_load(args.result_file)
+data_load(args.result_file)
 print(original_df)
 demo = build_demo()
 demo.launch(share=args.share, height=3000, width="100%")
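For context on the `data_load` hunks above: the function reads a summary JSON produced by ZeroEval, keeps only complete runs (`Total Puzzles == 1000`), and ranks rows by `Puzzle Acc`. Below is a rough standalone sketch of that flow; `data_utils.post_processing` (not part of this commit) is replaced by a plain column reorder and sort, so treat it as an illustration rather than the app's exact behavior.

```python
import json
import pandas as pd

# Display order taken from ORDERED_COLUMN_NAMES in constants.py.
ORDERED_COLUMN_NAMES = [
    "Model", "Mode", "Puzzle Acc", "Easy Puzzle Acc",
    "Hard Puzzle Acc", "Cell Acc", "No answer",
]

def load_summary(result_file: str) -> pd.DataFrame:
    """Rough equivalent of app.py's data_load(): keep complete runs and rank
    by puzzle accuracy. The real app also builds clickable model links via
    data_utils.post_processing, which is omitted here."""
    with open(result_file, "r") as f:
        raw_data = json.load(f)  # expected to be a list of per-model dicts
    df = pd.DataFrame(raw_data)
    df = df[df["Total Puzzles"] == 1000]
    if "Puzzle Acc" in df.columns:
        df = df.sort_values("Puzzle Acc", ascending=False)
    cols = [c for c in ORDERED_COLUMN_NAMES if c in df.columns]
    return df[cols].reset_index(drop=True)

# Example, using the default path from app.py's argument parser:
# print(load_summary("ZeroEval-main/result_dirs/zebra-grid.summary.json"))
```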
constants.py CHANGED
@@ -8,15 +8,15 @@ banner_url = "https://github.com/WildEval/ZeroEval/blob/main/docs/zebra/zebra_ba
 BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 70vw; min-width: 300px; max-width: 1000px;border: 3px solid gray; border-color: gray black;"> </div>'
 
 # TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"
-
+
 
 CITATION_TEXT = """
 
-@
-title={ZebraLogic:
-author={Bill Yuchen Lin and Ronan Le Bras and Peter Clark and Yejin Choi},
-
-
+@article{zebralogic2025,
+title={ZebraLogic: On the Scaling Limits of LLMs for Logical Reasoning},
+author={Bill Yuchen Lin and Ronan Le Bras and Kyle Richardson and Ashish Sabharwal and Radha Poovendran and Peter Clark and Yejin Choi},
+year={2025},
+url={https://arxiv.org/abs/2502.01100},
 }
 
 
@@ -27,15 +27,15 @@ CITATION_TEXT = """
 volume={36},
 year={2024}
 }
-
+
 """
 
 # make column_names as an ordered dict
-
+
 
 
 column_names = OrderedDict({
-"Model": "Model",
+"Model": "Model",
 "Mode": "Mode",
 "Puzzle Acc": "Puzzle Acc",
 "Cell Acc": "Cell Acc",
@@ -48,29 +48,29 @@ column_names = OrderedDict({
 
 
 
-LEADERBOARD_REMARKS = """**WB Reward**: for each comparison (A vs B), a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; when there is a **Tie**, the reward is **0**.
+LEADERBOARD_REMARKS = """**WB Reward**: for each comparison (A vs B), a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; when there is a **Tie**, the reward is **0**.
 """
 
 # **WB Reward**: for each pairwise comparison, a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; 0 for a **Tie**.
-# The baseline models are GPT4-Turbo, Haiku, and Llama2-70B, and Mix is the average of the three.
+# The baseline models are GPT4-Turbo, Haiku, and Llama2-70B, and Mix is the average of the three.
 # **WB Score** individually scores each model based on checklists.
 # Evaluator is GPT-4-Turbo.
-LEADERBOARD_REMARKS_MAIN = """
+LEADERBOARD_REMARKS_MAIN = """
 """
-
+
 RANKING_COLUMN = "Puzzle Acc"
 
 ORDERED_COLUMN_NAMES = [
-"Model",
+"Model",
 "Mode",
 "Puzzle Acc",
 "Easy Puzzle Acc",
 "Hard Puzzle Acc",
 "Cell Acc",
-"No answer",
+"No answer",
 ]
 
-
+
 js_light = """
 function refresh() {
 const url = new URL(window.location);
@@ -110,15 +110,15 @@ function refresh() {
 
 js_code = """
 function scroll_top() {
-console.log("Hello from Gradio!");
+console.log("Hello from Gradio!");
 const bubbles = document.querySelectorAll('.bubble-wrap');
 bubbles.forEach((bubble, index) => {
 setTimeout(() => {
 bubble.scrollTop = 0;
 }, index * 100); // Delay of 100ms between each iteration
 });
-
-}
+
+}
 """
 
 
@@ -126,7 +126,7 @@ TASK_TYPE_STR = "**Tasks**: Info seeking (**InfoSek**), Creative Writing (**CrtW
 
 css = """
 
-
+
 
 code {
 font-size: large;
@@ -179,17 +179,17 @@ td {
 .chat-common{
 height: auto;
 max-height: 400px;
-min-height: 100px;
+min-height: 100px;
 }
 .chat-specific{
 height: auto;
 max-height: 600px;
-min-height: 200px;
+min-height: 200px;
 }
 #od-benchmark-tab-table-button{
 font-size: 15pt;
 font-weight: bold;
-}
+}
 
 .btn_boderline{
 border: 1px solid #000000;
@@ -197,7 +197,7 @@ td {
 padding: 5px;
 margin: 5px;
 font-size: 15pt;
-font-weight: bold;
+font-weight: bold;
 }
 
 .btn_boderline_next{
@@ -206,7 +206,7 @@ td {
 padding: 5px;
 margin: 5px;
 font-size: 15pt;
-font-weight: bold;
+font-weight: bold;
 }
 
 .btn_boderline_gray{
@@ -215,7 +215,7 @@ td {
 padding: 5px;
 margin: 5px;
 font-size: 15pt;
-font-weight: italic;
+font-weight: italic;
 }
 .btn_boderline_selected{
 border: 2px solid purple;
@@ -224,12 +224,12 @@ td {
 padding: 5px;
 margin: 5px;
 font-size: 15pt;
-font-weight: bold;
+font-weight: bold;
 }
 .accordion-label button span{
 font-size: 14pt;
 font-weight: bold;
-}
+}
 
 #show-task-categorized span{
 font-size: 13pt;
@@ -269,7 +269,7 @@ button.selected[role="tab"][aria-selected="true"] {
 .plotly-plot{
 height: auto;
 max-height: 600px;
-min-height: 600px;
+min-height: 600px;
 }
 
 #length-margin-radio{
@@ -279,12 +279,12 @@ button.selected[role="tab"][aria-selected="true"] {
 }
 
 #show-task-categorized{
-font-size: 12pt;
+font-size: 12pt;
 font-decoration: bold;
 }
 
 #show-open-source-models{
-font-size: 12pt;
+font-size: 12pt;
 font-decoration: bold;
 }
 
@@ -296,4 +296,3 @@ button.selected[role="tab"][aria-selected="true"] {
 margin: 5px;
 }
 """
-
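Most of the constants.py hunk is cosmetic (citation text plus CSS/JS strings), but the column constants drive how the leaderboard table is built. The sketch below illustrates how a rename map plus a display order and ranking column are typically applied to a results DataFrame; the real logic lives in `data_utils.post_processing`, which this commit does not touch, so this is an assumption about its behavior rather than a copy of it.

```python
from collections import OrderedDict
import pandas as pd

# Subset of the constants defined above.
column_names = OrderedDict({
    "Model": "Model",
    "Mode": "Mode",
    "Puzzle Acc": "Puzzle Acc",
    "Cell Acc": "Cell Acc",
})
ORDERED_COLUMN_NAMES = ["Model", "Mode", "Puzzle Acc", "Cell Acc"]
RANKING_COLUMN = "Puzzle Acc"

def apply_column_constants(df: pd.DataFrame) -> pd.DataFrame:
    """Illustrative stand-in for data_utils.post_processing: rename raw
    columns, keep only the display columns, and sort by the ranking column."""
    df = df.rename(columns=column_names)
    df = df[[c for c in ORDERED_COLUMN_NAMES if c in df.columns]]
    return df.sort_values(RANKING_COLUMN, ascending=False).reset_index(drop=True)
```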
eval_utils.py CHANGED
@@ -8,7 +8,7 @@ private_solutions = {}
 
 def load_private_solutions():
 global private_solutions
-private_zebra_data = load_dataset("
+private_zebra_data = load_dataset("WildEval/ZebraLogic", "grid_mode", split="test")
 for item in private_zebra_data:
 private_solutions[item["id"]] = item["solution"]
 return
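The eval_utils.py change points the private-solution lookup at the WildEval/ZebraLogic repo. A short usage sketch follows; the `puzzle_accuracy` helper is hypothetical (the official scoring lives in ZeroEval's zebra_grid_eval.py) and simply treats a puzzle as solved when the predicted solution matches the stored one exactly.

```python
from datasets import load_dataset

private_solutions = {}

def load_private_solutions():
    # Same call as eval_utils.py above: map puzzle id -> ground-truth solution.
    global private_solutions
    private_zebra_data = load_dataset("WildEval/ZebraLogic", "grid_mode", split="test")
    for item in private_zebra_data:
        private_solutions[item["id"]] = item["solution"]

def puzzle_accuracy(predictions: dict) -> float:
    """Hypothetical scorer: fraction of puzzles whose predicted solution is an
    exact match with the private truth. The real metric breakdown (cell acc,
    easy/hard splits) is computed in ZeroEval's zebra_grid_eval.py."""
    solved = sum(
        1 for pid, pred in predictions.items()
        if private_solutions.get(pid) == pred
    )
    return solved / max(len(predictions), 1)

# Usage sketch (ids and solution format are placeholders):
# load_private_solutions()
# print(puzzle_accuracy({"some-puzzle-id": some_predicted_solution}))
```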