Spaces:

omlab
/

open-agent-leaderboard

Running

App Files Files Community

qq-hzlh committed 8 days ago

Commit

610b4b4

1 Parent(s): a7d1809

update algo names

Browse files

Files changed (4) hide show

app.py +7 -26
meta_data.py +14 -7
src/detail_math_score.json +8 -8
src/overall_math_score.json +12 -12

app.py CHANGED Viewed

@@ -7,32 +7,13 @@ from meta_data import *
 # import pandas as pd
 # pd.set_option('display.max_colwidth', 0)
-# head_style = """
-# <style>
-# @media (min-width: 1536px)
-# {
-#     .gradio-container {
-#         min-width: var(--size-full) !important;
-#     }
-# }
-# /* Add checkbox styles */
-# .gr-checkbox {
-#     accent-color: rgb(59, 130, 246) !important;  /* blue */
-# }
-# .gr-checkbox-group label input[type="checkbox"] {
-#     accent-color: rgb(59, 130, 246) !important;
-# }
-# .gr-checkbox-group input[type="checkbox"]:checked {
-#     background-color: rgb(59, 130, 246) !important;
-#     border-color: rgb(59, 130, 246) !important;
-# }
-# </style>
-# """
-with gr.Blocks(title="Open Agent Leaderboard") as demo:
     struct = load_results(OVERALL_MATH_SCORE_FILE)
     timestamp = struct['time']
     EVAL_TIME = format_timestamp(timestamp)

 # import pandas as pd
 # pd.set_option('display.max_colwidth', 0)
+with gr.Blocks(title="Open Agent Leaderboard", css="""
+    .gradio-container p {
+        white-space: pre-line !important;
+        margin: 0 !important;
+    }
+""") as demo:
     struct = load_results(OVERALL_MATH_SCORE_FILE)
     timestamp = struct['time']
     EVAL_TIME = format_timestamp(timestamp)

meta_data.py CHANGED Viewed

@@ -3,7 +3,7 @@ OVERALL_MATH_SCORE_FILE = "src/overall_math_score.json"
 DETAIL_MATH_SCORE_FILE = "src/detail_math_score.json"
 # CONSTANTS-TEXT
 LEADERBORAD_INTRODUCTION = """# Open Agent Leaderboard
-### Welcome to the Open Agent Leaderboard! We share the evaluation results of open agents: COT, SC_COT, POT, ReAct, etc. The agents are implemented with the open-source framework: [*OmAgent*](https://github.com/om-ai-lab/OmAgent)
 This leaderboard was last updated: {}.
@@ -26,6 +26,9 @@ LEADERBOARD_MD['MATH_MAIN'] = f"""
   - Cost: The cost on each math benchmark (the lower the better).
 - By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}, sorted by the descending order of Avg Score.
 - ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
 """
@@ -47,6 +50,9 @@ LEADERBOARD_MD['MATH_DETAIL'] = f"""
   - Doubao-lite-32k (1 USD = 7.3249 CNY):
       - 0.00004096$/1M tokens (input)
     - 0.0001$/1M tokens (output)
 - ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
 """
@@ -63,12 +69,13 @@ LLM = [
 ]
 ALGORITHMS = [
-  'IO',  'COT', 'SC_COT', 'POT', 'ReAct-Pro*'
 ]
-CITATION_BUTTON_TEXT = r"""@article{zhang2024omagent,
-  title={OmAgent: A Multi-modal Agent Framework for Complex Video Understanding with Task Divide-and-Conquer},
-  author={Zhang, Lu and Zhao, Tiancheng and Ying, Heting and Ma, Yibo and Lee, Kyusong},
-  journal={arXiv preprint arXiv:2406.16620},
-  year={2024}
 }"""

 DETAIL_MATH_SCORE_FILE = "src/detail_math_score.json"
 # CONSTANTS-TEXT
 LEADERBORAD_INTRODUCTION = """# Open Agent Leaderboard
+### Welcome to the Open Agent Leaderboard! We share the evaluation results of open agents: CoT, SC-CoT, PoT, ReAct, etc. The agents are implemented with the open-source framework: [*OmAgent*](https://github.com/om-ai-lab/OmAgent)
 This leaderboard was last updated: {}.
   - Cost: The cost on each math benchmark (the lower the better).
 - By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}, sorted by the descending order of Avg Score.
+- IO (Input-Output): The baseline method that directly prompts the model with the question and expects an answer without any intermediate reasoning steps.
 - ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
 """
   - Doubao-lite-32k (1 USD = 7.3249 CNY):
       - 0.00004096$/1M tokens (input)
     - 0.0001$/1M tokens (output)
+- IO (Input-Output) is the baseline method that directly prompts the model with the question and expects an answer without any intermediate reasoning steps. It represents the most basic way of using language models and serves as a reference point for evaluating the effectiveness of other algorithms.
 - ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
 """
 ]
 ALGORITHMS = [
+  'IO',  'CoT', 'SC-CoT', 'PoT', 'ReAct-Pro*'
 ]
+CITATION_BUTTON_TEXT = r"""@misc{open-agent-leaderboard,
+    title={Open Agent Leaderboard},
+    author={Om AI Lab},
+    year={2025},
+    publisher={GitHub},
+    howpublished={\url{https://github.com/om-ai-lab/open-agent-leaderboard}}
 }"""

src/detail_math_score.json CHANGED Viewed

@@ -69,10 +69,10 @@
                 }
             }
         },
-        "COT": {
             "gpt-3.5-turbo": {
                 "META": {
-                    "Algorithm": "COT",
                     "LLM": "gpt-3.5-turbo",
                     "Eval Date": "2025/01/07"
                 },
@@ -105,7 +105,7 @@
             },
             "Doubao-lite-32k": {
                 "META": {
-                    "Algorithm": "COT",
                     "LLM": "Doubao-lite-32k",
                     "Eval Date": "2025/01/07"
                 },
@@ -140,7 +140,7 @@
         "SC-COT": {
             "gpt-3.5-turbo": {
                 "META": {
-                    "Algorithm": "SC-COT",
                     "LLM": "gpt-3.5-turbo",
                     "Eval Date": "2025/01/07"
                 },
@@ -173,7 +173,7 @@
             },
             "Doubao-lite-32k": {
                 "META": {
-                    "Algorithm": "SC-COT",
                     "LLM": "Doubao-lite-32k",
                     "Eval Date": "2025/01/07"
                 },
@@ -205,10 +205,10 @@
                 }
             }
         },
-        "POT": {
             "gpt-3.5-turbo": {
                 "META": {
-                    "Algorithm": "POT",
                     "LLM": "gpt-3.5-turbo",
                     "Eval Date": "2025/01/07"
                 },
@@ -241,7 +241,7 @@
             },
             "Doubao-lite-32k": {
                 "META": {
-                    "Algorithm": "POT",
                     "LLM": "Doubao-lite-32k",
                     "Eval Date": "2025/01/07"
                 },

                 }
             }
         },
+        "CoT": {
             "gpt-3.5-turbo": {
                 "META": {
+                    "Algorithm": "CoT",
                     "LLM": "gpt-3.5-turbo",
                     "Eval Date": "2025/01/07"
                 },
             },
             "Doubao-lite-32k": {
                 "META": {
+                    "Algorithm": "CoT",
                     "LLM": "Doubao-lite-32k",
                     "Eval Date": "2025/01/07"
                 },
         "SC-COT": {
             "gpt-3.5-turbo": {
                 "META": {
+                    "Algorithm": "SC-CoT",
                     "LLM": "gpt-3.5-turbo",
                     "Eval Date": "2025/01/07"
                 },
             },
             "Doubao-lite-32k": {
                 "META": {
+                    "Algorithm": "SC-CoT",
                     "LLM": "Doubao-lite-32k",
                     "Eval Date": "2025/01/07"
                 },
                 }
             }
         },
+        "PoT": {
             "gpt-3.5-turbo": {
                 "META": {
+                    "Algorithm": "PoT",
                     "LLM": "gpt-3.5-turbo",
                     "Eval Date": "2025/01/07"
                 },
             },
             "Doubao-lite-32k": {
                 "META": {
+                    "Algorithm": "PoT",
                     "LLM": "Doubao-lite-32k",
                     "Eval Date": "2025/01/07"
                 },

src/overall_math_score.json CHANGED Viewed

@@ -16,9 +16,9 @@
                 "Cost($)": 0.0380
             }
         },
-        "COT": {
             "META": {
-                "Algorithm": "COT",
                 "LLM": "gpt-3.5-turbo",
                 "Eval Date": "2025/01/07"
             },
@@ -31,9 +31,9 @@
                 "Cost($)": 0.0957
             }
         },
-        "SC-COT": {
             "META": {
-                "Algorithm": "SC-COT",
                 "LLM": "gpt-3.5-turbo",
                 "Eval Date": "2025/01/07"
             },
@@ -46,9 +46,9 @@
                 "Cost($)": 0.6491
             }
         },
-        "POT": {
             "META": {
-                "Algorithm": "POT",
                 "LLM": "gpt-3.5-turbo",
                 "Eval Date": "2025/01/07"
             },
@@ -91,9 +91,9 @@
                 "Cost($)": 0.0058
             }
         },
-        "COT-Doubao": {
             "META": {
-                "Algorithm": "COT",
                 "LLM": "Doubao-lite-32k",
                 "Eval Date": "2025/01/07"
             },
@@ -106,9 +106,9 @@
                 "Cost($)": 0.0066
             }
         },
-        "SC-COT-Doubao": {
             "META": {
-                "Algorithm": "SC-COT",
                 "LLM": "Doubao-lite-32k",
                 "Eval Date": "2025/01/07"
             },
@@ -121,9 +121,9 @@
                 "Cost($)": 0.0409
             }
         },
-        "POT-Doubao": {
             "META": {
-                "Algorithm": "POT",
                 "LLM": "Doubao-lite-32k",
                 "Eval Date": "2025/01/07"
             },

                 "Cost($)": 0.0380
             }
         },
+        "CoT": {
             "META": {
+                "Algorithm": "CoT",
                 "LLM": "gpt-3.5-turbo",
                 "Eval Date": "2025/01/07"
             },
                 "Cost($)": 0.0957
             }
         },
+        "SC-CoT": {
             "META": {
+                "Algorithm": "SC-CoT",
                 "LLM": "gpt-3.5-turbo",
                 "Eval Date": "2025/01/07"
             },
                 "Cost($)": 0.6491
             }
         },
+        "PoT": {
             "META": {
+                "Algorithm": "PoT",
                 "LLM": "gpt-3.5-turbo",
                 "Eval Date": "2025/01/07"
             },
                 "Cost($)": 0.0058
             }
         },
+        "CoT-Doubao": {
             "META": {
+                "Algorithm": "CoT",
                 "LLM": "Doubao-lite-32k",
                 "Eval Date": "2025/01/07"
             },
                 "Cost($)": 0.0066
             }
         },
+        "SC-CoT-Doubao": {
             "META": {
+                "Algorithm": "SC-CoT",
                 "LLM": "Doubao-lite-32k",
                 "Eval Date": "2025/01/07"
             },
                 "Cost($)": 0.0409
             }
         },
+        "PoT-Doubao": {
             "META": {
+                "Algorithm": "PoT",
                 "LLM": "Doubao-lite-32k",
                 "Eval Date": "2025/01/07"
             },