Spaces:
Running
Running
update algo names
Browse files- app.py +7 -26
- meta_data.py +14 -7
- src/detail_math_score.json +8 -8
- src/overall_math_score.json +12 -12
app.py
CHANGED
@@ -7,32 +7,13 @@ from meta_data import *
|
|
7 |
# import pandas as pd
|
8 |
# pd.set_option('display.max_colwidth', 0)
|
9 |
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
# }
|
18 |
-
|
19 |
-
# /* Add checkbox styles */
|
20 |
-
# .gr-checkbox {
|
21 |
-
# accent-color: rgb(59, 130, 246) !important; /* blue */
|
22 |
-
# }
|
23 |
-
|
24 |
-
# .gr-checkbox-group label input[type="checkbox"] {
|
25 |
-
# accent-color: rgb(59, 130, 246) !important;
|
26 |
-
# }
|
27 |
-
|
28 |
-
# .gr-checkbox-group input[type="checkbox"]:checked {
|
29 |
-
# background-color: rgb(59, 130, 246) !important;
|
30 |
-
# border-color: rgb(59, 130, 246) !important;
|
31 |
-
# }
|
32 |
-
# </style>
|
33 |
-
# """
|
34 |
-
|
35 |
-
with gr.Blocks(title="Open Agent Leaderboard") as demo:
|
36 |
struct = load_results(OVERALL_MATH_SCORE_FILE)
|
37 |
timestamp = struct['time']
|
38 |
EVAL_TIME = format_timestamp(timestamp)
|
|
|
7 |
# import pandas as pd
|
8 |
# pd.set_option('display.max_colwidth', 0)
|
9 |
|
10 |
+
|
11 |
+
with gr.Blocks(title="Open Agent Leaderboard", css="""
|
12 |
+
.gradio-container p {
|
13 |
+
white-space: pre-line !important;
|
14 |
+
margin: 0 !important;
|
15 |
+
}
|
16 |
+
""") as demo:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
struct = load_results(OVERALL_MATH_SCORE_FILE)
|
18 |
timestamp = struct['time']
|
19 |
EVAL_TIME = format_timestamp(timestamp)
|
meta_data.py
CHANGED
@@ -3,7 +3,7 @@ OVERALL_MATH_SCORE_FILE = "src/overall_math_score.json"
|
|
3 |
DETAIL_MATH_SCORE_FILE = "src/detail_math_score.json"
|
4 |
# CONSTANTS-TEXT
|
5 |
LEADERBORAD_INTRODUCTION = """# Open Agent Leaderboard
|
6 |
-
### Welcome to the Open Agent Leaderboard! We share the evaluation results of open agents:
|
7 |
|
8 |
This leaderboard was last updated: {}.
|
9 |
|
@@ -26,6 +26,9 @@ LEADERBOARD_MD['MATH_MAIN'] = f"""
|
|
26 |
- Cost: The cost on each math Benchmarks (the lower the better).
|
27 |
|
28 |
- By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}, sorted by the descending order of Avg Score.
|
|
|
|
|
|
|
29 |
- ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
|
30 |
"""
|
31 |
|
@@ -47,6 +50,9 @@ LEADERBOARD_MD['MATH_DETAIL'] = f"""
|
|
47 |
- Doubao-lite-32k (1 USD = 7.3249 CNY):
|
48 |
- 0.00004096$/1M tokens (input)
|
49 |
- 0.0001$/1M tokens (output)
|
|
|
|
|
|
|
50 |
- ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
|
51 |
"""
|
52 |
|
@@ -63,12 +69,13 @@ LLM = [
|
|
63 |
]
|
64 |
|
65 |
ALGORITHMS = [
|
66 |
-
'IO', '
|
67 |
]
|
68 |
|
69 |
-
CITATION_BUTTON_TEXT = r"""@
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
|
|
74 |
}"""
|
|
|
3 |
DETAIL_MATH_SCORE_FILE = "src/detail_math_score.json"
|
4 |
# CONSTANTS-TEXT
|
5 |
LEADERBORAD_INTRODUCTION = """# Open Agent Leaderboard
|
6 |
+
### Welcome to the Open Agent Leaderboard! We share the evaluation results of open agents: CoT, SC-CoT, PoT, ReAct, etc. The agents are implemented with the open-source framework: [*OmAgent*](https://github.com/om-ai-lab/OmAgent)
|
7 |
|
8 |
This leaderboard was last updated: {}.
|
9 |
|
|
|
26 |
- Cost: The cost on each math Benchmarks (the lower the better).
|
27 |
|
28 |
- By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}, sorted by the descending order of Avg Score.
|
29 |
+
|
30 |
+
- IO (Input-Output): The baseline method that directly prompts the model with the question and expects an answer without any intermediate reasoning steps.
|
31 |
+
|
32 |
- ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
|
33 |
"""
|
34 |
|
|
|
50 |
- Doubao-lite-32k (1 USD = 7.3249 CNY):
|
51 |
- 0.00004096$/1M tokens (input)
|
52 |
- 0.0001$/1M tokens (output)
|
53 |
+
|
54 |
+
- IO (Input-Output) is the baseline method that directly prompts the model with the question and expects an answer without any intermediate reasoning steps. It represents the most basic way of using language models and serves as a reference point for evaluating the effectiveness of other algorithms.
|
55 |
+
|
56 |
- ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
|
57 |
"""
|
58 |
|
|
|
69 |
]
|
70 |
|
71 |
ALGORITHMS = [
|
72 |
+
'IO', 'CoT', 'SC-CoT', 'PoT', 'ReAct-Pro*'
|
73 |
]
|
74 |
|
75 |
+
CITATION_BUTTON_TEXT = r"""@misc{open-agent-leaderboard,
|
76 |
+
title={Open Agent Leaderboard},
|
77 |
+
author={Om AI Lab},
|
78 |
+
year={2025},
|
79 |
+
publisher={GitHub},
|
80 |
+
howpublished={\url{https://github.com/om-ai-lab/open-agent-leaderboard}}
|
81 |
}"""
|
src/detail_math_score.json
CHANGED
@@ -69,10 +69,10 @@
|
|
69 |
}
|
70 |
}
|
71 |
},
|
72 |
-
"
|
73 |
"gpt-3.5-turbo": {
|
74 |
"META": {
|
75 |
-
"Algorithm": "
|
76 |
"LLM": "gpt-3.5-turbo",
|
77 |
"Eval Date": "2025/01/07"
|
78 |
},
|
@@ -105,7 +105,7 @@
|
|
105 |
},
|
106 |
"Doubao-lite-32k": {
|
107 |
"META": {
|
108 |
-
"Algorithm": "
|
109 |
"LLM": "Doubao-lite-32k",
|
110 |
"Eval Date": "2025/01/07"
|
111 |
},
|
@@ -140,7 +140,7 @@
|
|
140 |
"SC-COT": {
|
141 |
"gpt-3.5-turbo": {
|
142 |
"META": {
|
143 |
-
"Algorithm": "SC-
|
144 |
"LLM": "gpt-3.5-turbo",
|
145 |
"Eval Date": "2025/01/07"
|
146 |
},
|
@@ -173,7 +173,7 @@
|
|
173 |
},
|
174 |
"Doubao-lite-32k": {
|
175 |
"META": {
|
176 |
-
"Algorithm": "SC-
|
177 |
"LLM": "Doubao-lite-32k",
|
178 |
"Eval Date": "2025/01/07"
|
179 |
},
|
@@ -205,10 +205,10 @@
|
|
205 |
}
|
206 |
}
|
207 |
},
|
208 |
-
"
|
209 |
"gpt-3.5-turbo": {
|
210 |
"META": {
|
211 |
-
"Algorithm": "
|
212 |
"LLM": "gpt-3.5-turbo",
|
213 |
"Eval Date": "2025/01/07"
|
214 |
},
|
@@ -241,7 +241,7 @@
|
|
241 |
},
|
242 |
"Doubao-lite-32k": {
|
243 |
"META": {
|
244 |
-
"Algorithm": "
|
245 |
"LLM": "Doubao-lite-32k",
|
246 |
"Eval Date": "2025/01/07"
|
247 |
},
|
|
|
69 |
}
|
70 |
}
|
71 |
},
|
72 |
+
"CoT": {
|
73 |
"gpt-3.5-turbo": {
|
74 |
"META": {
|
75 |
+
"Algorithm": "CoT",
|
76 |
"LLM": "gpt-3.5-turbo",
|
77 |
"Eval Date": "2025/01/07"
|
78 |
},
|
|
|
105 |
},
|
106 |
"Doubao-lite-32k": {
|
107 |
"META": {
|
108 |
+
"Algorithm": "CoT",
|
109 |
"LLM": "Doubao-lite-32k",
|
110 |
"Eval Date": "2025/01/07"
|
111 |
},
|
|
|
140 |
"SC-COT": {
|
141 |
"gpt-3.5-turbo": {
|
142 |
"META": {
|
143 |
+
"Algorithm": "SC-CoT",
|
144 |
"LLM": "gpt-3.5-turbo",
|
145 |
"Eval Date": "2025/01/07"
|
146 |
},
|
|
|
173 |
},
|
174 |
"Doubao-lite-32k": {
|
175 |
"META": {
|
176 |
+
"Algorithm": "SC-CoT",
|
177 |
"LLM": "Doubao-lite-32k",
|
178 |
"Eval Date": "2025/01/07"
|
179 |
},
|
|
|
205 |
}
|
206 |
}
|
207 |
},
|
208 |
+
"PoT": {
|
209 |
"gpt-3.5-turbo": {
|
210 |
"META": {
|
211 |
+
"Algorithm": "PoT",
|
212 |
"LLM": "gpt-3.5-turbo",
|
213 |
"Eval Date": "2025/01/07"
|
214 |
},
|
|
|
241 |
},
|
242 |
"Doubao-lite-32k": {
|
243 |
"META": {
|
244 |
+
"Algorithm": "PoT",
|
245 |
"LLM": "Doubao-lite-32k",
|
246 |
"Eval Date": "2025/01/07"
|
247 |
},
|
src/overall_math_score.json
CHANGED
@@ -16,9 +16,9 @@
|
|
16 |
"Cost($)": 0.0380
|
17 |
}
|
18 |
},
|
19 |
-
"
|
20 |
"META": {
|
21 |
-
"Algorithm": "
|
22 |
"LLM": "gpt-3.5-turbo",
|
23 |
"Eval Date": "2025/01/07"
|
24 |
},
|
@@ -31,9 +31,9 @@
|
|
31 |
"Cost($)": 0.0957
|
32 |
}
|
33 |
},
|
34 |
-
"SC-
|
35 |
"META": {
|
36 |
-
"Algorithm": "SC-
|
37 |
"LLM": "gpt-3.5-turbo",
|
38 |
"Eval Date": "2025/01/07"
|
39 |
},
|
@@ -46,9 +46,9 @@
|
|
46 |
"Cost($)": 0.6491
|
47 |
}
|
48 |
},
|
49 |
-
"
|
50 |
"META": {
|
51 |
-
"Algorithm": "
|
52 |
"LLM": "gpt-3.5-turbo",
|
53 |
"Eval Date": "2025/01/07"
|
54 |
},
|
@@ -91,9 +91,9 @@
|
|
91 |
"Cost($)": 0.0058
|
92 |
}
|
93 |
},
|
94 |
-
"
|
95 |
"META": {
|
96 |
-
"Algorithm": "
|
97 |
"LLM": "Doubao-lite-32k",
|
98 |
"Eval Date": "2025/01/07"
|
99 |
},
|
@@ -106,9 +106,9 @@
|
|
106 |
"Cost($)": 0.0066
|
107 |
}
|
108 |
},
|
109 |
-
"SC-
|
110 |
"META": {
|
111 |
-
"Algorithm": "SC-
|
112 |
"LLM": "Doubao-lite-32k",
|
113 |
"Eval Date": "2025/01/07"
|
114 |
},
|
@@ -121,9 +121,9 @@
|
|
121 |
"Cost($)": 0.0409
|
122 |
}
|
123 |
},
|
124 |
-
"
|
125 |
"META": {
|
126 |
-
"Algorithm": "
|
127 |
"LLM": "Doubao-lite-32k",
|
128 |
"Eval Date": "2025/01/07"
|
129 |
},
|
|
|
16 |
"Cost($)": 0.0380
|
17 |
}
|
18 |
},
|
19 |
+
"CoT": {
|
20 |
"META": {
|
21 |
+
"Algorithm": "CoT",
|
22 |
"LLM": "gpt-3.5-turbo",
|
23 |
"Eval Date": "2025/01/07"
|
24 |
},
|
|
|
31 |
"Cost($)": 0.0957
|
32 |
}
|
33 |
},
|
34 |
+
"SC-CoT": {
|
35 |
"META": {
|
36 |
+
"Algorithm": "SC-CoT",
|
37 |
"LLM": "gpt-3.5-turbo",
|
38 |
"Eval Date": "2025/01/07"
|
39 |
},
|
|
|
46 |
"Cost($)": 0.6491
|
47 |
}
|
48 |
},
|
49 |
+
"PoT": {
|
50 |
"META": {
|
51 |
+
"Algorithm": "PoT",
|
52 |
"LLM": "gpt-3.5-turbo",
|
53 |
"Eval Date": "2025/01/07"
|
54 |
},
|
|
|
91 |
"Cost($)": 0.0058
|
92 |
}
|
93 |
},
|
94 |
+
"CoT-Doubao": {
|
95 |
"META": {
|
96 |
+
"Algorithm": "CoT",
|
97 |
"LLM": "Doubao-lite-32k",
|
98 |
"Eval Date": "2025/01/07"
|
99 |
},
|
|
|
106 |
"Cost($)": 0.0066
|
107 |
}
|
108 |
},
|
109 |
+
"SC-CoT-Doubao": {
|
110 |
"META": {
|
111 |
+
"Algorithm": "SC-CoT",
|
112 |
"LLM": "Doubao-lite-32k",
|
113 |
"Eval Date": "2025/01/07"
|
114 |
},
|
|
|
121 |
"Cost($)": 0.0409
|
122 |
}
|
123 |
},
|
124 |
+
"PoT-Doubao": {
|
125 |
"META": {
|
126 |
+
"Algorithm": "PoT",
|
127 |
"LLM": "Doubao-lite-32k",
|
128 |
"Eval Date": "2025/01/07"
|
129 |
},
|