qq-hzlh commited on
Commit
610b4b4
·
1 Parent(s): a7d1809

update algo names

Browse files
Files changed (4) hide show
  1. app.py +7 -26
  2. meta_data.py +14 -7
  3. src/detail_math_score.json +8 -8
  4. src/overall_math_score.json +12 -12
app.py CHANGED
@@ -7,32 +7,13 @@ from meta_data import *
7
  # import pandas as pd
8
  # pd.set_option('display.max_colwidth', 0)
9
 
10
- # head_style = """
11
- # <style>
12
- # @media (min-width: 1536px)
13
- # {
14
- # .gradio-container {
15
- # min-width: var(--size-full) !important;
16
- # }
17
- # }
18
-
19
- # /* Add checkbox styles */
20
- # .gr-checkbox {
21
- # accent-color: rgb(59, 130, 246) !important; /* blue */
22
- # }
23
-
24
- # .gr-checkbox-group label input[type="checkbox"] {
25
- # accent-color: rgb(59, 130, 246) !important;
26
- # }
27
-
28
- # .gr-checkbox-group input[type="checkbox"]:checked {
29
- # background-color: rgb(59, 130, 246) !important;
30
- # border-color: rgb(59, 130, 246) !important;
31
- # }
32
- # </style>
33
- # """
34
-
35
- with gr.Blocks(title="Open Agent Leaderboard") as demo:
36
  struct = load_results(OVERALL_MATH_SCORE_FILE)
37
  timestamp = struct['time']
38
  EVAL_TIME = format_timestamp(timestamp)
 
7
  # import pandas as pd
8
  # pd.set_option('display.max_colwidth', 0)
9
 
10
+
11
+ with gr.Blocks(title="Open Agent Leaderboard", css="""
12
+ .gradio-container p {
13
+ white-space: pre-line !important;
14
+ margin: 0 !important;
15
+ }
16
+ """) as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  struct = load_results(OVERALL_MATH_SCORE_FILE)
18
  timestamp = struct['time']
19
  EVAL_TIME = format_timestamp(timestamp)
meta_data.py CHANGED
@@ -3,7 +3,7 @@ OVERALL_MATH_SCORE_FILE = "src/overall_math_score.json"
3
  DETAIL_MATH_SCORE_FILE = "src/detail_math_score.json"
4
  # CONSTANTS-TEXT
5
  LEADERBORAD_INTRODUCTION = """# Open Agent Leaderboard
6
- ### Welcome to the Open Agent Leaderboard! We share the evaluation results of open agents: COT, SC_COT, POT, ReAct, etc. The agents are impletemented by the OpenSource Framework: [*OmAgent*](https://github.com/om-ai-lab/OmAgent)
7
 
8
  This leaderboard was last updated: {}.
9
 
@@ -26,6 +26,9 @@ LEADERBOARD_MD['MATH_MAIN'] = f"""
26
  - Cost: The cost on each math benchmark (the lower the better).
27
 
28
  - By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}, sorted in descending order of Avg Score.
 
 
 
29
  - ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
30
  """
31
 
@@ -47,6 +50,9 @@ LEADERBOARD_MD['MATH_DETAIL'] = f"""
47
  - Doubao-lite-32k (1 USD = 7.3249 CNY):
48
  - 0.00004096$/1M tokens (input)
49
  - 0.0001$/1M tokens (output)
 
 
 
50
  - ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
51
  """
52
 
@@ -63,12 +69,13 @@ LLM = [
63
  ]
64
 
65
  ALGORITHMS = [
66
- 'IO', 'COT', 'SC_COT', 'POT', 'ReAct-Pro*'
67
  ]
68
 
69
- CITATION_BUTTON_TEXT = r"""@article{zhang2024omagent,
70
- title={OmAgent: A Multi-modal Agent Framework for Complex Video Understanding with Task Divide-and-Conquer},
71
- author={Zhang, Lu and Zhao, Tiancheng and Ying, Heting and Ma, Yibo and Lee, Kyusong},
72
- journal={arXiv preprint arXiv:2406.16620},
73
- year={2024}
 
74
  }"""
 
3
  DETAIL_MATH_SCORE_FILE = "src/detail_math_score.json"
4
  # CONSTANTS-TEXT
5
  LEADERBORAD_INTRODUCTION = """# Open Agent Leaderboard
6
+ ### Welcome to the Open Agent Leaderboard! We share the evaluation results of open agents: CoT, SC-CoT, PoT, ReAct, etc. The agents are implemented by the open-source framework [*OmAgent*](https://github.com/om-ai-lab/OmAgent)
7
 
8
  This leaderboard was last updated: {}.
9
 
 
26
  - Cost: The cost on each math benchmark (the lower the better).
27
 
28
  - By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}, sorted in descending order of Avg Score.
29
+
30
+ - IO (Input-Output): The baseline method that directly prompts the model with the question and expects an answer without any intermediate reasoning steps.
31
+
32
  - ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
33
  """
34
 
 
50
  - Doubao-lite-32k (1 USD = 7.3249 CNY):
51
  - 0.00004096$/1M tokens (input)
52
  - 0.0001$/1M tokens (output)
53
+
54
+ - IO (Input-Output) is the baseline method that directly prompts the model with the question and expects an answer without any intermediate reasoning steps. It represents the most basic way of using language models and serves as a reference point for evaluating the effectiveness of other algorithms.
55
+
56
  - ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
57
  """
58
 
 
69
  ]
70
 
71
  ALGORITHMS = [
72
+ 'IO', 'CoT', 'SC-CoT', 'PoT', 'ReAct-Pro*'
73
  ]
74
 
75
+ CITATION_BUTTON_TEXT = r"""@misc{open-agent-leaderboard,
76
+ title={Open Agent Leaderboard},
77
+ author={Om AI Lab},
78
+ year={2025},
79
+ publisher={GitHub},
80
+ howpublished={\url{https://github.com/om-ai-lab/open-agent-leaderboard}}
81
  }"""
src/detail_math_score.json CHANGED
@@ -69,10 +69,10 @@
69
  }
70
  }
71
  },
72
- "COT": {
73
  "gpt-3.5-turbo": {
74
  "META": {
75
- "Algorithm": "COT",
76
  "LLM": "gpt-3.5-turbo",
77
  "Eval Date": "2025/01/07"
78
  },
@@ -105,7 +105,7 @@
105
  },
106
  "Doubao-lite-32k": {
107
  "META": {
108
- "Algorithm": "COT",
109
  "LLM": "Doubao-lite-32k",
110
  "Eval Date": "2025/01/07"
111
  },
@@ -140,7 +140,7 @@
140
  "SC-COT": {
141
  "gpt-3.5-turbo": {
142
  "META": {
143
- "Algorithm": "SC-COT",
144
  "LLM": "gpt-3.5-turbo",
145
  "Eval Date": "2025/01/07"
146
  },
@@ -173,7 +173,7 @@
173
  },
174
  "Doubao-lite-32k": {
175
  "META": {
176
- "Algorithm": "SC-COT",
177
  "LLM": "Doubao-lite-32k",
178
  "Eval Date": "2025/01/07"
179
  },
@@ -205,10 +205,10 @@
205
  }
206
  }
207
  },
208
- "POT": {
209
  "gpt-3.5-turbo": {
210
  "META": {
211
- "Algorithm": "POT",
212
  "LLM": "gpt-3.5-turbo",
213
  "Eval Date": "2025/01/07"
214
  },
@@ -241,7 +241,7 @@
241
  },
242
  "Doubao-lite-32k": {
243
  "META": {
244
- "Algorithm": "POT",
245
  "LLM": "Doubao-lite-32k",
246
  "Eval Date": "2025/01/07"
247
  },
 
69
  }
70
  }
71
  },
72
+ "CoT": {
73
  "gpt-3.5-turbo": {
74
  "META": {
75
+ "Algorithm": "CoT",
76
  "LLM": "gpt-3.5-turbo",
77
  "Eval Date": "2025/01/07"
78
  },
 
105
  },
106
  "Doubao-lite-32k": {
107
  "META": {
108
+ "Algorithm": "CoT",
109
  "LLM": "Doubao-lite-32k",
110
  "Eval Date": "2025/01/07"
111
  },
 
140
  "SC-COT": {
141
  "gpt-3.5-turbo": {
142
  "META": {
143
+ "Algorithm": "SC-CoT",
144
  "LLM": "gpt-3.5-turbo",
145
  "Eval Date": "2025/01/07"
146
  },
 
173
  },
174
  "Doubao-lite-32k": {
175
  "META": {
176
+ "Algorithm": "SC-CoT",
177
  "LLM": "Doubao-lite-32k",
178
  "Eval Date": "2025/01/07"
179
  },
 
205
  }
206
  }
207
  },
208
+ "PoT": {
209
  "gpt-3.5-turbo": {
210
  "META": {
211
+ "Algorithm": "PoT",
212
  "LLM": "gpt-3.5-turbo",
213
  "Eval Date": "2025/01/07"
214
  },
 
241
  },
242
  "Doubao-lite-32k": {
243
  "META": {
244
+ "Algorithm": "PoT",
245
  "LLM": "Doubao-lite-32k",
246
  "Eval Date": "2025/01/07"
247
  },
src/overall_math_score.json CHANGED
@@ -16,9 +16,9 @@
16
  "Cost($)": 0.0380
17
  }
18
  },
19
- "COT": {
20
  "META": {
21
- "Algorithm": "COT",
22
  "LLM": "gpt-3.5-turbo",
23
  "Eval Date": "2025/01/07"
24
  },
@@ -31,9 +31,9 @@
31
  "Cost($)": 0.0957
32
  }
33
  },
34
- "SC-COT": {
35
  "META": {
36
- "Algorithm": "SC-COT",
37
  "LLM": "gpt-3.5-turbo",
38
  "Eval Date": "2025/01/07"
39
  },
@@ -46,9 +46,9 @@
46
  "Cost($)": 0.6491
47
  }
48
  },
49
- "POT": {
50
  "META": {
51
- "Algorithm": "POT",
52
  "LLM": "gpt-3.5-turbo",
53
  "Eval Date": "2025/01/07"
54
  },
@@ -91,9 +91,9 @@
91
  "Cost($)": 0.0058
92
  }
93
  },
94
- "COT-Doubao": {
95
  "META": {
96
- "Algorithm": "COT",
97
  "LLM": "Doubao-lite-32k",
98
  "Eval Date": "2025/01/07"
99
  },
@@ -106,9 +106,9 @@
106
  "Cost($)": 0.0066
107
  }
108
  },
109
- "SC-COT-Doubao": {
110
  "META": {
111
- "Algorithm": "SC-COT",
112
  "LLM": "Doubao-lite-32k",
113
  "Eval Date": "2025/01/07"
114
  },
@@ -121,9 +121,9 @@
121
  "Cost($)": 0.0409
122
  }
123
  },
124
- "POT-Doubao": {
125
  "META": {
126
- "Algorithm": "POT",
127
  "LLM": "Doubao-lite-32k",
128
  "Eval Date": "2025/01/07"
129
  },
 
16
  "Cost($)": 0.0380
17
  }
18
  },
19
+ "CoT": {
20
  "META": {
21
+ "Algorithm": "CoT",
22
  "LLM": "gpt-3.5-turbo",
23
  "Eval Date": "2025/01/07"
24
  },
 
31
  "Cost($)": 0.0957
32
  }
33
  },
34
+ "SC-CoT": {
35
  "META": {
36
+ "Algorithm": "SC-CoT",
37
  "LLM": "gpt-3.5-turbo",
38
  "Eval Date": "2025/01/07"
39
  },
 
46
  "Cost($)": 0.6491
47
  }
48
  },
49
+ "PoT": {
50
  "META": {
51
+ "Algorithm": "PoT",
52
  "LLM": "gpt-3.5-turbo",
53
  "Eval Date": "2025/01/07"
54
  },
 
91
  "Cost($)": 0.0058
92
  }
93
  },
94
+ "CoT-Doubao": {
95
  "META": {
96
+ "Algorithm": "CoT",
97
  "LLM": "Doubao-lite-32k",
98
  "Eval Date": "2025/01/07"
99
  },
 
106
  "Cost($)": 0.0066
107
  }
108
  },
109
+ "SC-CoT-Doubao": {
110
  "META": {
111
+ "Algorithm": "SC-CoT",
112
  "LLM": "Doubao-lite-32k",
113
  "Eval Date": "2025/01/07"
114
  },
 
121
  "Cost($)": 0.0409
122
  }
123
  },
124
+ "PoT-Doubao": {
125
  "META": {
126
+ "Algorithm": "PoT",
127
  "LLM": "Doubao-lite-32k",
128
  "Eval Date": "2025/01/07"
129
  },