qq-hzlh committed on
Commit fc71d05 · verified · 1 Parent(s): 3954c20

Upload 5 files

Files changed (5)
  1. app.py +188 -189
  2. detail_math_score.json +345 -0
  3. gen_table.py +218 -0
  4. meta_data.py +68 -0
  5. overall_math_score.json +155 -0
app.py CHANGED
@@ -1,204 +1,203 @@
 
1
  import gradio as gr
2
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
- import pandas as pd
4
- from apscheduler.schedulers.background import BackgroundScheduler
5
- from huggingface_hub import snapshot_download
6
-
7
- from src.about import (
8
- CITATION_BUTTON_LABEL,
9
- CITATION_BUTTON_TEXT,
10
- EVALUATION_QUEUE_TEXT,
11
- INTRODUCTION_TEXT,
12
- LLM_BENCHMARKS_TEXT,
13
- TITLE,
14
- )
15
- from src.display.css_html_js import custom_css
16
- from src.display.utils import (
17
- BENCHMARK_COLS,
18
- COLS,
19
- EVAL_COLS,
20
- EVAL_TYPES,
21
- AutoEvalColumn,
22
- ModelType,
23
- fields,
24
- WeightType,
25
- Precision
26
- )
27
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
- from src.submission.submit import add_new_eval
30
-
31
-
32
- def restart_space():
33
- API.restart_space(repo_id=REPO_ID)
34
-
35
- ### Space initialisation
36
- try:
37
- print(EVAL_REQUESTS_PATH)
38
- snapshot_download(
39
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
40
- )
41
- except Exception:
42
- restart_space()
43
- try:
44
- print(EVAL_RESULTS_PATH)
45
- snapshot_download(
46
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
- )
48
- except Exception:
49
- restart_space()
50
-
51
-
52
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
-
54
- (
55
- finished_eval_queue_df,
56
- running_eval_queue_df,
57
- pending_eval_queue_df,
58
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
-
60
- def init_leaderboard(dataframe):
61
- if dataframe is None or dataframe.empty:
62
- raise ValueError("Leaderboard DataFrame is empty or None.")
63
- return Leaderboard(
64
- value=dataframe,
65
- datatype=[c.type for c in fields(AutoEvalColumn)],
66
- select_columns=SelectColumns(
67
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
- label="Select Columns to Display:",
70
- ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
- filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
- ColumnFilter(
77
- AutoEvalColumn.params.name,
78
- type="slider",
79
- min=0.01,
80
- max=150,
81
- label="Select the number of parameters (B)",
82
- ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
- ],
87
- bool_checkboxgroup_label="Hide models",
88
- interactive=False,
89
- )
90
-
91
-
92
- demo = gr.Blocks(css=custom_css)
93
- with demo:
94
- gr.HTML(TITLE)
95
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
-
97
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
- leaderboard = init_leaderboard(LEADERBOARD_DF)
100
-
101
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
-
104
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
- with gr.Column():
106
- with gr.Row():
107
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
-
109
- with gr.Column():
110
- with gr.Accordion(
111
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
112
- open=False,
113
- ):
114
- with gr.Row():
115
- finished_eval_table = gr.components.Dataframe(
116
- value=finished_eval_queue_df,
117
- headers=EVAL_COLS,
118
- datatype=EVAL_TYPES,
119
- row_count=5,
120
- )
121
- with gr.Accordion(
122
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
123
- open=False,
124
- ):
125
- with gr.Row():
126
- running_eval_table = gr.components.Dataframe(
127
- value=running_eval_queue_df,
128
- headers=EVAL_COLS,
129
- datatype=EVAL_TYPES,
130
- row_count=5,
131
- )
132
-
133
- with gr.Accordion(
134
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
- open=False,
136
- ):
137
- with gr.Row():
138
- pending_eval_table = gr.components.Dataframe(
139
- value=pending_eval_queue_df,
140
- headers=EVAL_COLS,
141
- datatype=EVAL_TYPES,
142
- row_count=5,
143
- )
144
- with gr.Row():
145
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
146
 
147
  with gr.Row():
148
- with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
- label="Model type",
154
- multiselect=False,
155
- value=None,
156
- interactive=True,
157
  )
158
 
159
- with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
163
- multiselect=False,
164
- value="float16",
165
- interactive=True,
166
  )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
- multiselect=False,
171
- value="Original",
172
- interactive=True,
173
  )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
-
176
- submit_button = gr.Button("Submit Eval")
177
- submission_result = gr.Markdown()
178
- submit_button.click(
179
- add_new_eval,
180
- [
181
- model_name_textbox,
182
- base_model_name_textbox,
183
- revision_name_textbox,
184
- precision,
185
- weight_type,
186
- model_type,
187
- ],
188
- submission_result,
189
  )
190
 
191
  with gr.Row():
192
  with gr.Accordion("📙 Citation", open=False):
193
- citation_button = gr.Textbox(
194
- value=CITATION_BUTTON_TEXT,
195
- label=CITATION_BUTTON_LABEL,
196
- lines=20,
197
  elem_id="citation-button",
198
  show_copy_button=True,
199
  )
200
 
201
- scheduler = BackgroundScheduler()
202
- scheduler.add_job(restart_space, "interval", seconds=1800)
203
- scheduler.start()
204
- demo.queue(default_concurrency_limit=40).launch()
 
1
+ import abc
2
  import gradio as gr
3
 
4
+ from gen_table import *
5
+ from meta_data import *
6
+
7
+ # import pandas as pd
8
+ # pd.set_option('display.max_colwidth', 0)
9
+
10
+ head_style = """
11
+ <style>
12
+ @media (min-width: 1536px)
13
+ {
14
+ .gradio-container {
15
+ min-width: var(--size-full) !important;
16
+ }
17
+ }
18
+ </style>
19
+ """
20
+
21
+ with gr.Blocks(title="Open Agent Leaderboard", head=head_style) as demo:
22
+ struct = load_results(OVERALL_MATH_SCORE_FILE)
23
+ timestamp = struct['time']
24
+ EVAL_TIME = format_timestamp(timestamp)
25
+ results = struct['results']
26
+ N_MODEL = len(results)
27
+ N_DATA = len(results['IO'])
28
+ DATASETS = list(results['IO'])
29
+ DATASETS.remove('META')
30
+ print(DATASETS)
31
+
32
+
33
+
34
+ with gr.Tabs(elem_classes='tab-buttons') as tabs:
35
+ gr.Markdown(LEADERBORAD_INTRODUCTION.format(EVAL_TIME))
36
+
37
+ with gr.TabItem('🏅 Open Agent Overall Math Leaderboard', elem_id='math', id=0):
38
+ gr.Markdown(LEADERBOARD_MD['MATH_MAIN'])
39
+ check_box = BUILD_L1_DF(results, DEFAULT_MATH_BENCH)
40
+ table = generate_table(results, DEFAULT_MATH_BENCH)
41
+
42
+ type_map = check_box['type_map']
43
+ type_map['Rank'] = 'number'
44
+
45
+ checkbox_group = gr.CheckboxGroup(
46
+ choices=check_box['all'],
47
+ value=check_box['required'],
48
+ label='Evaluation Dimension',
49
+ interactive=True,
50
+ )
51
+
52
+ headers = ['Rank'] + check_box['essential'] + checkbox_group.value
53
+ data_component = gr.components.DataFrame(
54
+ value=table[headers],
55
+ type='pandas',
56
+ datatype=[type_map[x] for x in headers],
57
+ interactive=False,
58
+ wrap=True,
59
+ visible=True)
60
+
61
+ def filter_df(fields, *args):
62
+ # Get the essential columns plus the currently selected columns
63
+ headers = ['Rank'] + check_box['essential'] + fields
64
+ df = table.copy()
65
+
66
+ comp = gr.components.DataFrame(
67
+ value=table[headers], # only display the selected columns
68
+ type='pandas',
69
+ datatype=[type_map[x] for x in headers],
70
+ interactive=False,
71
+ wrap=True,
72
+ visible=True)
73
+ return comp
74
+
75
+ # The change event of checkbox_group only needs checkbox_group as its input
76
+ checkbox_group.change(
77
+ fn=filter_df,
78
+ inputs=[checkbox_group],
79
+ outputs=data_component
80
+ )
81
+
82
+ # detail math leaderboard
83
+ with gr.TabItem('🏅 Open Agent Detail Math Leaderboard', elem_id='math_detail', id=1):
84
+ gr.Markdown(LEADERBOARD_MD['MATH_DETAIL'])
85
+ struct_detail = load_results(DETAIL_MATH_SCORE_FILE)
86
+ timestamp = struct_detail['time']
87
+ EVAL_TIME = format_timestamp(timestamp)
88
+ results_detail = struct_detail['results']
89
+
90
+ table, check_box = BUILD_L2_DF(results_detail, DEFAULT_MATH_BENCH)
91
+ # table = generate_table_detail(results_detail, DEFAULT_MATH_BENCH)
92
+
93
+ type_map = check_box['type_map']
94
+ type_map['Rank'] = 'number'
95
+
96
+ checkbox_group = gr.CheckboxGroup(
97
+ choices=check_box['all'],
98
+ value=check_box['required'],
99
+ label='Evaluation Dimension',
100
+ interactive=True,
101
+ )
102
+
103
+ headers = ['Rank'] + checkbox_group.value
104
  with gr.Row():
105
+
106
+ algo_name = gr.CheckboxGroup(
107
+ choices=ALGORITHMS,
108
+ value=ALGORITHMS,
109
+ label='Algorithm',
110
+ interactive=True
111
  )
112
 
113
+ dataset_name = gr.CheckboxGroup(
114
+ choices=DATASETS,
115
+ value=DATASETS,
116
+ label='Datasets',
117
+ interactive=True
118
  )
119
+
120
+ llm_name = gr.CheckboxGroup(
121
+ choices=LLM,
122
+ value=LLM,
123
+ label='LLM',
124
+ interactive=True
125
  )
126
+
127
+ data_component = gr.components.DataFrame(
128
+ value=table[headers],
129
+ type='pandas',
130
+ datatype=[type_map[x] for x in headers],
131
+ interactive=False,
132
+ wrap=True,
133
+ visible=True)
134
+
135
+ def filter_df(fields, algos, datasets, llms):
136
+ headers = ['Rank'] + check_box['essential'] + fields
137
+ df = table.copy()
138
+
139
+ # Filter the data
140
+ df['flag'] = df.apply(lambda row: (
141
+ row['Algorithm'] in algos and
142
+ row['Dataset'] in datasets and
143
+ row['LLM'] in llms
144
+ ), axis=1)
145
+
146
+ df = df[df['flag']].copy()
147
+ df.pop('flag')
148
+
149
+ # Group by dataset, sort by Score within each group, and compute the rank
150
+ if 'Score' in df.columns:
151
+ # Create a temporary rank column
152
+ df['Rank'] = df.groupby('Dataset')['Score'].rank(method='first', ascending=False)
153
+
154
+ # Make sure the rank is an integer
155
+ df['Rank'] = df['Rank'].astype(int)
156
+
157
+ comp = gr.components.DataFrame(
158
+ value=df[headers],
159
+ type='pandas',
160
+ datatype=[type_map[x] for x in headers],
161
+ interactive=False,
162
+ wrap=True,
163
+ visible=True)
164
+ return comp
165
+
166
+ # Add change events for all checkbox groups
167
+ checkbox_group.change(
168
+ fn=filter_df,
169
+ inputs=[checkbox_group, algo_name, dataset_name, llm_name],
170
+ outputs=data_component
171
+ )
172
+
173
+ algo_name.change(
174
+ fn=filter_df,
175
+ inputs=[checkbox_group, algo_name, dataset_name, llm_name],
176
+ outputs=data_component
177
  )
178
+
179
+ dataset_name.change(
180
+ fn=filter_df,
181
+ inputs=[checkbox_group, algo_name, dataset_name, llm_name],
182
+ outputs=data_component
183
+ )
184
+
185
+ llm_name.change(
186
+ fn=filter_df,
187
+ inputs=[checkbox_group, algo_name, dataset_name, llm_name],
188
+ outputs=data_component
189
+ )
190
+
191
 
192
  with gr.Row():
193
  with gr.Accordion("📙 Citation", open=False):
194
+ gr.Textbox(
195
+ value=CITATION_BUTTON_TEXT, lines=7,
196
+ label="Copy the BibTeX snippet to cite this source",
 
197
  elem_id="citation-button",
198
  show_copy_button=True,
199
  )
200
 
201
+
202
+ if __name__ == '__main__':
203
+ demo.launch(server_name='0.0.0.0')
 
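For reference, the detail tab's `filter_df` above first drops rows whose Algorithm/Dataset/LLM are unselected and then recomputes `Rank` within each dataset. A minimal standalone sketch of that logic (toy data, not part of the commit):

```python
import pandas as pd

# Toy table with the same columns the detail leaderboard uses
table = pd.DataFrame({
    'Algorithm': ['IO', 'COT', 'IO', 'COT'],
    'Dataset':   ['gsm8k', 'gsm8k', 'AQuA', 'AQuA'],
    'LLM':       ['gpt-3.5-turbo'] * 4,
    'Score':     [37.83, 78.70, 38.98, 61.02],
})

# Keep only the selected algorithms/datasets/LLMs, then re-rank per dataset
algos, datasets, llms = ['IO', 'COT'], ['gsm8k'], ['gpt-3.5-turbo']
df = table[table['Algorithm'].isin(algos)
           & table['Dataset'].isin(datasets)
           & table['LLM'].isin(llms)].copy()
df['Rank'] = (df.groupby('Dataset')['Score']
                .rank(method='first', ascending=False)
                .astype(int))
print(df.sort_values(['Dataset', 'Rank']))
```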
detail_math_score.json ADDED
@@ -0,0 +1,345 @@
1
+ {
2
+ "time": "2025-01-09 17:13:45",
3
+ "results": {
4
+ "IO": {
5
+ "gpt-3.5-turbo": {
6
+ "META": {
7
+ "Algorithm": "IO",
8
+ "LLM": "gpt-3.5-turbo",
9
+ "Eval Date": "2025/01/07"
10
+ },
11
+ "gsm8k": {
12
+ "Score": 37.83,
13
+ "Pass rate": 99.92,
14
+ "X-shot": 8,
15
+ "Parameters": "",
16
+ "Samples": 1319,
17
+ "Total input tokens": 546990,
18
+ "Average input tokens": 415,
19
+ "Total output tokens": 39563,
20
+ "Average output tokens": 30,
21
+ "All tokens": 586553,
22
+ "Cost($)": 0.3328
23
+ },
24
+ "AQuA": {
25
+ "Score": 38.98,
26
+ "Pass rate": 100.00,
27
+ "X-shot": 0,
28
+ "Parameters": "",
29
+ "Samples": 254,
30
+ "Total input tokens": 25701,
31
+ "Average input tokens": 101,
32
+ "Total output tokens": 16770,
33
+ "Average output tokens": 66,
34
+ "All tokens": 42471,
35
+ "Cost($)": 0.0380
36
+ }
37
+ },
38
+ "Doubao-lite-32k": {
39
+ "META": {
40
+ "Algorithm": "IO",
41
+ "LLM": "Doubao-lite-32k",
42
+ "Eval Date": "2025/01/07"
43
+ },
44
+ "gsm8k": {
45
+ "Score": 72.02,
46
+ "Pass rate": 99.92,
47
+ "X-shot": 8,
48
+ "Parameters": "",
49
+ "Samples": 1319,
50
+ "Total input tokens": 617377,
51
+ "Average input tokens": 468,
52
+ "Total output tokens": 123106,
53
+ "Average output tokens": 93,
54
+ "All tokens": 740483,
55
+ "Cost($)": 0.0354
56
+ },
57
+ "AQuA": {
58
+ "Score": 79.13,
59
+ "Pass rate": 100.00,
60
+ "X-shot": 0,
61
+ "Parameters": "",
62
+ "Samples": 254,
63
+ "Total input tokens": 33058,
64
+ "Average input tokens": 130,
65
+ "Total output tokens": 54684,
66
+ "Average output tokens": 215,
67
+ "All tokens": 87742,
68
+ "Cost($)": 0.0058
69
+ }
70
+ }
71
+ },
72
+ "COT": {
73
+ "gpt-3.5-turbo": {
74
+ "META": {
75
+ "Algorithm": "COT",
76
+ "LLM": "gpt-3.5-turbo",
77
+ "Eval Date": "2025/01/07"
78
+ },
79
+ "gsm8k": {
80
+ "Score": 78.70,
81
+ "Pass rate": 100.00,
82
+ "X-shot": 8,
83
+ "Parameters": "",
84
+ "Samples": 1319,
85
+ "Total input tokens": 953242,
86
+ "Average input tokens": 723,
87
+ "Total output tokens": 134799,
88
+ "Average output tokens": 102,
89
+ "All tokens": 1088041,
90
+ "Cost($)": 0.6788
91
+ },
92
+ "AQuA": {
93
+ "Score": 61.02,
94
+ "Pass rate": 93.70,
95
+ "X-shot": 0,
96
+ "Parameters": "",
97
+ "Samples": 254,
98
+ "Total input tokens": 25447,
99
+ "Average input tokens": 100,
100
+ "Total output tokens": 55346,
101
+ "Average output tokens": 218,
102
+ "All tokens": 80793,
103
+ "Cost($)": 0.0957
104
+ }
105
+ },
106
+ "Doubao-lite-32k": {
107
+ "META": {
108
+ "Algorithm": "COT",
109
+ "LLM": "Doubao-lite-32k",
110
+ "Eval Date": "2025/01/07"
111
+ },
112
+ "gsm8k": {
113
+ "Score": 89.31,
114
+ "Pass rate": 100.00,
115
+ "X-shot": 8,
116
+ "Parameters": "",
117
+ "Samples": 1319,
118
+ "Total input tokens": 1042095,
119
+ "Average input tokens": 790,
120
+ "Total output tokens": 159725,
121
+ "Average output tokens": 121,
122
+ "All tokens": 1201820,
123
+ "Cost($)": 0.0557
124
+ },
125
+ "AQuA": {
126
+ "Score": 82.68,
127
+ "Pass rate": 97.24,
128
+ "X-shot": 0,
129
+ "Parameters": "",
130
+ "Samples": 254,
131
+ "Total input tokens": 27978,
132
+ "Average input tokens": 110,
133
+ "Total output tokens": 66599,
134
+ "Average output tokens": 262,
135
+ "All tokens": 94577,
136
+ "Cost($)": 0.0066
137
+ }
138
+ }
139
+ },
140
+ "SC-COT": {
141
+ "gpt-3.5-turbo": {
142
+ "META": {
143
+ "Algorithm": "SC-COT",
144
+ "LLM": "gpt-3.5-turbo",
145
+ "Eval Date": "2025/01/07"
146
+ },
147
+ "gsm8k": {
148
+ "Score": 80.06,
149
+ "Pass rate": 99.62,
150
+ "X-shot": 8,
151
+ "Parameters": "temperature=1, num_path=5",
152
+ "Samples": 1319,
153
+ "Total input tokens": 5260319,
154
+ "Average input tokens": 3988,
155
+ "Total output tokens": 1595016,
156
+ "Average output tokens": 1209,
157
+ "All tokens": 6855335,
158
+ "Cost($)": 5.0227
159
+ },
160
+ "AQuA": {
161
+ "Score": 67.32,
162
+ "Pass rate": 100.00,
163
+ "X-shot": 0,
164
+ "Parameters": "temperature=1, path_num=5",
165
+ "Samples": 254,
166
+ "Total input tokens": 219241,
167
+ "Average input tokens": 863,
168
+ "Total output tokens": 359629,
169
+ "Average output tokens": 1416,
170
+ "All tokens": 578870,
171
+ "Cost($)": 0.6491
172
+ }
173
+ },
174
+ "Doubao-lite-32k": {
175
+ "META": {
176
+ "Algorithm": "SC-COT",
177
+ "LLM": "Doubao-lite-32k",
178
+ "Eval Date": "2025/01/07"
179
+ },
180
+ "gsm8k": {
181
+ "Score": 88.63,
182
+ "Pass rate": 99.77,
183
+ "X-shot": 8,
184
+ "Parameters": "temperature=1, num_path=5",
185
+ "Samples": 1319,
186
+ "Total input tokens": 1150443,
187
+ "Average input tokens": 872,
188
+ "Total output tokens": 1295750,
189
+ "Average output tokens": 982,
190
+ "All tokens": 2446193,
191
+ "Cost($)": 0.1533
192
+ },
193
+ "AQuA": {
194
+ "Score": 83.46,
195
+ "Pass rate": 97.24,
196
+ "X-shot": 0,
197
+ "Parameters": "temperature=1, num_path=5",
198
+ "Samples": 254,
199
+ "Total input tokens": 259804,
200
+ "Average input tokens": 1023,
201
+ "Total output tokens": 369741,
202
+ "Average output tokens": 1456,
203
+ "All tokens": 629545,
204
+ "Cost($)": 0.0409
205
+ }
206
+ }
207
+ },
208
+ "POT": {
209
+ "gpt-3.5-turbo": {
210
+ "META": {
211
+ "Algorithm": "POT",
212
+ "LLM": "gpt-3.5-turbo",
213
+ "Eval Date": "2025/01/07"
214
+ },
215
+ "gsm8k": {
216
+ "Score": 76.88,
217
+ "Pass rate": 99.24,
218
+ "X-shot": 8,
219
+ "Parameters": "",
220
+ "Samples": 1319,
221
+ "Total input tokens": 1090418,
222
+ "Average input tokens": 827,
223
+ "Total output tokens": 96662,
224
+ "Average output tokens": 73,
225
+ "All tokens": 1187080,
226
+ "Cost($)": 0.6902
227
+ },
228
+ "AQuA": {
229
+ "Score": 51.97,
230
+ "Pass rate": 92.91,
231
+ "X-shot": 0,
232
+ "Parameters": "",
233
+ "Samples": 254,
234
+ "Total input tokens": 223438,
235
+ "Average input tokens": 880,
236
+ "Total output tokens": 29323,
237
+ "Average output tokens": 115,
238
+ "All tokens": 252761,
239
+ "Cost($)": 0.1557
240
+ }
241
+ },
242
+ "Doubao-lite-32k": {
243
+ "META": {
244
+ "Algorithm": "POT",
245
+ "LLM": "Doubao-lite-32k",
246
+ "Eval Date": "2025/01/07"
247
+ },
248
+ "gsm8k": {
249
+ "Score": 79.15,
250
+ "Pass rate": 92.65,
251
+ "X-shot": 8,
252
+ "Parameters": "",
253
+ "Samples": 1319,
254
+ "Total input tokens": 1170038,
255
+ "Average input tokens": 887,
256
+ "Total output tokens": 116987,
257
+ "Average output tokens": 89,
258
+ "All tokens": 1287025,
259
+ "Cost($)": 0.0575
260
+ },
261
+ "AQuA": {
262
+ "Score": 52.36,
263
+ "Pass rate": 82.28,
264
+ "X-shot": 0,
265
+ "Parameters": "",
266
+ "Samples": 254,
267
+ "Total input tokens": 256721,
268
+ "Average input tokens": 1011,
269
+ "Total output tokens": 44729,
270
+ "Average output tokens": 176,
271
+ "All tokens": 301450,
272
+ "Cost($)": 0.0142
273
+ }
274
+ }
275
+ },
276
+ "ReAct-Pro": {
277
+ "gpt-3.5-turbo": {
278
+ "META": {
279
+ "Algorithm": "ReAct-Pro",
280
+ "LLM": "gpt-3.5-turbo",
281
+ "Eval Date": "2025/01/07"
282
+ },
283
+ "gsm8k": {
284
+ "Score": 74.91,
285
+ "Pass rate": 99.39,
286
+ "X-shot": 8,
287
+ "Parameters": "max_steps=10",
288
+ "Samples": 1319,
289
+ "Total input tokens": 6506164,
290
+ "Average input tokens": 4933,
291
+ "Total output tokens": 140122,
292
+ "Average output tokens": 106,
293
+ "All tokens": 6646286,
294
+ "Cost($)": 3.4633
295
+ },
296
+ "AQuA": {
297
+ "Score": 64.57,
298
+ "Pass rate": 98.03,
299
+ "X-shot": 0,
300
+ "Parameters": "max_steps=10",
301
+ "Samples": 254,
302
+ "Total input tokens": 862614,
303
+ "Average input tokens": 3396,
304
+ "Total output tokens": 40973,
305
+ "Average output tokens": 161,
306
+ "All tokens": 903587,
307
+ "Cost($)": 0.4928
308
+ }
309
+ },
310
+ "Doubao-lite-32k": {
311
+ "META": {
312
+ "Algorithm": "ReAct-Pro",
313
+ "LLM": "Doubao-lite-32k",
314
+ "Eval Date": "2025/01/07"
315
+ },
316
+ "gsm8k": {
317
+ "Score": 85.60,
318
+ "Pass rate": 99.62,
319
+ "X-shot": 8,
320
+ "Parameters": "max_steps=10",
321
+ "Samples": 1319,
322
+ "Total input tokens": 5862016,
323
+ "Average input tokens": 4444,
324
+ "Total output tokens": 136623,
325
+ "Average output tokens": 104,
326
+ "All tokens": 5998639,
327
+ "Cost($)": 0.2513
328
+ },
329
+ "AQuA": {
330
+ "Score": 77.56,
331
+ "Pass rate": 96.06,
332
+ "X-shot": 0,
333
+ "Parameters": "max_steps=10",
334
+ "Samples": 254,
335
+ "Total input tokens": 977890,
336
+ "Average input tokens": 3850,
337
+ "Total output tokens": 54951,
338
+ "Average output tokens": 216,
339
+ "All tokens": 1032841,
340
+ "Cost($)": 0.0446
341
+ }
342
+ }
343
+ }
344
+ }
345
+ }
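The file nests scores as algorithm → LLM → dataset, with a `META` block per LLM. A quick, illustrative way to pull a single metric out of it:

```python
import json

with open("detail_math_score.json") as f:
    data = json.load(f)

# algorithm -> LLM -> dataset -> metric
score = data["results"]["COT"]["gpt-3.5-turbo"]["gsm8k"]["Score"]
print(score)  # 78.7
```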
gen_table.py ADDED
@@ -0,0 +1,218 @@
1
+ import copy as cp
2
+ import json
3
+ from collections import defaultdict
4
+ from urllib.request import urlopen
5
+
6
+ import gradio as gr
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ from meta_data import OVERALL_MATH_SCORE_FILE, DEFAULT_MATH_BENCH, META_FIELDS
11
+
12
+
13
+ def listinstr(lst, s):
14
+ assert isinstance(lst, list)
15
+ for item in lst:
16
+ if item in s:
17
+ return True
18
+ return False
19
+
20
+
21
+ def load_results(file_name=OVERALL_MATH_SCORE_FILE):
22
+ data = json.loads(open(file_name, "r").read())
23
+ return data
24
+
25
+ def format_timestamp(timestamp):
26
+ date = timestamp[:10]
27
+ time = timestamp[11:13] + ':' + timestamp[14:16] + ':' + timestamp[17:19]
28
+ return date + ' ' + time
29
+
30
+ def nth_large(val, vals):
31
+ return sum([1 for v in vals if v > val]) + 1
32
+
33
+ def BUILD_L1_DF(results, fields):
34
+ check_box = {}
35
+ check_box['essential'] = ['Algorithm', 'LLM', 'Eval Date']
36
+ # revise here to set the default datasets
37
+ check_box['required'] = ['Avg Score'] + [item for f in fields for item in (f'{f}-Score', f'{f}-Cost($)')]
38
+ check_box['avg'] = ['Avg Score']
39
+ check_box['all'] = check_box['avg'] + [item for f in fields for item in (f'{f}-Score', f'{f}-Cost($)')]
40
+ type_map = defaultdict(lambda: 'number')
41
+ type_map['Algorithm'] = 'html'
42
+ type_map['LLM'] = type_map['Vision Model'] = 'html'
43
+ type_map['Eval Date'] = 'str'
44
+ check_box['type_map'] = type_map
45
+
46
+ # df = generate_table(results, fields)
47
+ return check_box
48
+
49
+
50
+ def BUILD_L2_DF(results, fields):
51
+ res = defaultdict(list)
52
+
53
+ # Iterate over each algorithm and its corresponding models
54
+ for algo_name, algo_data in results.items():
55
+ for model_name, model_data in algo_data.items():
56
+ # Get META information
57
+ meta = model_data['META']
58
+
59
+ # Create a record for each dataset
60
+ for dataset in fields:
61
+ if dataset not in model_data:
62
+ continue
63
+
64
+ # Add metadata
65
+ for k, v in meta.items():
66
+ res[k].append(v)
67
+
68
+ # Add dataset name
69
+ res['Dataset'].append(dataset)
70
+
71
+ # Get dataset data
72
+ dataset_data = model_data[dataset]
73
+
74
+ # Add all fields
75
+ for field, value in dataset_data.items():
76
+ res[field].append(value)
77
+
78
+ # Create DataFrame
79
+ df = pd.DataFrame(res)
80
+
81
+ # Sort by Dataset and Score in descending order
82
+ df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
83
+
84
+ # Add rank for each dataset separately
85
+ df['Rank'] = df.groupby('Dataset').cumcount() + 1
86
+
87
+ # Rearrange column order
88
+ columns = ['Rank', 'Algorithm', 'Dataset', 'LLM', 'Eval Date', 'Score', 'Pass rate', 'X-shot', 'Parameters']
89
+ remaining_columns = [col for col in df.columns if col not in columns]
90
+ df = df[columns + remaining_columns]
91
+
92
+ # Set checkbox configuration
93
+ check_box = {}
94
+ check_box['essential'] = ['Algorithm', 'Dataset', 'LLM', 'Eval Date']
95
+ check_box['required'] = check_box['essential'] + ['Score', 'Pass rate', 'X-shot', 'Parameters', 'Samples', 'All tokens', 'Cost($)']
96
+ check_box['all'] = ['Score', 'Pass rate', 'X-shot', 'Parameters', 'Samples', 'Total input tokens', 'Average input tokens', 'Total output tokens', 'Average output tokens', 'All tokens', 'Cost($)']
97
+ type_map = defaultdict(lambda: 'number')
98
+ type_map['Algorithm'] = 'html'
99
+ type_map['LLM'] = type_map['Vision Model'] = 'html'
100
+ type_map['Eval Date'] = 'str'
101
+ type_map['Dataset'] = 'str'
102
+ type_map['Parameters'] = 'str'
103
+ type_map['All tokens'] = 'number'
104
+ type_map['Cost($)'] = 'number'
105
+ check_box['type_map'] = type_map
106
+
107
+
108
+ return df, check_box
109
+
110
+
111
+ def generate_table(results, fields):
112
+ res = defaultdict(list)
113
+ for i, m in enumerate(results):
114
+ item = results[m]
115
+ meta = item['META']
116
+ for k in META_FIELDS:
117
+ res[k].append(meta[k])
118
+ scores, costs = [], []
119
+ for d in fields:
120
+ if d in item.keys():
121
+ res[d+"-Score"].append(item[d]["Score"])
122
+ res[d+"-Cost($)"].append(item[d]["Cost($)"])
123
+ scores.append(item[d]["Score"])
124
+ costs.append(item[d]["Cost($)"])
125
+ else:
126
+ res[d+"-Score"].append(None)
127
+ res[d+"-Cost($)"].append(None)
128
+ scores.append(None)
129
+ costs.append(None)
130
+
131
+ res['Avg Score'].append(round(np.mean(scores), 2) if None not in scores else None)
132
+
133
+ df = pd.DataFrame(res)
134
+
135
+ # Sort by Avg Score and assign rank
136
+ valid = df[~pd.isna(df['Avg Score'])].copy()
137
+ missing = df[pd.isna(df['Avg Score'])].copy()
138
+
139
+ # Assign rank to valid rows (using integer type)
140
+ valid = valid.sort_values('Avg Score', ascending=False)
141
+ valid['Rank'] = pd.Series(range(1, len(valid) + 1)[::-1], dtype=int)
142
+
143
+ # Assign last rank to missing rows (using integer type)
144
+ if not missing.empty:
145
+ missing['Rank'] = pd.Series([len(valid) + 1] * len(missing), dtype=int)
146
+
147
+ # Merge and sort by Rank
148
+ df = pd.concat([valid, missing])
149
+ df = df.sort_values('Rank')
150
+
151
+ # Rearrange column order to ensure Rank is the first column
152
+ columns = ['Rank', 'Algorithm', 'LLM', 'Eval Date', 'Avg Score'] # Fixed column order
153
+ for d in fields:
154
+ columns.extend([f"{d}-Score", f"{d}-Cost($)"]) # Add dataset-related columns
155
+
156
+ # Ensure all columns exist and reorder
157
+ existing_columns = [col for col in columns if col in df.columns]
158
+ remaining_columns = [col for col in df.columns if col not in columns]
159
+ df = df[existing_columns + remaining_columns] # Reorder columns
160
+
161
+ # Sort by Score in descending order
162
+ df = df.sort_values(['Avg Score'], ascending=[False])
163
+
164
+ # Add rank for each dataset separately
165
+ df['Rank'] = range(1, len(df) + 1)
166
+
167
+ # Rearrange column order
168
+ columns = ['Rank', 'Algorithm', 'LLM', 'Eval Date', 'Avg Score']
169
+ remaining_columns = [col for col in df.columns if col not in columns]
170
+ df = df[columns + remaining_columns]
171
+ return df
172
+
173
+
174
+
175
+
176
+ def generate_table_detail(results, fields):
177
+ res = defaultdict(list)
178
+
179
+ # Iterate over each algorithm and its corresponding models
180
+ for algo_name, algo_data in results.items():
181
+ for model_name, model_data in algo_data.items():
182
+ # Get META information
183
+ meta = model_data['META']
184
+
185
+ # Create a record for each dataset
186
+ for dataset in fields:
187
+ if dataset not in model_data:
188
+ continue
189
+
190
+ # Add metadata
191
+ for k, v in meta.items():
192
+ res[k].append(v)
193
+
194
+ # Add dataset name
195
+ res['Dataset'].append(dataset)
196
+
197
+ # Get dataset data
198
+ dataset_data = model_data[dataset]
199
+
200
+ # Add all fields
201
+ for field, value in dataset_data.items():
202
+ res[field].append(value)
203
+
204
+ # Create DataFrame
205
+ df = pd.DataFrame(res)
206
+
207
+ # Sort by Dataset and Score in descending order
208
+ df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
209
+
210
+ # Add rank for each dataset separately
211
+ df['Rank'] = df.groupby('Dataset').cumcount() + 1
212
+
213
+ # Rearrange column order
214
+ columns = ['Rank', 'Dataset', 'Algorithm', 'LLM', 'Eval Date', 'Score', 'Pass rate', 'X-shot', 'Parameters']
215
+ remaining_columns = [col for col in df.columns if col not in columns]
216
+ df = df[columns + remaining_columns]
217
+
218
+ return df
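Taken together, `gen_table.py` turns the two JSON files into the DataFrames the app renders. An illustrative driver, assuming the JSON files sit next to the script (not part of the commit):

```python
from gen_table import load_results, generate_table, BUILD_L1_DF, BUILD_L2_DF
from meta_data import (OVERALL_MATH_SCORE_FILE, DETAIL_MATH_SCORE_FILE,
                       DEFAULT_MATH_BENCH)

# Main leaderboard: one row per algorithm/LLM pair, ranked by Avg Score
overall = load_results(OVERALL_MATH_SCORE_FILE)['results']
check_box = BUILD_L1_DF(overall, DEFAULT_MATH_BENCH)   # column configuration only
main_table = generate_table(overall, DEFAULT_MATH_BENCH)

# Detail leaderboard: one row per algorithm/LLM/dataset, ranked within each dataset
detail = load_results(DETAIL_MATH_SCORE_FILE)['results']
detail_table, detail_check_box = BUILD_L2_DF(detail, DEFAULT_MATH_BENCH)

print(main_table[['Rank', 'Algorithm', 'LLM', 'Avg Score']].head())
print(detail_table[['Rank', 'Dataset', 'Algorithm', 'Score']].head())
```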
meta_data.py ADDED
@@ -0,0 +1,68 @@
1
+ # CONSTANTS-URL
2
+ URL = "http://opencompass.openxlab.space/assets/OpenVLM.json"
3
+ OVERALL_MATH_SCORE_FILE = "overall_math_score.json"
4
+ DETAIL_MATH_SCORE_FILE = "detail_math_score.json"
5
+ # CONSTANTS-TEXT
6
+ LEADERBORAD_INTRODUCTION = """# Open Agent Leaderboard
7
+ ### Welcome to the Open Agent Leaderboard! We share the evaluation results of open agents: COT, SC-COT, POT, ReAct, etc. The agents are implemented with the open-source framework [*OmAgent*](https://github.com/om-ai-lab/OmAgent).
8
+
9
+ This leaderboard was last updated: {}.
10
+
11
+ To add your own agent to the leaderboard, please create a PR in [*OmAgent*](https://github.com/om-ai-lab/OmAgent); we will then help with the evaluation and update the leaderboard. For any questions or concerns, please feel free to contact us.
12
+ """
13
+
14
+ DEFAULT_MATH_BENCH = [
15
+ 'gsm8k', 'AQuA'
16
+ ]
17
+ # The README file for each benchmark
18
+ LEADERBOARD_MD = {}
19
+
20
+ LEADERBOARD_MD['MATH_MAIN'] = f"""
21
+ ## Math task main Evaluation Results
22
+
23
+ - Metrics:
24
+ - Avg Score: The average score across all math benchmarks (normalized to 0 - 100, the higher the better).
25
+ - Rank: The rank based on Avg Score (the lower the better).
26
+ - Score: The evaluation score on each math benchmark (the higher the better).
27
+ - Cost: The cost on each math benchmark (the lower the better).
28
+
29
+ - By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}, sorted in descending order of Avg Score.
30
+ """
31
+
32
+ LEADERBOARD_MD['MATH_DETAIL'] = f"""
33
+ ## Math task detail Evaluation Results
34
+
35
+ - By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}
36
+ - default parameters: temperature=0.0
37
+ - LLM prices:
38
+ - gpt-3.5-turbo:
39
+ - 0.0005$/1K tokens (input)
40
+ - 0.0015$/1K tokens (output)
41
+ - Doubao-lite-32k (1 USD = 7.3249 CNY):
42
+ - 0.00004096$/1K tokens (input)
43
+ - 0.0001$/1K tokens (output)
44
+ - ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
45
+ """
46
+
47
+ META_FIELDS = [
48
+ 'Algorithm', 'LLM', 'Eval Date'
49
+ ]
50
+
51
+ DATASETS = [
52
+ 'gsm8k', 'AQuA'
53
+ ]
54
+
55
+ LLM = [
56
+ 'Doubao-lite-32k', 'gpt-3.5-turbo'
57
+ ]
58
+
59
+ ALGORITHMS = [
60
+ 'IO', 'COT', 'SC-COT', 'POT', 'ReAct-Pro' # must match the 'Algorithm' values in detail_math_score.json
61
+ ]
62
+
63
+ CITATION_BUTTON_TEXT = r"""@article{zhang2024omagent,
64
+ title={OmAgent: A Multi-modal Agent Framework for Complex Video Understanding with Task Divide-and-Conquer},
65
+ author={Zhang, Lu and Zhao, Tiancheng and Ying, Heting and Ma, Yibo and Lee, Kyusong},
66
+ journal={arXiv preprint arXiv:2406.16620},
67
+ year={2024}
68
+ }"""
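The prices listed in `LEADERBOARD_MD['MATH_DETAIL']` are quoted per 1K tokens; a quick sanity check against the IO / gpt-3.5-turbo / gsm8k entry in detail_math_score.json (546,990 input and 39,563 output tokens, reported Cost($) = 0.3328), illustrative only:

```python
# gpt-3.5-turbo prices, per 1K tokens (see the price list above)
INPUT_PRICE_PER_1K = 0.0005
OUTPUT_PRICE_PER_1K = 0.0015

# Token counts for IO / gpt-3.5-turbo / gsm8k from detail_math_score.json
input_tokens, output_tokens = 546_990, 39_563

cost = (input_tokens / 1000 * INPUT_PRICE_PER_1K
        + output_tokens / 1000 * OUTPUT_PRICE_PER_1K)
print(round(cost, 4))  # 0.3328 -- matches the reported Cost($)
```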
overall_math_score.json ADDED
@@ -0,0 +1,155 @@
1
+ {
2
+ "time": "2025-01-09 17:13:45",
3
+ "results": {
4
+ "IO": {
5
+ "META": {
6
+ "Algorithm": "IO",
7
+ "LLM": "gpt-3.5-turbo",
8
+ "Eval Date": "2025/01/07"
9
+ },
10
+ "gsm8k": {
11
+ "Score": 37.83,
12
+ "Cost($)": 0.3328
13
+ },
14
+ "AQuA": {
15
+ "Score": 38.98,
16
+ "Cost($)": 0.0380
17
+ }
18
+ },
19
+ "COT": {
20
+ "META": {
21
+ "Algorithm": "COT",
22
+ "LLM": "gpt-3.5-turbo",
23
+ "Eval Date": "2025/01/07"
24
+ },
25
+ "gsm8k": {
26
+ "Score": 78.70,
27
+ "Cost($)": 0.6788
28
+ },
29
+ "AQuA": {
30
+ "Score": 61.02,
31
+ "Cost($)": 0.0957
32
+ }
33
+ },
34
+ "SC-COT": {
35
+ "META": {
36
+ "Algorithm": "SC-COT",
37
+ "LLM": "gpt-3.5-turbo",
38
+ "Eval Date": "2025/01/07"
39
+ },
40
+ "gsm8k": {
41
+ "Score": 80.06,
42
+ "Cost($)": 5.0227
43
+ },
44
+ "AQuA": {
45
+ "Score": 67.32,
46
+ "Cost($)": 0.6491
47
+ }
48
+ },
49
+ "POT": {
50
+ "META": {
51
+ "Algorithm": "POT",
52
+ "LLM": "gpt-3.5-turbo",
53
+ "Eval Date": "2025/01/07"
54
+ },
55
+ "gsm8k": {
56
+ "Score": 76.88,
57
+ "Cost($)": 0.6902
58
+ },
59
+ "AQuA": {
60
+ "Score": 51.97,
61
+ "Cost($)": 0.1557
62
+ }
63
+ },
64
+ "ReAct-Pro*": {
65
+ "META": {
66
+ "Algorithm": "ReAct-Pro*",
67
+ "LLM": "gpt-3.5-turbo",
68
+ "Eval Date": "2025/01/07"
69
+ },
70
+ "gsm8k": {
71
+ "Score": 74.91,
72
+ "Cost($)": 3.4633
73
+ },
74
+ "AQuA": {
75
+ "Score": 64.57,
76
+ "Cost($)": 0.4928
77
+ }
78
+ },
79
+ "IO-Doubao": {
80
+ "META": {
81
+ "Algorithm": "IO",
82
+ "LLM": "Doubao-lite-32k",
83
+ "Eval Date": "2025/01/07"
84
+ },
85
+ "gsm8k": {
86
+ "Score": 72.02,
87
+ "Cost($)": 0.0354
88
+ },
89
+ "AQuA": {
90
+ "Score": 79.13,
91
+ "Cost($)": 0.0058
92
+ }
93
+ },
94
+ "COT-Doubao": {
95
+ "META": {
96
+ "Algorithm": "COT",
97
+ "LLM": "Doubao-lite-32k",
98
+ "Eval Date": "2025/01/07"
99
+ },
100
+ "gsm8k": {
101
+ "Score": 89.31,
102
+ "Cost($)": 0.0557
103
+ },
104
+ "AQuA": {
105
+ "Score": 82.68,
106
+ "Cost($)": 0.0066
107
+ }
108
+ },
109
+ "SC-COT-Doubao": {
110
+ "META": {
111
+ "Algorithm": "SC-COT",
112
+ "LLM": "Doubao-lite-32k",
113
+ "Eval Date": "2025/01/07"
114
+ },
115
+ "gsm8k": {
116
+ "Score": 88.63,
117
+ "Cost($)": 0.1533
118
+ },
119
+ "AQuA": {
120
+ "Score": 83.46,
121
+ "Cost($)": 0.0409
122
+ }
123
+ },
124
+ "POT-Doubao": {
125
+ "META": {
126
+ "Algorithm": "POT",
127
+ "LLM": "Doubao-lite-32k",
128
+ "Eval Date": "2025/01/07"
129
+ },
130
+ "gsm8k": {
131
+ "Score": 79.15,
132
+ "Cost($)": 0.0575
133
+ },
134
+ "AQuA": {
135
+ "Score": 52.36,
136
+ "Cost($)": 0.0142
137
+ }
138
+ },
139
+ "ReAct-Pro-Doubao": {
140
+ "META": {
141
+ "Algorithm": "ReAct-Pro",
142
+ "LLM": "Doubao-lite-32k",
143
+ "Eval Date": "2025/01/07"
144
+ },
145
+ "gsm8k": {
146
+ "Score": 85.60,
147
+ "Cost($)": 0.2513
148
+ },
149
+ "AQuA": {
150
+ "Score": 77.56,
151
+ "Cost($)": 0.0446
152
+ }
153
+ }
154
+ }
155
+ }