qq-hzlh committed on
Commit
efbd6cf
·
1 Parent(s): bd69a52

Improve PoT implementation and score

Browse files
gen_table.py CHANGED
@@ -7,6 +7,7 @@ import gradio as gr
7
  import numpy as np
8
  import pandas as pd
9
 
 
10
  from meta_data import OVERALL_MATH_SCORE_FILE, DEFAULT_MATH_BENCH, META_FIELDS
11
 
12
 
@@ -150,7 +151,14 @@ def generate_table(results, fields):
150
  res[f"{d}-Cost($)"].append(None)
151
 
152
  # Calculate average score
153
- res['Avg Score'].append(round(np.mean(scores), 2) if scores else None)
 
 
 
 
 
 
 
154
 
155
  df = pd.DataFrame(res)
156
 
 
7
  import numpy as np
8
  import pandas as pd
9
 
10
+ from decimal import Decimal, ROUND_HALF_UP
11
  from meta_data import OVERALL_MATH_SCORE_FILE, DEFAULT_MATH_BENCH, META_FIELDS
12
 
13
 
 
151
  res[f"{d}-Cost($)"].append(None)
152
 
153
  # Calculate average score
154
+ if scores:
155
+ decimal_numbers = [Decimal(str(num)) for num in scores]
156
+ avg_score = Decimal(str(np.mean(scores) if scores else None))
157
+ avg_score = sum(decimal_numbers) / len(decimal_numbers)
158
+ else:
159
+ avg_score = None
160
+ formatted_average = avg_score.quantize(Decimal('0.01'), rounding=ROUND_HALF_UP)
161
+ res['Avg Score'].append(formatted_average)
162
 
163
  df = pd.DataFrame(res)
164
 
src/detail_math_score.json CHANGED
@@ -226,17 +226,17 @@
226
  "Cost($)": 0.6902
227
  },
228
  "AQuA": {
229
- "Score": 51.97,
230
- "Pass rate": 92.91,
231
  "X-shot": 0,
232
  "Parameters": "",
233
  "Samples": 254,
234
- "Total input tokens": 223438,
235
- "Average input tokens": 880,
236
- "Total output tokens": 29323,
237
- "Average output tokens": 115,
238
- "All tokens": 252761,
239
- "Cost($)": 0.1557
240
  }
241
  },
242
  "Doubao-lite-32k": {
@@ -246,30 +246,30 @@
246
  "Eval Date": "2025/01/07"
247
  },
248
  "gsm8k": {
249
- "Score": 79.15,
250
- "Pass rate": 92.65,
251
  "X-shot": 8,
252
  "Parameters": "",
253
  "Samples": 1319,
254
  "Total input tokens": 1170038,
255
  "Average input tokens": 887,
256
- "Total output tokens": 116987,
257
  "Average output tokens": 89,
258
- "All tokens": 1287025,
259
  "Cost($)": 0.0575
260
  },
261
  "AQuA": {
262
- "Score": 52.36,
263
- "Pass rate": 82.28,
264
  "X-shot": 0,
265
  "Parameters": "",
266
  "Samples": 254,
267
- "Total input tokens": 256721,
268
- "Average input tokens": 1011,
269
- "Total output tokens": 44729,
270
- "Average output tokens": 176,
271
- "All tokens": 301450,
272
- "Cost($)": 0.0142
273
  }
274
  }
275
  },
 
226
  "Cost($)": 0.6902
227
  },
228
  "AQuA": {
229
+ "Score": 59.45,
230
+ "Pass rate": 100,
231
  "X-shot": 0,
232
  "Parameters": "",
233
  "Samples": 254,
234
+ "Total input tokens": 225162,
235
+ "Average input tokens": 886,
236
+ "Total output tokens": 41492,
237
+ "Average output tokens": 163,
238
+ "All tokens": 266654,
239
+ "Cost($)": 0.1748
240
  }
241
  },
242
  "Doubao-lite-32k": {
 
246
  "Eval Date": "2025/01/07"
247
  },
248
  "gsm8k": {
249
+ "Score": 79.61,
250
+ "Pass rate": 92.57,
251
  "X-shot": 8,
252
  "Parameters": "",
253
  "Samples": 1319,
254
  "Total input tokens": 1170038,
255
  "Average input tokens": 887,
256
+ "Total output tokens": 118017,
257
  "Average output tokens": 89,
258
+ "All tokens": 1288055,
259
  "Cost($)": 0.0575
260
  },
261
  "AQuA": {
262
+ "Score": 71.65,
263
+ "Pass rate": 96.85,
264
  "X-shot": 0,
265
  "Parameters": "",
266
  "Samples": 254,
267
+ "Total input tokens": 259863,
268
+ "Average input tokens": 1023,
269
+ "Total output tokens": 49573,
270
+ "Average output tokens": 195,
271
+ "All tokens": 309436,
272
+ "Cost($)": 0.0147
273
  }
274
  }
275
  },
src/overall_math_score.json CHANGED
@@ -57,8 +57,8 @@
57
  "Cost($)": 0.6902
58
  },
59
  "AQuA": {
60
- "Score": 51.97,
61
- "Cost($)": 0.1557
62
  }
63
  },
64
  "ReAct-Pro*": {
@@ -128,12 +128,12 @@
128
  "Eval Date": "2025/01/07"
129
  },
130
  "gsm8k": {
131
- "Score": 79.15,
132
- "Cost($)": 0.0575
133
  },
134
  "AQuA": {
135
- "Score": 52.36,
136
- "Cost($)": 0.0142
137
  }
138
  },
139
  "ReAct-Pro-Doubao": {
 
57
  "Cost($)": 0.6902
58
  },
59
  "AQuA": {
60
+ "Score": 59.45,
61
+ "Cost($)": 0.1748
62
  }
63
  },
64
  "ReAct-Pro*": {
 
128
  "Eval Date": "2025/01/07"
129
  },
130
  "gsm8k": {
131
+ "Score": 79.61,
132
+ "Cost($)": 0.0576
133
  },
134
  "AQuA": {
135
+ "Score": 71.65,
136
+ "Cost($)": 0.0147
137
  }
138
  },
139
  "ReAct-Pro-Doubao": {