Spaces:
Running
Running
improve pot implementation and score
Browse files
- gen_table.py +9 -1
- src/detail_math_score.json +20 -20
- src/overall_math_score.json +6 -6
gen_table.py
CHANGED
@@ -7,6 +7,7 @@ import gradio as gr
|
|
7 |
import numpy as np
|
8 |
import pandas as pd
|
9 |
|
|
|
10 |
from meta_data import OVERALL_MATH_SCORE_FILE, DEFAULT_MATH_BENCH, META_FIELDS
|
11 |
|
12 |
|
@@ -150,7 +151,14 @@ def generate_table(results, fields):
|
|
150 |
res[f"{d}-Cost($)"].append(None)
|
151 |
|
152 |
# Calculate average score
|
153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
|
155 |
df = pd.DataFrame(res)
|
156 |
|
|
|
7 |
import numpy as np
|
8 |
import pandas as pd
|
9 |
|
10 |
+
from decimal import Decimal, ROUND_HALF_UP
|
11 |
from meta_data import OVERALL_MATH_SCORE_FILE, DEFAULT_MATH_BENCH, META_FIELDS
|
12 |
|
13 |
|
|
|
151 |
res[f"{d}-Cost($)"].append(None)
|
152 |
|
153 |
# Calculate average score
|
154 |
+
if scores:
|
155 |
+
decimal_numbers = [Decimal(str(num)) for num in scores]
|
156 |
+
avg_score = Decimal(str(np.mean(scores) if scores else None))
|
157 |
+
avg_score = sum(decimal_numbers) / len(decimal_numbers)
|
158 |
+
else:
|
159 |
+
avg_score = None
|
160 |
+
formatted_average = avg_score.quantize(Decimal('0.01'), rounding=ROUND_HALF_UP)
|
161 |
+
res['Avg Score'].append(formatted_average)
|
162 |
|
163 |
df = pd.DataFrame(res)
|
164 |
|
src/detail_math_score.json
CHANGED
@@ -226,17 +226,17 @@
|
|
226 |
"Cost($)": 0.6902
|
227 |
},
|
228 |
"AQuA": {
|
229 |
-
"Score":
|
230 |
-
"Pass rate":
|
231 |
"X-shot": 0,
|
232 |
"Parameters": "",
|
233 |
"Samples": 254,
|
234 |
-
"Total input tokens":
|
235 |
-
"Average input tokens":
|
236 |
-
"Total output tokens":
|
237 |
-
"Average output tokens":
|
238 |
-
"All tokens":
|
239 |
-
"Cost($)": 0.
|
240 |
}
|
241 |
},
|
242 |
"Doubao-lite-32k": {
|
@@ -246,30 +246,30 @@
|
|
246 |
"Eval Date": "2025/01/07"
|
247 |
},
|
248 |
"gsm8k": {
|
249 |
-
"Score": 79.
|
250 |
-
"Pass rate": 92.
|
251 |
"X-shot": 8,
|
252 |
"Parameters": "",
|
253 |
"Samples": 1319,
|
254 |
"Total input tokens": 1170038,
|
255 |
"Average input tokens": 887,
|
256 |
-
"Total output tokens":
|
257 |
"Average output tokens": 89,
|
258 |
-
"All tokens":
|
259 |
"Cost($)": 0.0575
|
260 |
},
|
261 |
"AQuA": {
|
262 |
-
"Score":
|
263 |
-
"Pass rate":
|
264 |
"X-shot": 0,
|
265 |
"Parameters": "",
|
266 |
"Samples": 254,
|
267 |
-
"Total input tokens":
|
268 |
-
"Average input tokens":
|
269 |
-
"Total output tokens":
|
270 |
-
"Average output tokens":
|
271 |
-
"All tokens":
|
272 |
-
"Cost($)": 0.
|
273 |
}
|
274 |
}
|
275 |
},
|
|
|
226 |
"Cost($)": 0.6902
|
227 |
},
|
228 |
"AQuA": {
|
229 |
+
"Score": 59.45,
|
230 |
+
"Pass rate": 100,
|
231 |
"X-shot": 0,
|
232 |
"Parameters": "",
|
233 |
"Samples": 254,
|
234 |
+
"Total input tokens": 225162,
|
235 |
+
"Average input tokens": 886,
|
236 |
+
"Total output tokens": 41492,
|
237 |
+
"Average output tokens": 163,
|
238 |
+
"All tokens": 266654,
|
239 |
+
"Cost($)": 0.1748
|
240 |
}
|
241 |
},
|
242 |
"Doubao-lite-32k": {
|
|
|
246 |
"Eval Date": "2025/01/07"
|
247 |
},
|
248 |
"gsm8k": {
|
249 |
+
"Score": 79.61,
|
250 |
+
"Pass rate": 92.57,
|
251 |
"X-shot": 8,
|
252 |
"Parameters": "",
|
253 |
"Samples": 1319,
|
254 |
"Total input tokens": 1170038,
|
255 |
"Average input tokens": 887,
|
256 |
+
"Total output tokens": 118017,
|
257 |
"Average output tokens": 89,
|
258 |
+
"All tokens": 1288055,
|
259 |
"Cost($)": 0.0575
|
260 |
},
|
261 |
"AQuA": {
|
262 |
+
"Score": 71.65,
|
263 |
+
"Pass rate": 96.85,
|
264 |
"X-shot": 0,
|
265 |
"Parameters": "",
|
266 |
"Samples": 254,
|
267 |
+
"Total input tokens": 259863,
|
268 |
+
"Average input tokens": 1023,
|
269 |
+
"Total output tokens": 49573,
|
270 |
+
"Average output tokens": 195,
|
271 |
+
"All tokens": 309436,
|
272 |
+
"Cost($)": 0.0147
|
273 |
}
|
274 |
}
|
275 |
},
|
src/overall_math_score.json
CHANGED
@@ -57,8 +57,8 @@
|
|
57 |
"Cost($)": 0.6902
|
58 |
},
|
59 |
"AQuA": {
|
60 |
-
"Score":
|
61 |
-
"Cost($)": 0.
|
62 |
}
|
63 |
},
|
64 |
"ReAct-Pro*": {
|
@@ -128,12 +128,12 @@
|
|
128 |
"Eval Date": "2025/01/07"
|
129 |
},
|
130 |
"gsm8k": {
|
131 |
-
"Score": 79.
|
132 |
-
"Cost($)": 0.
|
133 |
},
|
134 |
"AQuA": {
|
135 |
-
"Score":
|
136 |
-
"Cost($)": 0.
|
137 |
}
|
138 |
},
|
139 |
"ReAct-Pro-Doubao": {
|
|
|
57 |
"Cost($)": 0.6902
|
58 |
},
|
59 |
"AQuA": {
|
60 |
+
"Score": 59.45,
|
61 |
+
"Cost($)": 0.1748
|
62 |
}
|
63 |
},
|
64 |
"ReAct-Pro*": {
|
|
|
128 |
"Eval Date": "2025/01/07"
|
129 |
},
|
130 |
"gsm8k": {
|
131 |
+
"Score": 79.61,
|
132 |
+
"Cost($)": 0.0576
|
133 |
},
|
134 |
"AQuA": {
|
135 |
+
"Score": 71.65,
|
136 |
+
"Cost($)": 0.0147
|
137 |
}
|
138 |
},
|
139 |
"ReAct-Pro-Doubao": {
|