open-agent-leaderboard / detail_math_score.json
qq-hzlh's picture
Upload 5 files
fc71d05 verified
raw
history blame
12.8 kB
{
"time": "2025-01-09 17:13:45",
"results": {
"IO": {
"gpt-3.5-turbo": {
"META": {
"Algorithm": "IO",
"LLM": "gpt-3.5-turbo",
"Eval Date": "2025/01/07"
},
"gsm8k": {
"Score": 37.83,
"Pass rate": 99.92,
"X-shot": 8,
"Parameters": "",
"Samples": 1319,
"Total input tokens": 546990,
"Average input tokens": 415,
"Total output tokens": 39563,
"Average output tokens": 30,
"All tokens": 586553,
"Cost($)": 0.3328
},
"AQuA": {
"Score": 38.98,
"Pass rate": 100.00,
"X-shot": 0,
"Parameters": "",
"Samples": 254,
"Total input tokens": 25701,
"Average input tokens": 101,
"Total output tokens": 16770,
"Average output tokens": 66,
"All tokens": 42471,
"Cost($)": 0.0380
}
},
"Doubao-lite-32k": {
"META": {
"Algorithm": "IO",
"LLM": "Doubao-lite-32k",
"Eval Date": "2025/01/07"
},
"gsm8k": {
"Score": 72.02,
"Pass rate": 99.92,
"X-shot": 8,
"Parameters": "",
"Samples": 1319,
"Total input tokens": 617377,
"Average input tokens": 468,
"Total output tokens": 123106,
"Average output tokens": 93,
"All tokens": 740483,
"Cost($)": 0.0354
},
"AQuA": {
"Score": 79.13,
"Pass rate": 100.00,
"X-shot": 0,
"Parameters": "",
"Samples": 254,
"Total input tokens": 33058,
"Average input tokens": 130,
"Total output tokens": 54684,
"Average output tokens": 215,
"All tokens": 87742,
"Cost($)": 0.0058
}
}
},
"COT": {
"gpt-3.5-turbo": {
"META": {
"Algorithm": "COT",
"LLM": "gpt-3.5-turbo",
"Eval Date": "2025/01/07"
},
"gsm8k": {
"Score": 78.70,
"Pass rate": 100.00,
"X-shot": 8,
"Parameters": "",
"Samples": 1319,
"Total input tokens": 953242,
"Average input tokens": 723,
"Total output tokens": 134799,
"Average output tokens": 102,
"All tokens": 1088041,
"Cost($)": 0.6788
},
"AQuA": {
"Score": 61.02,
"Pass rate": 93.70,
"X-shot": 0,
"Parameters": "",
"Samples": 254,
"Total input tokens": 25447,
"Average input tokens": 100,
"Total output tokens": 55346,
"Average output tokens": 218,
"All tokens": 80793,
"Cost($)": 0.0957
}
},
"Doubao-lite-32k": {
"META": {
"Algorithm": "COT",
"LLM": "Doubao-lite-32k",
"Eval Date": "2025/01/07"
},
"gsm8k": {
"Score": 89.31,
"Pass rate": 100.00,
"X-shot": 8,
"Parameters": "",
"Samples": 1319,
"Total input tokens": 1042095,
"Average input tokens": 790,
"Total output tokens": 159725,
"Average output tokens": 121,
"All tokens": 1201820,
"Cost($)": 0.0557
},
"AQuA": {
"Score": 82.68,
"Pass rate": 97.24,
"X-shot": 0,
"Parameters": "",
"Samples": 254,
"Total input tokens": 27978,
"Average input tokens": 110,
"Total output tokens": 66599,
"Average output tokens": 262,
"All tokens": 94577,
"Cost($)": 0.0066
}
}
},
"SC-COT": {
"gpt-3.5-turbo": {
"META": {
"Algorithm": "SC-COT",
"LLM": "gpt-3.5-turbo",
"Eval Date": "2025/01/07"
},
"gsm8k": {
"Score": 80.06,
"Pass rate": 99.62,
"X-shot": 8,
"Parameters": "temperature=1, num_path=5",
"Samples": 1319,
"Total input tokens": 5260319,
"Average input tokens": 3988,
"Total output tokens": 1595016,
"Average output tokens": 1209,
"All tokens": 6855335,
"Cost($)": 5.0227
},
"AQuA": {
"Score": 67.32,
"Pass rate": 100.00,
"X-shot": 0,
"Parameters": "temperature=1, path_num=5",
"Samples": 254,
"Total input tokens": 219241,
"Average input tokens": 863,
"Total output tokens": 359629,
"Average output tokens": 1416,
"All tokens": 578870,
"Cost($)": 0.6491
}
},
"Doubao-lite-32k": {
"META": {
"Algorithm": "SC-COT",
"LLM": "Doubao-lite-32k",
"Eval Date": "2025/01/07"
},
"gsm8k": {
"Score": 88.63,
"Pass rate": 99.77,
"X-shot": 8,
"Parameters": "temperature=1, num_path=5",
"Samples": 1319,
"Total input tokens": 1150443,
"Average input tokens": 872,
"Total output tokens": 1295750,
"Average output tokens": 982,
"All tokens": 2446193,
"Cost($)": 0.1533
},
"AQuA": {
"Score": 83.46,
"Pass rate": 97.24,
"X-shot": 0,
"Parameters": "temperature=1, num_path=5",
"Samples": 254,
"Total input tokens": 259804,
"Average input tokens": 1023,
"Total output tokens": 369741,
"Average output tokens": 1456,
"All tokens": 629545,
"Cost($)": 0.0409
}
}
},
"POT": {
"gpt-3.5-turbo": {
"META": {
"Algorithm": "POT",
"LLM": "gpt-3.5-turbo",
"Eval Date": "2025/01/07"
},
"gsm8k": {
"Score": 76.88,
"Pass rate": 99.24,
"X-shot": 8,
"Parameters": "",
"Samples": 1319,
"Total input tokens": 1090418,
"Average input tokens": 827,
"Total output tokens": 96662,
"Average output tokens": 73,
"All tokens": 1187080,
"Cost($)": 0.6902
},
"AQuA": {
"Score": 51.97,
"Pass rate": 92.91,
"X-shot": 0,
"Parameters": "",
"Samples": 254,
"Total input tokens": 223438,
"Average input tokens": 880,
"Total output tokens": 29323,
"Average output tokens": 115,
"All tokens": 252761,
"Cost($)": 0.1557
}
},
"Doubao-lite-32k": {
"META": {
"Algorithm": "POT",
"LLM": "Doubao-lite-32k",
"Eval Date": "2025/01/07"
},
"gsm8k": {
"Score": 79.15,
"Pass rate": 92.65,
"X-shot": 8,
"Parameters": "",
"Samples": 1319,
"Total input tokens": 1170038,
"Average input tokens": 887,
"Total output tokens": 116987,
"Average output tokens": 89,
"All tokens": 1287025,
"Cost($)": 0.0575
},
"AQuA": {
"Score": 52.36,
"Pass rate": 82.28,
"X-shot": 0,
"Parameters": "",
"Samples": 254,
"Total input tokens": 256721,
"Average input tokens": 1011,
"Total output tokens": 44729,
"Average output tokens": 176,
"All tokens": 301450,
"Cost($)": 0.0142
}
}
},
"ReAct-Pro": {
"gpt-3.5-turbo": {
"META": {
"Algorithm": "ReAct-Pro",
"LLM": "gpt-3.5-turbo",
"Eval Date": "2025/01/07"
},
"gsm8k": {
"Score": 74.91,
"Pass rate": 99.39,
"X-shot": 8,
"Parameters": "max_steps=10",
"Samples": 1319,
"Total input tokens": 6506164,
"Average input tokens": 4933,
"Total output tokens": 140122,
"Average output tokens": 106,
"All tokens": 6646286,
"Cost($)": 3.4633
},
"AQuA": {
"Score": 64.57,
"Pass rate": 98.03,
"X-shot": 0,
"Parameters": "max_steps=10",
"Samples": 254,
"Total input tokens": 862614,
"Average input tokens": 3396,
"Total output tokens": 40973,
"Average output tokens": 161,
"All tokens": 903587,
"Cost($)": 0.4928
}
},
"Doubao-lite-32k": {
"META": {
"Algorithm": "ReAct-Pro",
"LLM": "Doubao-lite-32k",
"Eval Date": "2025/01/07"
},
"gsm8k": {
"Score": 85.60,
"Pass rate": 99.62,
"X-shot": 8,
"Parameters": "max_steps=10",
"Samples": 1319,
"Total input tokens": 5862016,
"Average input tokens": 4444,
"Total output tokens": 136623,
"Average output tokens": 104,
"All tokens": 5998639,
"Cost($)": 0.2513
},
"AQuA": {
"Score": 77.56,
"Pass rate": 96.06,
"X-shot": 0,
"Parameters": "max_steps=10",
"Samples": 254,
"Total input tokens": 977890,
"Average input tokens": 3850,
"Total output tokens": 54951,
"Average output tokens": 216,
"All tokens": 1032841,
"Cost($)": 0.0446
}
}
}
}
}