{ "time": "2025-01-09 17:13:45", "results": { "IO": { "gpt-3.5-turbo": { "META": { "Algorithm": "IO", "LLM": "gpt-3.5-turbo", "Eval Date": "2025/01/07" }, "gsm8k": { "Score": 37.83, "Pass rate": 99.92, "X-shot": 8, "Parameters": "", "Samples": 1319, "Total input tokens": 546990, "Average input tokens": 415, "Total output tokens": 39563, "Average output tokens": 30, "All tokens": 586553, "Cost($)": 0.3328 }, "AQuA": { "Score": 38.98, "Pass rate": 100.00, "X-shot": 0, "Parameters": "", "Samples": 254, "Total input tokens": 25701, "Average input tokens": 101, "Total output tokens": 16770, "Average output tokens": 66, "All tokens": 42471, "Cost($)": 0.0380 } }, "Doubao-lite-32k": { "META": { "Algorithm": "IO", "LLM": "Doubao-lite-32k", "Eval Date": "2025/01/07" }, "gsm8k": { "Score": 72.02, "Pass rate": 99.92, "X-shot": 8, "Parameters": "", "Samples": 1319, "Total input tokens": 617377, "Average input tokens": 468, "Total output tokens": 123106, "Average output tokens": 93, "All tokens": 740483, "Cost($)": 0.0354 }, "AQuA": { "Score": 79.13, "Pass rate": 100.00, "X-shot": 0, "Parameters": "", "Samples": 254, "Total input tokens": 33058, "Average input tokens": 130, "Total output tokens": 54684, "Average output tokens": 215, "All tokens": 87742, "Cost($)": 0.0058 } } }, "COT": { "gpt-3.5-turbo": { "META": { "Algorithm": "COT", "LLM": "gpt-3.5-turbo", "Eval Date": "2025/01/07" }, "gsm8k": { "Score": 78.70, "Pass rate": 100.00, "X-shot": 8, "Parameters": "", "Samples": 1319, "Total input tokens": 953242, "Average input tokens": 723, "Total output tokens": 134799, "Average output tokens": 102, "All tokens": 1088041, "Cost($)": 0.6788 }, "AQuA": { "Score": 61.02, "Pass rate": 93.70, "X-shot": 0, "Parameters": "", "Samples": 254, "Total input tokens": 25447, "Average input tokens": 100, "Total output tokens": 55346, "Average output tokens": 218, "All tokens": 80793, "Cost($)": 0.0957 } }, "Doubao-lite-32k": { "META": { "Algorithm": "COT", "LLM": "Doubao-lite-32k", 
"Eval Date": "2025/01/07" }, "gsm8k": { "Score": 89.31, "Pass rate": 100.00, "X-shot": 8, "Parameters": "", "Samples": 1319, "Total input tokens": 1042095, "Average input tokens": 790, "Total output tokens": 159725, "Average output tokens": 121, "All tokens": 1201820, "Cost($)": 0.0557 }, "AQuA": { "Score": 82.68, "Pass rate": 97.24, "X-shot": 0, "Parameters": "", "Samples": 254, "Total input tokens": 27978, "Average input tokens": 110, "Total output tokens": 66599, "Average output tokens": 262, "All tokens": 94577, "Cost($)": 0.0066 } } }, "SC-COT": { "gpt-3.5-turbo": { "META": { "Algorithm": "SC-COT", "LLM": "gpt-3.5-turbo", "Eval Date": "2025/01/07" }, "gsm8k": { "Score": 80.06, "Pass rate": 99.62, "X-shot": 8, "Parameters": "temperature=1, num_path=5", "Samples": 1319, "Total input tokens": 5260319, "Average input tokens": 3988, "Total output tokens": 1595016, "Average output tokens": 1209, "All tokens": 6855335, "Cost($)": 5.0227 }, "AQuA": { "Score": 67.32, "Pass rate": 100.00, "X-shot": 0, "Parameters": "temperature=1, path_num=5", "Samples": 254, "Total input tokens": 219241, "Average input tokens": 863, "Total output tokens": 359629, "Average output tokens": 1416, "All tokens": 578870, "Cost($)": 0.6491 } }, "Doubao-lite-32k": { "META": { "Algorithm": "SC-COT", "LLM": "Doubao-lite-32k", "Eval Date": "2025/01/07" }, "gsm8k": { "Score": 88.63, "Pass rate": 99.77, "X-shot": 8, "Parameters": "temperature=1, num_path=5", "Samples": 1319, "Total input tokens": 1150443, "Average input tokens": 872, "Total output tokens": 1295750, "Average output tokens": 982, "All tokens": 2446193, "Cost($)": 0.1533 }, "AQuA": { "Score": 83.46, "Pass rate": 97.24, "X-shot": 0, "Parameters": "temperature=1, num_path=5", "Samples": 254, "Total input tokens": 259804, "Average input tokens": 1023, "Total output tokens": 369741, "Average output tokens": 1456, "All tokens": 629545, "Cost($)": 0.0409 } } }, "POT": { "gpt-3.5-turbo": { "META": { "Algorithm": "POT", "LLM": "gpt-3.5-turbo", 
"Eval Date": "2025/01/07" }, "gsm8k": { "Score": 76.88, "Pass rate": 99.24, "X-shot": 8, "Parameters": "", "Samples": 1319, "Total input tokens": 1090418, "Average input tokens": 827, "Total output tokens": 96662, "Average output tokens": 73, "All tokens": 1187080, "Cost($)": 0.6902 }, "AQuA": { "Score": 51.97, "Pass rate": 92.91, "X-shot": 0, "Parameters": "", "Samples": 254, "Total input tokens": 223438, "Average input tokens": 880, "Total output tokens": 29323, "Average output tokens": 115, "All tokens": 252761, "Cost($)": 0.1557 } }, "Doubao-lite-32k": { "META": { "Algorithm": "POT", "LLM": "Doubao-lite-32k", "Eval Date": "2025/01/07" }, "gsm8k": { "Score": 79.15, "Pass rate": 92.65, "X-shot": 8, "Parameters": "", "Samples": 1319, "Total input tokens": 1170038, "Average input tokens": 887, "Total output tokens": 116987, "Average output tokens": 89, "All tokens": 1287025, "Cost($)": 0.0575 }, "AQuA": { "Score": 52.36, "Pass rate": 82.28, "X-shot": 0, "Parameters": "", "Samples": 254, "Total input tokens": 256721, "Average input tokens": 1011, "Total output tokens": 44729, "Average output tokens": 176, "All tokens": 301450, "Cost($)": 0.0142 } } }, "ReAct-Pro": { "gpt-3.5-turbo": { "META": { "Algorithm": "ReAct-Pro", "LLM": "gpt-3.5-turbo", "Eval Date": "2025/01/07" }, "gsm8k": { "Score": 74.91, "Pass rate": 99.39, "X-shot": 8, "Parameters": "max_steps=10", "Samples": 1319, "Total input tokens": 6506164, "Average input tokens": 4933, "Total output tokens": 140122, "Average output tokens": 106, "All tokens": 6646286, "Cost($)": 3.4633 }, "AQuA": { "Score": 64.57, "Pass rate": 98.03, "X-shot": 0, "Parameters": "max_steps=10", "Samples": 254, "Total input tokens": 862614, "Average input tokens": 3396, "Total output tokens": 40973, "Average output tokens": 161, "All tokens": 903587, "Cost($)": 0.4928 } }, "Doubao-lite-32k": { "META": { "Algorithm": "ReAct-Pro", "LLM": "Doubao-lite-32k", "Eval Date": "2025/01/07" }, "gsm8k": { "Score": 85.60, "Pass rate": 99.62, 
"X-shot": 8, "Parameters": "max_steps=10", "Samples": 1319, "Total input tokens": 5862016, "Average input tokens": 4444, "Total output tokens": 136623, "Average output tokens": 104, "All tokens": 5998639, "Cost($)": 0.2513 }, "AQuA": { "Score": 77.56, "Pass rate": 96.06, "X-shot": 0, "Parameters": "max_steps=10", "Samples": 254, "Total input tokens": 977890, "Average input tokens": 3850, "Total output tokens": 54951, "Average output tokens": 216, "All tokens": 1032841, "Cost($)": 0.0446 } } } } }