Spaces:
Running
Running
{ | |
"time": "2025-01-09 17:13:45", | |
"results": { | |
"IO": { | |
"gpt-3.5-turbo": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "gpt-3.5-turbo", | |
"Eval Date": "2025/01/07" | |
}, | |
"gsm8k": { | |
"Score": 37.83, | |
"Pass rate": 99.92, | |
"X-shot": 8, | |
"Parameters": "", | |
"Samples": 1319, | |
"Total input tokens": 546990, | |
"Average input tokens": 415, | |
"Total output tokens": 39563, | |
"Average output tokens": 30, | |
"All tokens": 586553, | |
"Cost($)": 0.3328 | |
}, | |
"AQuA": { | |
"Score": 38.98, | |
"Pass rate": 100.00, | |
"X-shot": 0, | |
"Parameters": "", | |
"Samples": 254, | |
"Total input tokens": 25701, | |
"Average input tokens": 101, | |
"Total output tokens": 16770, | |
"Average output tokens": 66, | |
"All tokens": 42471, | |
"Cost($)": 0.0380 | |
} | |
}, | |
"Doubao-lite-32k": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Doubao-lite-32k", | |
"Eval Date": "2025/01/07" | |
}, | |
"gsm8k": { | |
"Score": 72.02, | |
"Pass rate": 99.92, | |
"X-shot": 8, | |
"Parameters": "", | |
"Samples": 1319, | |
"Total input tokens": 617377, | |
"Average input tokens": 468, | |
"Total output tokens": 123106, | |
"Average output tokens": 93, | |
"All tokens": 740483, | |
"Cost($)": 0.0354 | |
}, | |
"AQuA": { | |
"Score": 79.13, | |
"Pass rate": 100.00, | |
"X-shot": 0, | |
"Parameters": "", | |
"Samples": 254, | |
"Total input tokens": 33058, | |
"Average input tokens": 130, | |
"Total output tokens": 54684, | |
"Average output tokens": 215, | |
"All tokens": 87742, | |
"Cost($)": 0.0058 | |
} | |
} | |
}, | |
"COT": { | |
"gpt-3.5-turbo": { | |
"META": { | |
"Algorithm": "COT", | |
"LLM": "gpt-3.5-turbo", | |
"Eval Date": "2025/01/07" | |
}, | |
"gsm8k": { | |
"Score": 78.70, | |
"Pass rate": 100.00, | |
"X-shot": 8, | |
"Parameters": "", | |
"Samples": 1319, | |
"Total input tokens": 953242, | |
"Average input tokens": 723, | |
"Total output tokens": 134799, | |
"Average output tokens": 102, | |
"All tokens": 1088041, | |
"Cost($)": 0.6788 | |
}, | |
"AQuA": { | |
"Score": 61.02, | |
"Pass rate": 93.70, | |
"X-shot": 0, | |
"Parameters": "", | |
"Samples": 254, | |
"Total input tokens": 25447, | |
"Average input tokens": 100, | |
"Total output tokens": 55346, | |
"Average output tokens": 218, | |
"All tokens": 80793, | |
"Cost($)": 0.0957 | |
} | |
}, | |
"Doubao-lite-32k": { | |
"META": { | |
"Algorithm": "COT", | |
"LLM": "Doubao-lite-32k", | |
"Eval Date": "2025/01/07" | |
}, | |
"gsm8k": { | |
"Score": 89.31, | |
"Pass rate": 100.00, | |
"X-shot": 8, | |
"Parameters": "", | |
"Samples": 1319, | |
"Total input tokens": 1042095, | |
"Average input tokens": 790, | |
"Total output tokens": 159725, | |
"Average output tokens": 121, | |
"All tokens": 1201820, | |
"Cost($)": 0.0557 | |
}, | |
"AQuA": { | |
"Score": 82.68, | |
"Pass rate": 97.24, | |
"X-shot": 0, | |
"Parameters": "", | |
"Samples": 254, | |
"Total input tokens": 27978, | |
"Average input tokens": 110, | |
"Total output tokens": 66599, | |
"Average output tokens": 262, | |
"All tokens": 94577, | |
"Cost($)": 0.0066 | |
} | |
} | |
}, | |
"SC-COT": { | |
"gpt-3.5-turbo": { | |
"META": { | |
"Algorithm": "SC-COT", | |
"LLM": "gpt-3.5-turbo", | |
"Eval Date": "2025/01/07" | |
}, | |
"gsm8k": { | |
"Score": 80.06, | |
"Pass rate": 99.62, | |
"X-shot": 8, | |
"Parameters": "temperature=1, num_path=5", | |
"Samples": 1319, | |
"Total input tokens": 5260319, | |
"Average input tokens": 3988, | |
"Total output tokens": 1595016, | |
"Average output tokens": 1209, | |
"All tokens": 6855335, | |
"Cost($)": 5.0227 | |
}, | |
"AQuA": { | |
"Score": 67.32, | |
"Pass rate": 100.00, | |
"X-shot": 0, | |
"Parameters": "temperature=1, num_path=5", | |
"Samples": 254, | |
"Total input tokens": 219241, | |
"Average input tokens": 863, | |
"Total output tokens": 359629, | |
"Average output tokens": 1416, | |
"All tokens": 578870, | |
"Cost($)": 0.6491 | |
} | |
}, | |
"Doubao-lite-32k": { | |
"META": { | |
"Algorithm": "SC-COT", | |
"LLM": "Doubao-lite-32k", | |
"Eval Date": "2025/01/07" | |
}, | |
"gsm8k": { | |
"Score": 88.63, | |
"Pass rate": 99.77, | |
"X-shot": 8, | |
"Parameters": "temperature=1, num_path=5", | |
"Samples": 1319, | |
"Total input tokens": 1150443, | |
"Average input tokens": 872, | |
"Total output tokens": 1295750, | |
"Average output tokens": 982, | |
"All tokens": 2446193, | |
"Cost($)": 0.1533 | |
}, | |
"AQuA": { | |
"Score": 83.46, | |
"Pass rate": 97.24, | |
"X-shot": 0, | |
"Parameters": "temperature=1, num_path=5", | |
"Samples": 254, | |
"Total input tokens": 259804, | |
"Average input tokens": 1023, | |
"Total output tokens": 369741, | |
"Average output tokens": 1456, | |
"All tokens": 629545, | |
"Cost($)": 0.0409 | |
} | |
} | |
}, | |
"POT": { | |
"gpt-3.5-turbo": { | |
"META": { | |
"Algorithm": "POT", | |
"LLM": "gpt-3.5-turbo", | |
"Eval Date": "2025/01/07" | |
}, | |
"gsm8k": { | |
"Score": 76.88, | |
"Pass rate": 99.24, | |
"X-shot": 8, | |
"Parameters": "", | |
"Samples": 1319, | |
"Total input tokens": 1090418, | |
"Average input tokens": 827, | |
"Total output tokens": 96662, | |
"Average output tokens": 73, | |
"All tokens": 1187080, | |
"Cost($)": 0.6902 | |
}, | |
"AQuA": { | |
"Score": 51.97, | |
"Pass rate": 92.91, | |
"X-shot": 0, | |
"Parameters": "", | |
"Samples": 254, | |
"Total input tokens": 223438, | |
"Average input tokens": 880, | |
"Total output tokens": 29323, | |
"Average output tokens": 115, | |
"All tokens": 252761, | |
"Cost($)": 0.1557 | |
} | |
}, | |
"Doubao-lite-32k": { | |
"META": { | |
"Algorithm": "POT", | |
"LLM": "Doubao-lite-32k", | |
"Eval Date": "2025/01/07" | |
}, | |
"gsm8k": { | |
"Score": 79.15, | |
"Pass rate": 92.65, | |
"X-shot": 8, | |
"Parameters": "", | |
"Samples": 1319, | |
"Total input tokens": 1170038, | |
"Average input tokens": 887, | |
"Total output tokens": 116987, | |
"Average output tokens": 89, | |
"All tokens": 1287025, | |
"Cost($)": 0.0575 | |
}, | |
"AQuA": { | |
"Score": 52.36, | |
"Pass rate": 82.28, | |
"X-shot": 0, | |
"Parameters": "", | |
"Samples": 254, | |
"Total input tokens": 256721, | |
"Average input tokens": 1011, | |
"Total output tokens": 44729, | |
"Average output tokens": 176, | |
"All tokens": 301450, | |
"Cost($)": 0.0142 | |
} | |
} | |
}, | |
"ReAct-Pro": { | |
"gpt-3.5-turbo": { | |
"META": { | |
"Algorithm": "ReAct-Pro", | |
"LLM": "gpt-3.5-turbo", | |
"Eval Date": "2025/01/07" | |
}, | |
"gsm8k": { | |
"Score": 74.91, | |
"Pass rate": 99.39, | |
"X-shot": 8, | |
"Parameters": "max_steps=10", | |
"Samples": 1319, | |
"Total input tokens": 6506164, | |
"Average input tokens": 4933, | |
"Total output tokens": 140122, | |
"Average output tokens": 106, | |
"All tokens": 6646286, | |
"Cost($)": 3.4633 | |
}, | |
"AQuA": { | |
"Score": 64.57, | |
"Pass rate": 98.03, | |
"X-shot": 0, | |
"Parameters": "max_steps=10", | |
"Samples": 254, | |
"Total input tokens": 862614, | |
"Average input tokens": 3396, | |
"Total output tokens": 40973, | |
"Average output tokens": 161, | |
"All tokens": 903587, | |
"Cost($)": 0.4928 | |
} | |
}, | |
"Doubao-lite-32k": { | |
"META": { | |
"Algorithm": "ReAct-Pro", | |
"LLM": "Doubao-lite-32k", | |
"Eval Date": "2025/01/07" | |
}, | |
"gsm8k": { | |
"Score": 85.60, | |
"Pass rate": 99.62, | |
"X-shot": 8, | |
"Parameters": "max_steps=10", | |
"Samples": 1319, | |
"Total input tokens": 5862016, | |
"Average input tokens": 4444, | |
"Total output tokens": 136623, | |
"Average output tokens": 104, | |
"All tokens": 5998639, | |
"Cost($)": 0.2513 | |
}, | |
"AQuA": { | |
"Score": 77.56, | |
"Pass rate": 96.06, | |
"X-shot": 0, | |
"Parameters": "max_steps=10", | |
"Samples": 254, | |
"Total input tokens": 977890, | |
"Average input tokens": 3850, | |
"Total output tokens": 54951, | |
"Average output tokens": 216, | |
"All tokens": 1032841, | |
"Cost($)": 0.0446 | |
} | |
} | |
} | |
} | |
} | |