# Static metadata for the Open Agent Leaderboard UI: score-file locations,
# Markdown text blocks, and the fixed lists used to label results.

# CONSTANTS-URL
# Relative paths of the JSON files holding the overall and per-benchmark scores.
OVERALL_MATH_SCORE_FILE = "src/overall_math_score.json"
DETAIL_MATH_SCORE_FILE = "src/detail_math_score.json"

# CONSTANTS-TEXT
# Markdown shown at the top of the leaderboard. The single `{}` placeholder is
# meant to be filled via `str.format` with the last-updated timestamp, so no
# other literal braces may appear in this string.
LEADERBORAD_INTRODUCTION = """# Open Agent Leaderboard
### Welcome to the Open Agent Leaderboard! We share the evaluation results of open agents: COT, SC_COT, POT, ReAct, etc. The agents are implemented by the OpenSource Framework: [*OmAgent*](https://github.com/om-ai-lab/OmAgent)

This leaderboard was last updated: {}.

To add your own agent to the leaderboard, please create a PR in [*OmAgent*](https://github.com/om-ai-lab/OmAgent), then we will help with the evaluation and updating the leaderboard. For any questions or concerns, please feel free to contact us.
"""

# Correctly spelled alias. The misspelled name above is kept because other
# modules may already import it.
LEADERBOARD_INTRODUCTION = LEADERBORAD_INTRODUCTION

# Benchmarks that the overall results are computed over by default.
DEFAULT_MATH_BENCH = [
    'gsm8k',
    'AQuA',
]

# The README file for each benchmark
LEADERBOARD_MD = {}

# Both entries are f-strings: the default benchmark names are interpolated
# once, at import time.
LEADERBOARD_MD['MATH_MAIN'] = f"""
## Math task main Evaluation Results

- Metrics:
  - Avg Score: The average score on all math Benchmarks (normalized to 0 - 100, the higher the better).
  - Rank: The average rank on all math Benchmarks (the lower the better).
  - Score: The evaluation score on each math Benchmarks (the higher the better).
  - Cost: The cost on each math Benchmarks (the lower the better).
- By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}, sorted by the descending order of Avg Score.
- ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
"""

# Prices below are quoted per 1K tokens: gpt-3.5-turbo is $0.50 per 1M input
# tokens (= $0.0005 per 1K), and the Doubao figure matches 0.0003 CNY per 1K
# at the stated exchange rate. The earlier "/1M tokens" label understated the
# real cost by a factor of 1000.
LEADERBOARD_MD['MATH_DETAIL'] = f"""
## Math task detail Evaluation Results

- By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}
- Metrics:
  - Score: The evaluation score on each math Benchmarks (the higher the better).
  - Pass rate: The percentage of responses that are valid, where a response is valid if it is neither empty nor null.
  - Cost: The cost on each math Benchmarks (the lower the better).
  - Rank: The rank on each math Benchmarks (the lower the better).
- default parameters: temperature=0.0
- LLM prices:
  - gpt-3.5-turbo:
    - 0.0005$/1K tokens (input)
    - 0.0015$/1K tokens (output)
  - Doubao-lite-32k (1 USD = 7.3249 CNY):
    - 0.00004096$/1K tokens (input)
    - 0.0001$/1K tokens (output)
- ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
"""

# Column names that describe a leaderboard row rather than a benchmark result.
META_FIELDS = [
    'Algorithm',
    'LLM',
    'Eval Date',
]

# Benchmarks, models and agent algorithms that appear on the leaderboard.
DATASETS = [
    'gsm8k',
    'AQuA',
]

LLM = [
    'Doubao-lite-32k',
    'gpt-3.5-turbo',
]

ALGORITHMS = [
    'IO',
    'COT',
    'SC_COT',
    'POT',
    'ReAct-Pro*',
]

# BibTeX entry offered to users; a raw string so the braces and any
# backslashes are kept verbatim.
CITATION_BUTTON_TEXT = r"""@article{zhang2024omagent,
  title={OmAgent: A Multi-modal Agent Framework for Complex Video Understanding with Task Divide-and-Conquer},
  author={Zhang, Lu and Zhao, Tiancheng and Ying, Heting and Ma, Yibo and Lee, Kyusong},
  journal={arXiv preprint arXiv:2406.16620},
  year={2024}
}"""