# open-agent-leaderboard / meta_data.py
# CONSTANTS-URL
OVERALL_MATH_SCORE_FILE = "src/overall_math_score.json"
DETAIL_MATH_SCORE_FILE = "src/detail_math_score.json"
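
# A minimal loading sketch (assumption: the score files are JSON objects; their
# exact schema is defined by the app code that writes them, not by this module).
import json

def load_score_file(path: str) -> dict:
    """Read a score file such as OVERALL_MATH_SCORE_FILE into a Python dict."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
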
# CONSTANTS-TEXT
LEADERBORAD_INTRODUCTION = """# Open Agent Leaderboard
### Welcome to the Open Agent Leaderboard! We share the evaluation results of open agents: CoT, SC-CoT, PoT, ReAct, etc. The agents are implemented with the open-source framework [*OmAgent*](https://github.com/om-ai-lab/OmAgent).
This leaderboard was last updated: {}.
To add your own agent to the leaderboard, please create a PR in [*OmAgent*](https://github.com/om-ai-lab/OmAgent); we will then help with the evaluation and update the leaderboard. For any questions or concerns, please feel free to contact us.
"""
DEFAULT_MATH_BENCH = [
'gsm8k', 'AQuA'
]
# The README file for each benchmark
LEADERBOARD_MD = {}
LEADERBOARD_MD['MATH_MAIN'] = f"""
## Math Task Main Evaluation Results
- Metrics:
  - Avg Score: The average score across all math benchmarks (normalized to 0-100; higher is better).
  - Rank: The average rank across all math benchmarks (lower is better).
  - Score: The evaluation score on each math benchmark (higher is better).
  - Cost: The cost on each math benchmark (lower is better).
- By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}, sorted in descending order of Avg Score.
- IO (Input-Output): The baseline method that directly prompts the model with the question and expects an answer without any intermediate reasoning steps.
- ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
"""
LEADERBOARD_MD['MATH_DETAIL'] = f"""
## Math Task Detailed Evaluation Results
- By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}.
- Metrics:
  - Score: The evaluation score on each math benchmark (higher is better).
  - Pass rate: The percentage of responses that are valid, where a response is valid if it is neither empty nor null.
  - Cost: The cost on each math benchmark (lower is better).
  - Rank: The rank on each math benchmark (lower is better).
- Default parameters: temperature=0.0
- LLM prices:
  - gpt-3.5-turbo:
    - $0.0005 / 1K tokens (input)
    - $0.0015 / 1K tokens (output)
  - Doubao-lite-32k (1 USD = 7.3249 CNY):
    - $0.00004096 / 1K tokens (input)
    - $0.0001 / 1K tokens (output)
- IO (Input-Output) is the baseline method that directly prompts the model with the question and expects an answer without any intermediate reasoning steps. It represents the most basic way of using language models and serves as a reference point for evaluating the effectiveness of other algorithms.
- ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
"""
META_FIELDS = [
'Algorithm', 'LLM', 'Eval Date'
]
DATASETS = [
'gsm8k', 'AQuA'
]
LLM = [
'Doubao-lite-32k', 'gpt-3.5-turbo'
]
ALGORITHMS = [
'IO', 'CoT', 'SC-CoT', 'PoT', 'ReAct-Pro*'
]
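
# A hedged sketch of how these constants could combine into table headers
# (the actual column layout is built elsewhere in the app; the "(Score)" /
# "(Cost)" naming here is purely illustrative).
def detail_table_columns() -> list:
    """Meta columns followed by per-dataset metric columns."""
    columns = list(META_FIELDS)
    for dataset in DATASETS:
        columns += [f"{dataset} (Score)", f"{dataset} (Cost)"]
    return columns
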
CITATION_BUTTON_TEXT = r"""@misc{open-agent-leaderboard,
title={Open Agent Leaderboard},
author={Om AI Lab},
year={2025},
publisher={GitHub},
howpublished={\url{https://github.com/om-ai-lab/open-agent-leaderboard}}
}"""