# CONSTANTS-URL
URL = "http://opencompass.openxlab.space/assets/OpenVLM.json"
OVERALL_MATH_SCORE_FILE = "overall_math_score.json"
DETAIL_MATH_SCORE_FILE = "detail_math_score.json"
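# Illustrative sketch (not part of the original module): one way the
# constants above might be consumed. The helper name and the local-file
# fallback are assumptions, not the leaderboard's actual loading code.
import json
import urllib.request


def load_leaderboard_data(url: str = URL) -> dict:
    """Fetch the leaderboard JSON from `url`; fall back to the local
    overall-score file if the request fails."""
    try:
        with urllib.request.urlopen(url, timeout=10) as resp:
            return json.loads(resp.read().decode("utf-8"))
    except OSError:
        with open(OVERALL_MATH_SCORE_FILE, encoding="utf-8") as f:
            return json.load(f)
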
# CONSTANTS-TEXT
LEADERBOARD_INTRODUCTION = """# Open Agent Leaderboard
### Welcome to the Open Agent Leaderboard! We share the evaluation results of open agents: COT, SC_COT, POT, ReAct, etc. The agents are implemented with the open-source framework [*OmAgent*](https://github.com/om-ai-lab/OmAgent).

This leaderboard was last updated: {}. 

To add your own agent to the leaderboard, please create a PR in [*OmAgent*](https://github.com/om-ai-lab/OmAgent); we will then help with the evaluation and update the leaderboard. For any questions or concerns, please feel free to contact us.
"""

DEFAULT_MATH_BENCH = [
    'gsm8k', 'AQuA'
]
# The README file for each benchmark
LEADERBOARD_MD = {}

LEADERBOARD_MD['MATH_MAIN'] = f"""
## Math Task Main Evaluation Results

- Metrics:
  - Avg Score: the average score across all math benchmarks (normalized to 0-100; higher is better).
  - Rank: the average rank across all math benchmarks (lower is better).
  - Score: the evaluation score on each math benchmark (higher is better).
  - Cost: the cost on each math benchmark (lower is better).

- By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}, sorted in descending order of Avg Score.
"""

LEADERBOARD_MD['MATH_DETAIL'] = f"""
## Math Task Detailed Evaluation Results

- By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}.
- Default parameters: temperature=0.0.
- LLM prices:
  - gpt-3.5-turbo:
    - $0.0005 / 1K tokens (input)
    - $0.0015 / 1K tokens (output)
  - Doubao-lite-32k (1 USD = 7.3249 CNY):
    - $0.00004096 / 1K tokens (input)
    - $0.0001 / 1K tokens (output)
- ReAct-Pro*: we modified ReAct into ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
"""

META_FIELDS = [
    'Algorithm', 'LLM', 'Eval Date'
]

DATASETS = [
    'gsm8k', 'AQuA'
]

LLM = [
    'Doubao-lite-32k', 'gpt-3.5-turbo'
]

ALGORITHMS = [
    'IO', 'COT', 'SC_COT', 'POT', 'ReAct-Pro*'
]
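
# Illustrative sketch (assumption): how the lists above might drive the
# leaderboard table, with META_FIELDS as fixed columns and each dataset in
# DATASETS contributing Score and Cost columns. The helper name is hypothetical.
def build_table_columns(datasets=DATASETS):
    columns = list(META_FIELDS) + ['Avg Score', 'Rank']
    for dataset in datasets:
        columns += [f'{dataset} Score', f'{dataset} Cost']
    return columns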

CITATION_BUTTON_TEXT = r"""@article{zhang2024omagent,
  title={OmAgent: A Multi-modal Agent Framework for Complex Video Understanding with Task Divide-and-Conquer},
  author={Zhang, Lu and Zhao, Tiancheng and Ying, Heting and Ma, Yibo and Lee, Kyusong},
  journal={arXiv preprint arXiv:2406.16620},
  year={2024}
}"""