Junming Yang committed on
Commit
612773e
1 Parent(s): 3dc6c39

update leaderboard

Browse files
Files changed (2) hide show
  1. gen_table.py +6 -2
  2. meta_data.py +21 -3
gen_table.py CHANGED
@@ -115,9 +115,13 @@ def BUILD_L2_DF(results, dataset):
115
  # Use the first 5 non-overall fields as required fields
116
  required_fields = overall_fields if len(overall_fields) else non_overall_fields[:5]
117
 
118
- if 'Overall' in overall_fields:
 
 
 
 
119
  df = df.sort_values('Overall')
120
- df = df.iloc[::-1]
121
 
122
  check_box = {}
123
  check_box['essential'] = ['Method', 'Param (B)', 'Language Model', 'Vision Model']
 
115
  # Use the first 5 non-overall fields as required fields
116
  required_fields = overall_fields if len(overall_fields) else non_overall_fields[:5]
117
 
118
+ if dataset == 'OCRBench':
119
+ df = df.sort_values('Final Score')
120
+ elif dataset == 'COCO_VAL':
121
+ df = df.sort_values('CIDEr')
122
+ else:
123
  df = df.sort_values('Overall')
124
+ df = df.iloc[::-1]
125
 
126
  check_box = {}
127
  check_box['essential'] = ['Method', 'Param (B)', 'Language Model', 'Vision Model']
meta_data.py CHANGED
@@ -17,7 +17,7 @@ LEADERBORAD_INTRODUCTION = """# OpenVLM Leaderboard
17
 
18
  This leaderboard was last updated: {}.
19
 
20
- OpenVLM Leaderboard only includes open-source VLMs or API models that are publicly available. To add your own model to the leaderboard, please create a PR in [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) to support your VLM and then we will help with the evaluation and updating the leaderboard. For any questions or concerns, please feel free to contact us at [opencompass,duanhaodong]@pjlab.org.cn.
21
  """
22
  # CONSTANTS-FIELDS
23
  META_FIELDS = ['Method', 'Param (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified']
@@ -133,7 +133,7 @@ LEADERBOARD_MD['LLaVABench'] = """
133
  LEADERBOARD_MD['COCO_VAL'] = """
134
  ## COCO Caption Results
135
 
136
- - By default, we evaluate COCO Caption Validation set (5000 samples), and report the following metrics: `BLEU-1, BLEU-4, CIDEr, ROUGE-L
137
  - We use the following prompt to evaluate all VLMs: `Please describe this image in general. Directly provide the description, do not include prefix like "This image depicts". `
138
  - **No specific prompt is adopted for all VLMs.**
139
  """
@@ -195,4 +195,22 @@ LEADERBOARD_MD['POPE'] = """
195
  - Note that the official POPE dataset contains approximately 8910 cases. POPE includes three tracks, and there are some overlapping samples among the three tracks. To reduce the data file size, we have kept only a single copy of the overlapping samples (about 5127 examples). However, the final accuracy will be calculated on the ~9k samples.
196
  - Some API models, due to safety policies, refuse to answer certain questions, so their actual capabilities may be higher than the reported scores.
197
  - We report the average F1 score across the three types of data as the overall score. Accuracy, precision, and recall are also shown in the table. F1 score = 2 * (precision * recall) / (precision + recall).
198
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  This leaderboard was last updated: {}.
19
 
20
+ OpenVLM Leaderboard only includes open-source VLMs or API models that are publicly available. To add your own model to the leaderboard, please create a PR in [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) to support your VLM and then we will help with the evaluation and updating the leaderboard. For any questions or concerns, please feel free to contact us at [opencompass, duanhaodong]@pjlab.org.cn.
21
  """
22
  # CONSTANTS-FIELDS
23
  META_FIELDS = ['Method', 'Param (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified']
 
133
  LEADERBOARD_MD['COCO_VAL'] = """
134
  ## COCO Caption Results
135
 
136
+ - By default, we evaluate COCO Caption Validation set (5000 samples), and report the following metrics: BLEU-1, BLEU-4, CIDEr, ROUGE-L (default sorted by CIDEr).
137
  - We use the following prompt to evaluate all VLMs: `Please describe this image in general. Directly provide the description, do not include prefix like "This image depicts". `
138
  - **No specific prompt is adopted for all VLMs.**
139
  """
 
195
  - Note that the official POPE dataset contains approximately 8910 cases. POPE includes three tracks, and there are some overlapping samples among the three tracks. To reduce the data file size, we have kept only a single copy of the overlapping samples (about 5127 examples). However, the final accuracy will be calculated on the ~9k samples.
196
  - Some API models, due to safety policies, refuse to answer certain questions, so their actual capabilities may be higher than the reported scores.
197
  - We report the average F1 score across the three types of data as the overall score. Accuracy, precision, and recall are also shown in the table. F1 score = 2 * (precision * recall) / (precision + recall).
198
+ """
199
+
200
+ LEADERBOARD_MD['SEEDBench2_Plus'] = """
201
+ ## SEEDBench2 Plus Evaluation Results
202
+
203
+ - SEEDBench2 Plus comprises 2.3K multiple-choice questions with precise human annotations, spanning three broad categories: Charts, Maps, and Webs, each of which covers a wide spectrum of text-rich scenarios in the real world.
204
+ """
205
+
206
+ LEADERBOARD_MD['MMTBench_VAL'] = """
207
+ ## MMTBench Validation Evaluation Results
208
+
209
+ - MMT-Bench comprises 31,325 meticulously curated multi-choice visual questions from various multimodal scenarios such as vehicle driving and embodied navigation, covering 32 core meta-tasks and 162 subtasks in multimodal understanding.
210
+ """
211
+
212
+ LEADERBOARD_MD['SEEDBench2'] = """
213
+ ## SEEDBench2 Evaluation Results
214
+
215
+ - SEEDBench2 comprises 24K multiple-choice questions with accurate human annotations, spanning 27 dimensions, including the evaluation of both text and image generation.
216
+ """