kennymckormick committed on
Commit
ee3f06f
·
1 Parent(s): 583df3c
Files changed (3) hide show
  1. app.py +14 -21
  2. gen_table.py +91 -81
  3. meta_data.py +14 -217
app.py CHANGED
@@ -6,24 +6,20 @@ from gen_table import *
6
  from meta_data import *
7
 
8
  with gr.Blocks() as demo:
9
- struct = load_results()
10
- timestamp = struct['time']
11
- EVAL_TIME = format_timestamp(timestamp)
12
- results = struct['results']
13
- N_MODEL = len(results)
14
- N_DATA = len(results['LLaVA-v1.5-7B']) - 1
15
- DATASETS = list(results['LLaVA-v1.5-7B'])
16
- DATASETS.remove('META')
17
- print(DATASETS)
18
-
19
- gr.Markdown(LEADERBORAD_INTRODUCTION.format(N_MODEL, N_DATA, EVAL_TIME))
20
  structs = [abc.abstractproperty() for _ in range(N_DATA)]
21
 
 
 
22
  with gr.Tabs(elem_classes='tab-buttons') as tabs:
23
- with gr.TabItem('🏅 OpenVLM Main Leaderboard', elem_id='main', id=0):
24
- gr.Markdown(LEADERBOARD_MD['MAIN'])
25
- _, check_box = BUILD_L1_DF(results, MAIN_FIELDS)
26
- table = generate_table(results, DEFAULT_BENCH)
27
  table['Rank'] = list(range(1, len(table) + 1))
28
 
29
  type_map = check_box['type_map']
@@ -58,11 +54,9 @@ with gr.Blocks() as demo:
58
  visible=True)
59
 
60
  def filter_df(fields, model_size, model_type):
61
- filter_list = ['Avg Score', 'Avg Rank', 'OpenSource', 'Verified']
62
  headers = ['Rank'] + check_box['essential'] + fields
63
 
64
- new_fields = [field for field in fields if field not in filter_list]
65
- df = generate_table(results, new_fields)
66
 
67
  df['flag'] = [model_size_flag(x, model_size) for x in df['Param (B)']]
68
  df = df[df['flag']]
@@ -85,12 +79,11 @@ with gr.Blocks() as demo:
85
  cbox.change(fn=filter_df, inputs=[checkbox_group, model_size, model_type], outputs=data_component)
86
 
87
  with gr.TabItem('🔍 About', elem_id='about', id=1):
88
- gr.Markdown(urlopen(VLMEVALKIT_README).read().decode())
89
 
 
90
  for i, dataset in enumerate(DATASETS):
91
  with gr.TabItem(f'📊 {dataset} Leaderboard', elem_id=dataset, id=i + 2):
92
- if dataset in LEADERBOARD_MD:
93
- gr.Markdown(LEADERBOARD_MD[dataset])
94
 
95
  s = structs[i]
96
  s.table, s.check_box = BUILD_L2_DF(results, dataset)
 
6
  from meta_data import *
7
 
8
  with gr.Blocks() as demo:
9
+ results = load_results()
10
+ for k in results:
11
+ val = results[k]
12
+ val.pop('key')
13
+ N_DATA = 5
 
 
 
 
 
 
14
  structs = [abc.abstractproperty() for _ in range(N_DATA)]
15
 
16
+ gr.Markdown(LEADERBORAD_INTRODUCTION)
17
+
18
  with gr.Tabs(elem_classes='tab-buttons') as tabs:
19
+ with gr.TabItem('🏅 MMBench Leaderboard', elem_id='main', id=0):
20
+ _, check_box = BUILD_L1_DF(results)
21
+
22
+ table = generate_table(results)
23
  table['Rank'] = list(range(1, len(table) + 1))
24
 
25
  type_map = check_box['type_map']
 
54
  visible=True)
55
 
56
  def filter_df(fields, model_size, model_type):
 
57
  headers = ['Rank'] + check_box['essential'] + fields
58
 
59
+ df = generate_table(results)
 
60
 
61
  df['flag'] = [model_size_flag(x, model_size) for x in df['Param (B)']]
62
  df = df[df['flag']]
 
79
  cbox.change(fn=filter_df, inputs=[checkbox_group, model_size, model_type], outputs=data_component)
80
 
81
  with gr.TabItem('🔍 About', elem_id='about', id=1):
82
+ gr.Markdown(urlopen(MMBench_README).read().decode())
83
 
84
+ DATASETS = ['MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11', 'CCBench', 'MMBench_TEST_EN', 'MMBench_TEST_CN']
85
  for i, dataset in enumerate(DATASETS):
86
  with gr.TabItem(f'📊 {dataset} Leaderboard', elem_id=dataset, id=i + 2):
 
 
87
 
88
  s = structs[i]
89
  s.table, s.check_box = BUILD_L2_DF(results, dataset)
gen_table.py CHANGED
@@ -7,7 +7,7 @@ import gradio as gr
7
  import numpy as np
8
  import pandas as pd
9
 
10
- from meta_data import DEFAULT_BENCH, META_FIELDS, URL
11
 
12
 
13
  def listinstr(lst, s):
@@ -18,26 +18,41 @@ def listinstr(lst, s):
18
  return False
19
 
20
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  def load_results():
22
  data = json.loads(urlopen(URL).read())
23
- return data
 
 
 
 
 
 
 
 
24
 
25
 
26
  def nth_large(val, vals):
27
  return sum([1 for v in vals if v > val]) + 1
28
 
29
 
30
- def format_timestamp(timestamp):
31
- date = timestamp[:2] + '.' + timestamp[2:4] + '.' + timestamp[4:6]
32
- time = timestamp[6:8] + ':' + timestamp[8:10] + ':' + timestamp[10:12]
33
- return date + ' ' + time
34
-
35
-
36
  def model_size_flag(sz, FIELDS):
37
  if pd.isna(sz) and 'Unknown' in FIELDS:
38
  return True
39
  if pd.isna(sz):
40
  return False
 
41
  if '<4B' in FIELDS and sz < 4:
42
  return True
43
  if '4B-10B' in FIELDS and sz >= 4 and sz < 10:
@@ -52,28 +67,29 @@ def model_size_flag(sz, FIELDS):
52
 
53
 
54
  def model_type_flag(line, FIELDS):
55
- if 'OpenSource' in FIELDS and line['OpenSource'] == 'Yes':
56
- return True
57
- if 'API' in FIELDS and line['OpenSource'] == 'No' and line['Verified'] == 'Yes':
58
  return True
59
- if 'Proprietary' in FIELDS and line['OpenSource'] == 'No' and line['Verified'] == 'No':
 
 
60
  return True
61
  return False
62
 
63
 
64
- def BUILD_L1_DF(results, fields):
65
  check_box = {}
66
- check_box['essential'] = ['Method', 'Param (B)', 'Language Model', 'Vision Model']
67
  # revise there to set default dataset
68
- check_box['required'] = ['Avg Score', 'Avg Rank'] + DEFAULT_BENCH
69
- check_box['avg'] = ['Avg Score', 'Avg Rank']
70
- check_box['all'] = check_box['avg'] + fields
71
  type_map = defaultdict(lambda: 'number')
72
  type_map['Method'] = 'html'
73
- type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
 
74
  check_box['type_map'] = type_map
75
 
76
- df = generate_table(results, fields)
77
  return df, check_box
78
 
79
 
@@ -85,112 +101,106 @@ def BUILD_L2_DF(results, dataset):
85
 
86
  non_overall_fields = [x for x in fields if 'Overall' not in x]
87
  overall_fields = [x for x in fields if 'Overall' in x]
88
- if dataset == 'MME':
89
- non_overall_fields = [x for x in non_overall_fields if not listinstr(['Perception', 'Cognition'], x)]
90
- overall_fields = overall_fields + ['Perception', 'Cognition']
91
- if dataset == 'OCRBench':
92
- non_overall_fields = [x for x in non_overall_fields if not listinstr(['Final Score'], x)]
93
- overall_fields = ['Final Score']
94
 
95
  for m in results:
96
  item = results[m]
97
  if dataset not in item:
98
  continue
99
- meta = item['META']
100
  for k in META_FIELDS:
101
  if k == 'Param (B)':
102
- param = meta['Parameters']
103
  res[k].append(float(param.replace('B', '')) if param != '' else None)
104
  elif k == 'Method':
105
- name, url = meta['Method']
106
  res[k].append(f'<a href="{url}">{name}</a>')
107
  else:
108
- res[k].append(meta[k])
109
- fields = [x for x in fields]
 
110
 
111
- for d in non_overall_fields:
112
- res[d].append(item[dataset][d])
113
  for d in overall_fields:
114
- res[d].append(item[dataset][d])
115
-
 
 
116
  df = pd.DataFrame(res)
117
  all_fields = overall_fields + non_overall_fields
118
  # Use the first 5 non-overall fields as required fields
119
  required_fields = overall_fields if len(overall_fields) else non_overall_fields[:5]
120
 
121
- if dataset == 'OCRBench':
122
- df = df.sort_values('Final Score')
123
- elif dataset == 'COCO_VAL':
124
- df = df.sort_values('CIDEr')
125
- else:
126
- df = df.sort_values('Overall')
127
  df = df.iloc[::-1]
128
 
129
  check_box = {}
130
- check_box['essential'] = ['Method', 'Param (B)', 'Language Model', 'Vision Model']
131
  check_box['required'] = required_fields
132
  check_box['all'] = all_fields
133
  type_map = defaultdict(lambda: 'number')
134
  type_map['Method'] = 'html'
135
- type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
 
136
  check_box['type_map'] = type_map
137
  return df, check_box
138
 
139
 
140
- def generate_table(results, fields):
141
-
142
- def get_mmbench_v11(item):
143
- assert 'MMBench_TEST_CN_V11' in item and 'MMBench_TEST_EN_V11' in item
144
- val = (item['MMBench_TEST_CN_V11']['Overall'] + item['MMBench_TEST_EN_V11']['Overall']) / 2
145
- val = float(f'{val:.1f}')
146
- return val
147
 
148
  res = defaultdict(list)
149
  for i, m in enumerate(results):
150
  item = results[m]
151
- meta = item['META']
152
  for k in META_FIELDS:
153
  if k == 'Param (B)':
154
- param = meta['Parameters']
155
  res[k].append(float(param.replace('B', '')) if param != '' else None)
156
  elif k == 'Method':
157
- name, url = meta['Method']
158
  res[k].append(f'<a href="{url}">{name}</a>')
159
- res['name'].append(name)
160
  else:
161
- res[k].append(meta[k])
162
- scores, ranks = [], []
163
- for d in fields:
 
 
 
164
  key_name = 'Overall' if d != 'OCRBench' else 'Final Score'
165
  # Every Model should have MMBench_V11 results
166
- if d == 'MMBench_V11':
167
- val = get_mmbench_v11(item)
168
- res[d].append(val)
169
- scores.append(val)
170
- ranks.append(nth_large(val, [get_mmbench_v11(x) for x in results.values()]))
171
- elif d in item:
172
- res[d].append(item[d][key_name])
173
- if d == 'MME':
174
- scores.append(item[d][key_name] / 28)
175
- elif d == 'OCRBench':
176
- scores.append(item[d][key_name] / 10)
177
  else:
178
- scores.append(item[d][key_name])
179
- ranks.append(nth_large(item[d][key_name], [x[d][key_name] for x in results.values() if d in x]))
 
 
 
 
 
 
 
 
 
 
 
180
  else:
181
  res[d].append(None)
182
- scores.append(None)
183
- ranks.append(None)
184
-
185
- res['Avg Score'].append(round(np.mean(scores), 1) if None not in scores else None)
186
- res['Avg Rank'].append(round(np.mean(ranks), 2) if None not in ranks else None)
187
 
188
  df = pd.DataFrame(res)
189
- valid, missing = df[~pd.isna(df['Avg Score'])], df[pd.isna(df['Avg Score'])]
190
- valid = valid.sort_values('Avg Score')
191
- valid = valid.iloc[::-1]
192
- if len(fields):
193
- missing = missing.sort_values('MMBench_V11' if 'MMBench_V11' in fields else fields[0])
194
- missing = missing.iloc[::-1]
195
- df = pd.concat([valid, missing])
 
 
 
 
 
 
 
 
196
  return df
 
7
  import numpy as np
8
  import pandas as pd
9
 
10
+ from meta_data import MMBENCH_FIELDS, META_FIELDS, URL
11
 
12
 
13
  def listinstr(lst, s):
 
18
  return False
19
 
20
 
21
+ def upper_key(k):
22
+ if k == 'ocr':
23
+ return 'OCR'
24
+ elif '_' in k:
25
+ k = k.split('_')
26
+ k = [x[0].upper() + x[1:] for x in k]
27
+ k = ' '.join(k)
28
+ return k
29
+ else:
30
+ return k
31
+
32
+
33
  def load_results():
34
  data = json.loads(urlopen(URL).read())
35
+ names = ['MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11', 'CCBench', 'MMBench_TEST_EN', 'MMBench_TEST_CN']
36
+ skip_keys = ['Method', 'Parameters', 'Language Model', 'Vision Model', 'Org', 'Time', 'Verified', 'OpenSource', 'key']
37
+ META_MAP = data['META_MAP']
38
+ for n in names:
39
+ print(n)
40
+ res_map = {x['Method'][0]: {upper_key(k): v for k, v in x.items() if k not in skip_keys} for x in data[n + '_Data']}
41
+ for r in res_map:
42
+ META_MAP[r][n] = res_map[r]
43
+ return META_MAP
44
 
45
 
46
  def nth_large(val, vals):
47
  return sum([1 for v in vals if v > val]) + 1
48
 
49
 
 
 
 
 
 
 
50
  def model_size_flag(sz, FIELDS):
51
  if pd.isna(sz) and 'Unknown' in FIELDS:
52
  return True
53
  if pd.isna(sz):
54
  return False
55
+ sz = int(sz)
56
  if '<4B' in FIELDS and sz < 4:
57
  return True
58
  if '4B-10B' in FIELDS and sz >= 4 and sz < 10:
 
67
 
68
 
69
  def model_type_flag(line, FIELDS):
70
+ if 'Public' in FIELDS and line['OpenSource'] == 'Yes':
 
 
71
  return True
72
+ if 'Private' in FIELDS and line['OpenSource'] == 'No':
73
+ return True
74
+ if 'Verified' in FIELDS and line['Verified'] == 'Yes':
75
  return True
76
  return False
77
 
78
 
79
+ def BUILD_L1_DF(results):
80
  check_box = {}
81
+ check_box['essential'] = ['Method', 'Org', 'Param (B)', 'Language Model', 'Vision Model']
82
  # revise there to set default dataset
83
+ check_box['required'] = ['MMBench_TEST_V11', 'MMBench_TEST', 'CCBench']
84
+ check_box['avg'] = ['MMBench_TEST_V11', 'MMBench_TEST']
85
+ check_box['all'] = check_box['avg'] + MMBENCH_FIELDS
86
  type_map = defaultdict(lambda: 'number')
87
  type_map['Method'] = 'html'
88
+ type_map['Language Model'] = type_map['Vision Model'] = type_map['Org'] = 'html'
89
+ type_map['OpenSource'] = type_map['Verified'] = 'str'
90
  check_box['type_map'] = type_map
91
 
92
+ df = generate_table(results)
93
  return df, check_box
94
 
95
 
 
101
 
102
  non_overall_fields = [x for x in fields if 'Overall' not in x]
103
  overall_fields = [x for x in fields if 'Overall' in x]
 
 
 
 
 
 
104
 
105
  for m in results:
106
  item = results[m]
107
  if dataset not in item:
108
  continue
 
109
  for k in META_FIELDS:
110
  if k == 'Param (B)':
111
+ param = item['Parameters']
112
  res[k].append(float(param.replace('B', '')) if param != '' else None)
113
  elif k == 'Method':
114
+ name, url = item['Method']
115
  res[k].append(f'<a href="{url}">{name}</a>')
116
  else:
117
+ s = item[k].replace('\n', '<br>')
118
+ s = s.replace(' & ', '<br>')
119
+ res[k].append(s)
120
 
 
 
121
  for d in overall_fields:
122
+ res[d].append(float(item[dataset][d]))
123
+ for d in non_overall_fields:
124
+ res[d].append(float(item[dataset][d]))
125
+
126
  df = pd.DataFrame(res)
127
  all_fields = overall_fields + non_overall_fields
128
  # Use the first 5 non-overall fields as required fields
129
  required_fields = overall_fields if len(overall_fields) else non_overall_fields[:5]
130
 
131
+ df = df.sort_values('Overall')
 
 
 
 
 
132
  df = df.iloc[::-1]
133
 
134
  check_box = {}
135
+ check_box['essential'] = ['Method', 'Org', 'Param (B)', 'Language Model', 'Vision Model']
136
  check_box['required'] = required_fields
137
  check_box['all'] = all_fields
138
  type_map = defaultdict(lambda: 'number')
139
  type_map['Method'] = 'html'
140
+ type_map['Language Model'] = type_map['Vision Model'] = type_map['Org'] = 'html'
141
+ type_map['OpenSource'] = type_map['Verified'] = 'str'
142
  check_box['type_map'] = type_map
143
  return df, check_box
144
 
145
 
146
+ def generate_table(results):
 
 
 
 
 
 
147
 
148
  res = defaultdict(list)
149
  for i, m in enumerate(results):
150
  item = results[m]
 
151
  for k in META_FIELDS:
152
  if k == 'Param (B)':
153
+ param = item['Parameters']
154
  res[k].append(float(param.replace('B', '')) if param != '' else None)
155
  elif k == 'Method':
156
+ name, url = item['Method']
157
  res[k].append(f'<a href="{url}">{name}</a>')
 
158
  else:
159
+ s = item[k].replace('\n', '<br>')
160
+ s = s.replace(' & ', '<br>')
161
+ res[k].append(s)
162
+
163
+
164
+ for d in ['MMBench_TEST_V11', 'MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11', 'CCBench', 'MMBench_TEST', 'MMBench_TEST_EN', 'MMBench_TEST_CN']:
165
  key_name = 'Overall' if d != 'OCRBench' else 'Final Score'
166
  # Every Model should have MMBench_V11 results
167
+ if d == 'MMBench_TEST_V11':
168
+ if 'MMBench_TEST_EN_V11' in item and 'MMBench_TEST_CN_V11' in item:
169
+ val = item['MMBench_TEST_EN_V11'][key_name] + item['MMBench_TEST_CN_V11'][key_name]
170
+ val = val / 2
171
+ val = float(f'{val:.1f}')
172
+ res[d].append(val)
 
 
 
 
 
173
  else:
174
+ res[d].append(None)
175
+ elif d == 'MMBench_TEST':
176
+ if 'MMBench_TEST_EN' in item and 'MMBench_TEST_CN' in item:
177
+ val = float(item['MMBench_TEST_EN'][key_name]) + float(item['MMBench_TEST_CN'][key_name])
178
+ val = val / 2
179
+ val = float(f'{val:.1f}')
180
+ res[d].append(val)
181
+ else:
182
+ res[d].append(None)
183
+ elif d in item:
184
+ val = float(item[d][key_name])
185
+ val = float(f'{val:.1f}')
186
+ res[d].append(val)
187
  else:
188
  res[d].append(None)
 
 
 
 
 
189
 
190
  df = pd.DataFrame(res)
191
+ df_list = []
192
+ for k in [
193
+ 'MMBench_TEST_V11', 'MMBench_TEST',
194
+ 'MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11',
195
+ 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench'
196
+ ]:
197
+ if len(df) == 0:
198
+ break
199
+ valid, missing = df[~pd.isna(df[k])], df[pd.isna(df[k])]
200
+ valid = valid.sort_values(k)
201
+ valid = valid.iloc[::-1]
202
+ df_list.append(valid)
203
+ df = missing
204
+
205
+ df = pd.concat(df_list)
206
  return df
meta_data.py CHANGED
@@ -1,227 +1,24 @@
1
  # CONSTANTS-URL
2
- URL = "http://opencompass.openxlab.space/assets/OpenVLM.json"
3
- VLMEVALKIT_README = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/README.md'
4
  # CONSTANTS-CITATION
5
- CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
6
- title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
7
- author={OpenCompass Contributors},
8
- howpublished = {\url{https://github.com/open-compass/opencompass}},
9
- year={2023}
10
  }"""
11
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
12
  # CONSTANTS-TEXT
13
- LEADERBORAD_INTRODUCTION = """# OpenVLM Leaderboard
14
- ### Welcome to the OpenVLM Leaderboard! On this leaderboard we share the evaluation results of VLMs obtained by the OpenSource Framework:
15
- ### [*VLMEvalKit*: A Toolkit for Evaluating Large Vision-Language Models](https://github.com/open-compass/VLMEvalKit) 🏆
16
- ### Currently, OpenVLM Leaderboard covers {} different VLMs (including GPT-4v, Gemini, QwenVLPlus, LLaVA, etc.) and {} different multi-modal benchmarks.
17
 
18
- This leaderboard was last updated: {}.
19
-
20
- OpenVLM Leaderboard only includes open-source VLMs or API models that are publicly available. To add your own model to the leaderboard, please create a PR in [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) to support your VLM and then we will help with the evaluation and updating the leaderboard. For any questions or concerns, please feel free to contact us at [opencompass, duanhaodong]@pjlab.org.cn.
21
  """
 
22
  # CONSTANTS-FIELDS
23
- META_FIELDS = ['Method', 'Param (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified']
24
- MAIN_FIELDS = [
25
- 'MMBench_V11', 'MMStar', 'MME',
26
- 'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
27
- 'HallusionBench', 'SEEDBench_IMG', 'MMVet',
28
- 'LLaVABench', 'CCBench', 'RealWorldQA', 'POPE', 'ScienceQA_TEST',
29
- 'SEEDBench2_Plus', 'MMT-Bench_VAL', 'BLINK'
30
- ]
31
- DEFAULT_BENCH = [
32
- 'MMBench_V11', 'MMStar', 'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
33
- 'HallusionBench', 'MMVet'
34
- ]
35
  MMBENCH_FIELDS = ['MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11', 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench']
36
  MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
37
- MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']
38
-
39
- # The README file for each benchmark
40
- LEADERBOARD_MD = {}
41
-
42
- LEADERBOARD_MD['MAIN'] = f"""
43
- ## Main Evaluation Results
44
-
45
- - Metrics:
46
- - Avg Score: The average score on all VLM Benchmarks (normalized to 0 - 100, the higher the better).
47
- - Avg Rank: The average rank on all VLM Benchmarks (the lower the better).
48
- - Avg Score & Rank are calculated based on selected benchmark. **When results for some selected benchmarks are missing, Avg Score / Rank will be None!!!**
49
- - By default, we present the overall evaluation results based on {len(DEFAULT_BENCH)} VLM benchmarks, sorted by the descending order of Avg Score.
50
- - The following datasets are included in the main results: {', '.join(DEFAULT_BENCH)}.
51
- - Detailed evaluation results for each dataset (included or not included in main) are provided in the consequent tabs.
52
- """
53
-
54
- for dataset in ['MMBench_DEV_CN', 'MMBench_TEST_CN', 'MMBench_DEV_EN', 'MMBench_TEST_EN', 'CCBench']:
55
- LEADERBOARD_MD[dataset] = f"""
56
- ## {dataset.replace('_', ' ')} Evaluation Results
57
-
58
- - We adopt Circular Eval for benchmarks in MMBench series, you can check https://arxiv.org/pdf/2307.06281.pdf for the detailed definition of Circular Eval.
59
- """
60
-
61
- LEADERBOARD_MD['SEEDBench_IMG'] = """
62
- ## SEEDBench_IMG Scores (ChatGPT Answer Extraction / Official Leaderboard)
63
-
64
- - **Overall**: The overall accuracy across all questions with **ChatGPT answer matching**.
65
- - **Overall (official)**: SEEDBench_IMG acc on the official leaderboard (if applicable).
66
- """
67
-
68
- LEADERBOARD_MD['MMVet'] = """
69
- ## MMVet Evaluation Results
70
-
71
- - In MMVet Evaluation, we use GPT-4-Turbo (gpt-4-1106-preview) as the judge LLM to assign scores to the VLM outputs. We only perform the evaluation once due to the limited variance among results of multiple evaluation pass originally reported.
72
- - No specific prompt template adopted for **ALL VLMs**.
73
- - We also provide performance on the [**Official Leaderboard**](https://paperswithcode.com/sota/visual-question-answering-on-mm-vet) for models that are applicable. Those results are obtained with GPT-4-0314 evaluator (which has been deperacted for new users).
74
- """
75
-
76
- LEADERBOARD_MD['MMMU_VAL'] = """
77
- ## MMMU Validation Evaluation Results
78
-
79
- - For MMMU, we support the evaluation of the `dev` (150 samples) and `validation` (900 samples) set. Here we only report the results on the `validation` set.
80
- - **Answer Inference:**
81
- - For models with `interleave_generate` interface (accept interleaved images & texts as inputs), all testing samples can be inferred. **`interleave_generate` is adopted for inference.**
82
- - For models without `interleave_generate` interface, samples with more than one images are skipped (42 out of 1050, directly count as wrong). **`generate` is adopted for inference.**
83
- - **Evaluation**:
84
- - MMMU include two types of questions: **multi-choice questions** & **open-ended QA**.
85
- - For **open-ended QA (62/1050)**, we re-formulate it as multi-choice questions: `{'question': 'QQQ', 'answer': 'AAA'} -> {'question': 'QQQ', 'A': 'AAA', 'B': 'Other Answers', 'answer': 'A'}`, and then adopt the same evaluation paradigm for **multi-choice questions**.
86
- - For **multi-choice questions (988/1050)**, we use **GPT-3.5-Turbo-0613** for matching prediction with options if heuristic matching does not work.
87
- """
88
-
89
- LEADERBOARD_MD['MathVista'] = """
90
- ## MMMU TestMini Evaluation Results
91
-
92
- - We report the evaluation results on MathVista **TestMini**, which include 1000 test samples.
93
- - We adopt `GPT-4-Turbo (1106)` as the answer extractor when we failed to extract the answer with heuristic matching.
94
- - The performance of **Human (High school)** and **Random Choice** are copied from the official leaderboard.
95
- **Category Definitions:** **FQA:** figure QA, **GPS:** geometry problem solving, **MWP:** math word problem, **TQA:** textbook QA, **VQA:** visual QA, **ALG:** algebraic, **ARI:** arithmetic, **GEO:** geometry, **LOG:** logical , **NUM:** numeric, **SCI:** scientific, **STA:** statistical.
96
- """
97
-
98
- LEADERBOARD_MD['HallusionBench'] = """
99
- [**HallusionBench**](https://github.com/tianyi-lab/HallusionBench) is a benchmark to evaluate hallucination of VLMs. It asks a set of visual questions with one original image and one modified image (the answers for a question can be different, considering the image content).
100
-
101
- **Examples in HallusionBench:**
102
-
103
- | Original Figure | Modified Figure |
104
- | ------------------------------------------------------------ | ------------------------------------------------------------ |
105
- | ![](http://opencompass.openxlab.space/utils/Hallu0.png) | ![](http://opencompass.openxlab.space/utils/Hallu1.png) |
106
- | **Q1.** Is the right orange circle the same size as the left orange circle? **A1. Yes** | **Q1.** Is the right orange circle the same size as the left orange circle? **A1. No** |
107
- | **Q2.** Is the right orange circle larger than the left orange circle? **A2. No** | **Q2.** Is the right orange circle larger than the left orange circle? **A2. Yes** |
108
- | **Q3.** Is the right orange circle smaller than the left orange circle? **A3. No** | **Q3.** Is the right orange circle smaller than the left orange circle? **A3. No** |
109
-
110
- **Metrics**:
111
-
112
- >- aAcc: The overall accuracy of **all** atomic questions.
113
- >
114
- >- qAcc: The mean accuracy of unique **questions**. One question can be asked multiple times with different figures, we consider VLM correctly solved a unique question only if it succeeds in all <question, figure> pairs for this unique question.
115
- >- fAcc: The mean accuracy of all **figures**. One figure is associated with multiple questions, we consider VLM correct on a figure only if it succeeds to solve all questions of this figure.
116
-
117
- **Evaluation Setting**:
118
-
119
- > 1. **No-visual** Questions (questions asked without the associated figure) in HallusionBench are **skipped** during evaluation.
120
- > 2. When we failed to extract Yes / No from the VLM prediction, we adopt **GPT-3.5-Turbo-0613** as the answer extractor.
121
- > 3. We report aAcc, qAcc, and fAcc for all evaluated VLMs.
122
-
123
- ## HallusionBench Evaluation Results
124
- """
125
-
126
- LEADERBOARD_MD['LLaVABench'] = """
127
- ## LLaVABench Evaluation Results
128
-
129
- - In LLaVABench Evaluation, we use GPT-4-Turbo (gpt-4-1106-preview) as the judge LLM to assign scores to the VLM outputs. We only perform the evaluation once due to the limited variance among results of multiple evaluation pass originally reported.
130
- - No specific prompt template adopted for **ALL VLMs**.
131
- - We also include the official results (obtained by gpt-4-0314) for applicable models.
132
- """
133
-
134
- LEADERBOARD_MD['COCO_VAL'] = """
135
- ## COCO Caption Results
136
-
137
- - By default, we evaluate COCO Caption Validation set (5000 samples), and report the following metrics: BLEU-1, BLEU-4, CIDEr, ROUGE-L (default sorted by CIDEr).
138
- - We use the following prompt to evaluate all VLMs: `Please describe this image in general. Directly provide the description, do not include prefix like "This image depicts". `
139
- - **No specific prompt is adopted for all VLMs.**
140
- """
141
-
142
- LEADERBOARD_MD['ScienceQA_VAL'] = """
143
- ## ScienceQA Evaluation Results
144
-
145
- - We benchmark the **image** subset of ScienceQA validation and test set, and report the Top-1 accuracy.
146
- - During evaluation, we use `GPT-3.5-Turbo-0613` as the choice extractor for all VLMs if the choice can not be extracted via heuristic matching. **Zero-shot** inference is adopted.
147
- """
148
-
149
- LEADERBOARD_MD['ScienceQA_TEST'] = LEADERBOARD_MD['ScienceQA_VAL']
150
-
151
- LEADERBOARD_MD['OCRBench'] = """
152
- ## OCRBench Evaluation Results
153
-
154
- - The evaluation of OCRBench is implemented by the official team: https://github.com/Yuliang-Liu/MultimodalOCR.
155
- - The performance of GPT4V might be underestimated: GPT4V rejects to answer 12 percent of the questions due to the policy of OpenAI. For those questions, the returned answer is "Your input image may contain content that is not allowed by our safety system."
156
- """
157
-
158
- LEADERBOARD_MD['MMStar'] = """
159
- ## MMStar Evaluation Results
160
-
161
- - MMStar is an elite vision-indispensable multi-modal benchmark, including 1,500 challenging samples meticulously selected by humans.
162
- - During the evaluation of MMStar, we find that some API models may reject to answer some of the questions. Currently, we treat such cases as wrong answers when reporting the results.
163
- """
164
-
165
- LEADERBOARD_MD['RealWorldQA'] = """
166
- ## RealWorldQA Evaluation Results
167
-
168
- - RealWorldQA is a benchmark designed to evaluate the real-world spatial understanding capabilities of multimodal AI models, contributed by XAI. It assesses how well these models comprehend physical environments. The benchmark consists of 700+ images, each accompanied by a question and a verifiable answer. These images are drawn from real-world scenarios, including those captured from vehicles. The goal is to advance AI models' understanding of our physical world.
169
- """
170
-
171
- LEADERBOARD_MD['TextVQA_VAL'] = """
172
- ## TextVQA Evaluation Results
173
-
174
- - TextVQA is a dataset to benchmark visual reasoning based on text in images. TextVQA requires models to read and reason about text in images to answer questions about them. Specifically, models need to incorporate a new modality of text present in the images and reason over it to answer TextVQA questions.
175
- - Note that some models may not be able to generate standardized responses based on the prompt. We currently do not have reports for these models.
176
- """
177
-
178
- LEADERBOARD_MD['ChartQA_TEST'] = """
179
- ## ChartQA Evaluation Results
180
-
181
- - ChartQA is a benchmark for question answering about charts with visual and logical reasoning.
182
- - Note that some models may not be able to generate standardized responses based on the prompt. We currently do not have reports for these models.
183
- """
184
-
185
- LEADERBOARD_MD['OCRVQA_TESTCORE'] = """
186
- ## OCRVQA Evaluation Results
187
-
188
- - OCRVQA is a benchmark for visual question answering by reading text in images. It presents a large-scale dataset, OCR-VQA-200K, comprising over 200,000 images of book covers. The study combines techniques from the Optical Character Recognition (OCR) and Visual Question Answering (VQA) domains to address the challenges associated with this new task and dataset.
189
- - Note that some models may not be able to generate standardized responses based on the prompt. We currently do not have reports for these models.
190
- """
191
-
192
- LEADERBOARD_MD['POPE'] = """
193
- ## POPE Evaluation Results
194
-
195
- - POPE is a benchmark for object hallucination evaluation. It includes three tracks of object hallucination: random, popular, and adversarial.
196
- - Note that the official POPE dataset contains approximately 8910 cases. POPE includes three tracks, and there are some overlapping samples among the three tracks. To reduce the data file size, we have kept only a single copy of the overlapping samples (about 5127 examples). However, the final accuracy will be calculated on the ~9k samples.
197
- - Some API models, due to safety policies, refuse to answer certain questions, so their actual capabilities may be higher than the reported scores.
198
- - We report the average F1 score across the three types of data as the overall score. Accuracy, precision, and recall are also shown in the table. F1 score = 2 * (precision * recall) / (precision + recall).
199
- """
200
-
201
- LEADERBOARD_MD['SEEDBench2_Plus'] = """
202
- ## SEEDBench2 Plus Evaluation Results
203
-
204
- - SEEDBench2 Plus comprises 2.3K multiple-choice questions with precise human annotations, spanning three broad categories: Charts, Maps, and Webs, each of which covers a wide spectrum of textrich scenarios in the real world.
205
- """
206
-
207
- LEADERBOARD_MD['MMT-Bench_VAL'] = """
208
- ## MMT-Bench Validation Evaluation Results
209
-
210
- - MMT-Bench comprises 31,325 meticulously curated multi-choice visual questions from various multimodal scenarios such as vehicle driving and embodied navigation, covering 32 core meta-tasks and 162 subtasks in multimodal understanding.
211
- - MMT-Bench_VAL is the validation set of MMT-Bench. MMT-Bench_ALL includes both validation and test sets. The suffix `MI`, such as `MMT-Bench_VAL_MI`, represents the multi-image version of the dataset with several images input.
212
- The defualt version is the single-image version, which concats the multiple images into a single image as input.
213
- """
214
-
215
- LEADERBOARD_MD['SEEDBench2'] = """
216
- ## SEEDBench2 Evaluation Results
217
-
218
- - SEEDBench2 comprises 24K multiple-choice questions with accurate human annotations, which spans 27 dimensions, including the evaluation of both text and image generation.
219
- - Note that we only evaluate and report the part of model's results on the SEEDBench2.
220
- """
221
-
222
- LEADERBOARD_MD['BLINK'] = """
223
- ## BLINK Test Evaluation Results
224
-
225
- - BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans “within a blink”, but pose significant challenges for current multimodal large language models (LLMs).
226
- - We evaluate BLINK on the test set of the benchmark, which contains 1901 visual questions in multi-choice format.
227
- """
 
1
  # CONSTANTS-URL
2
+ URL = "http://opencompass.openxlab.space/assets/mmbench/mmbench-data.json"
3
+ MMBench_README = 'https://raw.githubusercontent.com/open-compass/MMBench/main/README.md'
4
  # CONSTANTS-CITATION
5
+ CITATION_BUTTON_TEXT = r"""@article{MMBench,
6
+ author = {Yuan Liu, Haodong Duan, Yuanhan Zhang, Bo Li, Songyang Zhang, Wangbo Zhao, Yike Yuan, Jiaqi Wang, Conghui He, Ziwei Liu, Kai Chen, Dahua Lin},
7
+ journal = {arXiv:2307.06281},
8
+ title = {MMBench: Is Your Multi-modal Model an All-around Player?},
9
+ year = {2023},
10
  }"""
11
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
12
  # CONSTANTS-TEXT
13
+ LEADERBORAD_INTRODUCTION = """# MMBench Leaderboard
14
+ ### Welcome to the MMBench Leaderboard! On this leaderboard we share the evaluation results of VLMs on MMBench, MMBench v1.1, and CCBench.
 
 
15
 
16
+ To add your own model to the leaderboard, please create a PR in [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) to support your VLM or directly send us the inference results of your VLM, obtained with VLMEvalKit.
17
+ For any questions or concerns, please feel free to contact us at [opencompass, duanhaodong]@pjlab.org.cn.
 
18
  """
19
+
20
  # CONSTANTS-FIELDS
21
+ META_FIELDS = ['Method', 'Param (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified', 'Org']
 
 
 
 
 
 
 
 
 
 
 
22
  MMBENCH_FIELDS = ['MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11', 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench']
23
  MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
24
+ MODEL_TYPE = ['Public', 'Private', 'Verified']