nebulae09 committed on
Commit
243897a
·
1 Parent(s): a542904

init leaderboard

Browse files
Files changed (3) hide show
  1. app.py +145 -0
  2. lb_info.py +194 -0
  3. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import abc
2
+ import gradio as gr
3
+ from lb_info import *
4
+
5
from types import SimpleNamespace


def _filter_table(table, essential, type_map, fields, model_size, model_type):
    """Build the DataFrame component for the current checkbox selections.

    Args:
        table: Full, unfiltered pandas DataFrame for one leaderboard tab.
        essential: Column names that are always displayed.
        type_map: Mapping from column name to Gradio datatype string.
        fields: Optional columns currently ticked in the tab's checkbox group.
        model_size: Size buckets currently ticked (see ``MODEL_SIZE``).
        model_type: Model types currently ticked (see ``MODEL_TYPE``).

    Returns:
        A non-interactive ``gr.components.DataFrame`` holding the rows that
        pass both filters, restricted to ``essential + fields``.
    """
    headers = essential + fields
    # Work on a copy so the table captured at build time is never mutated.
    df = cp.deepcopy(table)
    df['flag'] = [model_size_flag(x, model_size) for x in df['Parameters (B)']]
    df = df[df['flag']].drop(columns='flag')
    # The type filter only runs when the size filter left at least one row.
    if len(df):
        df['flag'] = [model_type_flag(row, model_type) for _, row in df.iterrows()]
        df = df[df['flag']].drop(columns='flag')

    return gr.components.DataFrame(
        value=df[headers],
        type="pandas",
        datatype=[type_map[x] for x in headers],
        interactive=False,
        visible=True)


with gr.Blocks() as demo:
    struct = load_results()
    timestamp = struct['time']
    EVAL_TIME = format_timestamp(timestamp)
    results = struct['results']
    N_MODEL = len(results)
    # Use 'Video-LLaVA' as the reference entry for the benchmark list when it
    # is present; otherwise fall back to the first model so the app still
    # starts if that entry is ever renamed or removed.
    reference = results.get('Video-LLaVA') or next(iter(results.values()))
    DATASETS = [name for name in reference if name != 'META']
    N_DATA = len(DATASETS)
    print(DATASETS)

    gr.Markdown(LEADERBORAD_INTRODUCTION.format(N_MODEL, N_DATA, EVAL_TIME))
    # One plain attribute container per benchmark tab; the widgets and tables
    # created for that tab are stored on it so the callback can find them.
    structs = [SimpleNamespace() for _ in range(N_DATA)]

    def filter_df_l2(dataset_name, fields, model_size, model_type):
        """Callback for a benchmark tab: re-filter that benchmark's table."""
        s = structs[DATASETS.index(dataset_name)]
        return _filter_table(
            s.table, s.check_box['essential'], s.type_map,
            fields, model_size, model_type)

    with gr.Tabs(elem_classes='tab-buttons') as tabs:
        with gr.TabItem('🏅 OpenVLM Video Leaderboard', elem_id='main', id=0):
            gr.Markdown(LEADERBOARD_MD['MAIN'])
            table, check_box = BUILD_L1_DF(results, MAIN_FIELDS)
            type_map = check_box['type_map']
            checkbox_group = gr.CheckboxGroup(
                choices=check_box['all'],
                value=check_box['required'],
                label="Evaluation Dimension",
                interactive=True,
            )
            headers = check_box['essential'] + checkbox_group.value
            with gr.Row():
                model_size = gr.CheckboxGroup(
                    choices=MODEL_SIZE,
                    value=MODEL_SIZE,
                    label='Model Size',
                    interactive=True
                )
                model_type = gr.CheckboxGroup(
                    choices=MODEL_TYPE,
                    value=MODEL_TYPE,
                    label='Model Type',
                    interactive=True
                )
            data_component = gr.components.DataFrame(
                value=table[headers],
                type="pandas",
                datatype=[type_map[x] for x in headers],
                interactive=False,
                visible=True)

            def filter_df(fields, model_size, model_type):
                """Callback for the main tab: re-filter the overview table."""
                return _filter_table(
                    table, check_box['essential'], type_map,
                    fields, model_size, model_type)

            # Any of the three checkbox groups changing refreshes the table.
            for cbox in [checkbox_group, model_size, model_type]:
                cbox.change(fn=filter_df, inputs=[checkbox_group, model_size, model_type], outputs=data_component)

        with gr.TabItem('🔍 About', elem_id='about', id=1):
            # Close the HTTP response as soon as the README text is read.
            with urlopen(VLMEVALKIT_README) as readme:
                gr.Markdown(readme.read().decode())

        for i, dataset in enumerate(DATASETS):
            with gr.TabItem(f'📊 {dataset} Leaderboard', elem_id=dataset, id=i + 2):
                if dataset in LEADERBOARD_MD:
                    gr.Markdown(LEADERBOARD_MD[dataset])

                s = structs[i]
                s.table, s.check_box = BUILD_L2_DF(results, dataset)
                s.type_map = s.check_box['type_map']
                s.checkbox_group = gr.CheckboxGroup(
                    choices=s.check_box['all'],
                    value=s.check_box['required'],
                    label=f"{dataset} CheckBoxes",
                    interactive=True,
                )
                s.headers = s.check_box['essential'] + s.checkbox_group.value
                with gr.Row():
                    s.model_size = gr.CheckboxGroup(
                        choices=MODEL_SIZE,
                        value=MODEL_SIZE,
                        label='Model Size',
                        interactive=True
                    )
                    s.model_type = gr.CheckboxGroup(
                        choices=MODEL_TYPE,
                        value=MODEL_TYPE,
                        label='Model Type',
                        interactive=True
                    )
                s.data_component = gr.components.DataFrame(
                    value=s.table[s.headers],
                    type="pandas",
                    datatype=[s.type_map[x] for x in s.headers],
                    interactive=False,
                    visible=True)
                # Hidden textbox whose value tells the shared callback which
                # benchmark tab triggered it.
                s.dataset = gr.Textbox(value=dataset, label=dataset, visible=False)

                for cbox in [s.checkbox_group, s.model_size, s.model_type]:
                    cbox.change(fn=filter_df_l2, inputs=[s.dataset, s.checkbox_group, s.model_size, s.model_type], outputs=s.data_component)

    with gr.Row():
        with gr.Accordion("Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id='citation-button')

if __name__ == '__main__':
    demo.launch(server_name='0.0.0.0')
lb_info.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pandas as pd
3
+ from collections import defaultdict
4
+ import gradio as gr
5
+ import copy as cp
6
+ import numpy as np
7
+
8
def listinstr(lst, s):
    """Return True when at least one element of ``lst`` occurs in ``s``.

    ``lst`` must be a ``list``; each element is tested with ``element in s``.
    An empty list yields False.
    """
    assert isinstance(lst, list)
    return any(needle in s for needle in lst)
14
+
15
+ # CONSTANTS-URL
16
+ # RESULT_FILE = '../video_leaderboard_result_final.json'
17
+ URL = "http://opencompass.openxlab.space/utils/video_leaderboard.json"
18
+ VLMEVALKIT_README = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/README.md'
19
+ # CONSTANTS-CITATION
20
+ CITATION_BUTTON_TEXT = r"""@misc{duan2024vlmevalkitopensourcetoolkitevaluating,
21
+ title={VLMEvalKit: An Open-Source Toolkit for Evaluating Large Multi-Modality Models},
22
+ author={Haodong Duan and Junming Yang and Yuxuan Qiao and Xinyu Fang and Lin Chen and Yuan Liu and Amit Agarwal and Zhe Chen and Mo Li and Yubo Ma and Hailong Sun and Xiangyu Zhao and Junbo Cui and Xiaoyi Dong and Yuhang Zang and Pan Zhang and Jiaqi Wang and Dahua Lin and Kai Chen},
23
+ year={2024},
24
+ eprint={2407.11691},
25
+ archivePrefix={arXiv},
26
+ primaryClass={cs.CV},
27
+ url={https://arxiv.org/abs/2407.11691},
28
+ }"""
29
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
30
+ # CONSTANTS-TEXT
31
+ LEADERBORAD_INTRODUCTION = """# OpenVLM Video Leaderboard
32
+ ### Welcome to the OpenVLM Video Leaderboard! On this leaderboard we share the evaluation results of VLMs on the video understanding benchmark obtained by the OpenSource Framework [**VLMEvalKit**](https://github.com/open-compass/VLMEvalKit) 🏆
33
+ ### Currently, OpenVLM Video Leaderboard covers {} different VLMs (including GPT-4o, Gemini-1.5, LLaVA-OneVision, etc.) and {} different video understanding benchmarks.
34
+
35
+ This leaderboard was last updated: {}.
36
+ """
37
+ # CONSTANTS-FIELDS
38
+ META_FIELDS = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified', 'Frames']
39
+ MAIN_FIELDS = ['MVBench', 'Video-MME (w/o subs)', 'MMBench-Video']
40
+ MODEL_SIZE = ['<10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
41
+ MODEL_TYPE = ['API', 'OpenSource']
42
+
43
+ # The README file for each benchmark
44
+ LEADERBOARD_MD = {}
45
+
46
+ LEADERBOARD_MD['MAIN'] = """
47
+ ## Main Evaluation Results
48
+
49
+ - Avg Score: The average score on all video understanding Benchmarks (normalized to 0 - 100, the higher the better).
50
+ - Avg Rank: The average rank on all video understanding Benchmarks (the lower the better).
51
+ - The overall evaluation results on 3 video understanding benchmarks, sorted by the ascending order of Avg Rank.
52
+ """
53
+
54
+ LEADERBOARD_MD['Video-MME (w/o subs)'] = """
55
+ ## Video-MME (w/o subs) Evaluation Results
56
+
57
+ - We give the total scores for the three video lengths (short, medium and long), as well as the total scores for each task type.
58
+ - Video-MME (w subs) will update as evaluation is completed.
59
+ """
60
+
61
+ # LEADERBOARD_MD['MVBench'] = """
62
+ # ## MVBench Evaluation Results
63
+ # """
64
+
65
+ # LEADERBOARD_MD['MMBench-Video'] = """
66
+ # ## MMBench-Video Evaluation Results
67
+ # """
68
+
69
+
70
+
71
+ from urllib.request import urlopen
72
+
73
+ # def load_results():
74
+ # with open(RESULT_FILE, 'r', encoding='utf-8') as file:
75
+ # data = json.load(file)
76
+ # return data
77
def load_results():
    """Download the leaderboard JSON from ``URL`` and return the parsed data.

    The response is used as a context manager so the underlying connection
    is closed as soon as the payload has been read.
    """
    with urlopen(URL) as response:
        return json.loads(response.read())
80
+
81
def nth_large(val, vals):
    """Return the 1-based rank of ``val`` among ``vals`` (largest is rank 1).

    The rank is one plus the number of entries strictly greater than
    ``val``, so tied values share the same rank.
    """
    rank = 1
    for other in vals:
        if other > val:
            rank += 1
    return rank
83
+
84
def format_timestamp(timestamp):
    """Render a 12-character digit string as ``'AA.BB.CC DD:EE:FF'``.

    The string is cut into six consecutive two-character pieces; the first
    three are joined with '.', the last three with ':', and the two halves
    are separated by a single space.
    """
    pieces = [timestamp[start:start + 2] for start in range(0, 12, 2)]
    date_part = '.'.join(pieces[:3])
    time_part = ':'.join(pieces[3:])
    return f'{date_part} {time_part}'
86
+
87
def model_size_flag(sz, FIELDS):
    """Tell whether a model of size ``sz`` falls in one of the selected buckets.

    Args:
        sz: Parameter count in billions. May be a string such as ``'7'``,
            ``'7B'`` or ``'7B (LLM)'``, a plain number, or a missing value
            (``None`` / NaN / ``'N/A'``).
        FIELDS: Selected bucket labels, a subset of
            ``'<10B'``, ``'10B-20B'``, ``'20B-40B'``, ``'>40B'``, ``'Unknown'``.

    Returns:
        True if the size matches a selected bucket. Missing sizes match only
        the ``'Unknown'`` bucket.

    Raises:
        ValueError: If ``sz`` is a string that cannot be parsed as a number.
    """
    if pd.isna(sz) or sz == 'N/A':
        return 'Unknown' in FIELDS
    # Only strings carry the 'B' / '(LLM)' decorations; numeric sizes are
    # accepted as-is instead of failing on a missing ``.replace`` method.
    if isinstance(sz, str):
        sz = sz.replace('B', '').replace('(LLM)', '')
    sz = float(sz)
    buckets = (
        ('<10B', sz < 10),
        ('10B-20B', 10 <= sz < 20),
        ('20B-40B', 20 <= sz < 40),
        # The label says '>40B' but exactly 40 is included in this bucket.
        ('>40B', sz >= 40),
    )
    return any(hit and label in FIELDS for label, hit in buckets)
103
+
104
def model_type_flag(line, FIELDS):
    """Tell whether the model described by ``line`` matches a selected type.

    ``line['OpenSource']`` equal to 'Yes' matches the 'OpenSource' type and
    'No' matches the 'API' type; any other value matches nothing.
    """
    expected_answer = (('OpenSource', 'Yes'), ('API', 'No'))
    return any(
        kind in FIELDS and line['OpenSource'] == answer
        for kind, answer in expected_answer
    )
110
+
111
def BUILD_L1_DF(results, fields):
    """Build the main (cross-benchmark) leaderboard table.

    Args:
        results: Mapping from model name to its record. Each record holds a
            'META' dict plus one dict per benchmark with an 'Overall' score.
        fields: Benchmark names to include as columns.

    Returns:
        ``(df, check_box)`` where ``df`` is a DataFrame with the columns in
        ``META_FIELDS``, one column per benchmark, plus 'Avg Score' and
        'Avg Rank', sorted by ascending 'Avg Rank'; and ``check_box`` is a
        dict with the keys 'essential', 'required', 'all' and 'type_map'
        describing how the UI should present the columns.
    """
    # Gather each benchmark's overall scores once; the ranking below needs
    # the same list for every model, so rebuilding it per model is wasted work.
    overall_by_field = {
        d: [entry[d]['Overall'] for entry in results.values()] for d in fields
    }

    res = defaultdict(list)
    for item in results.values():
        meta = item['META']
        for k in META_FIELDS:
            if k == 'Parameters (B)':
                param = meta['Parameters']
                # Kept as a string with the 'B' suffix stripped; an empty
                # value becomes None.
                res[k].append(param.replace('B', '') if param != '' else None)
            elif k == 'Method':
                name, url = meta['Method']
                res[k].append(f'<a href="{url}">{name}</a>')
            else:
                res[k].append(meta[k])

        scores, ranks = [], []
        for d in fields:
            overall = item[d]['Overall']
            res[d].append(overall)
            if d == 'MMBench-Video':
                # Rescaled so it averages with the other benchmarks
                # (presumably a 0-3 score mapped onto 0-100; confirm with
                # the benchmark definition).
                scores.append(overall / 3 * 100)
            else:
                scores.append(overall)
            ranks.append(nth_large(overall, overall_by_field[d]))
        res['Avg Score'].append(round(np.mean(scores), 1))
        res['Avg Rank'].append(round(np.mean(ranks), 2))

    df = pd.DataFrame(res)
    df = df.sort_values('Avg Rank')

    check_box = {}
    check_box['essential'] = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model', 'Frames']
    check_box['required'] = ['Avg Score', 'Avg Rank']
    check_box['all'] = check_box['required'] + ['OpenSource', 'Verified'] + fields
    # Any column not listed explicitly is presented as a number.
    type_map = defaultdict(lambda: 'number')
    type_map['Method'] = 'html'
    for column in ('Language Model', 'Vision Model', 'OpenSource', 'Verified', 'Frames'):
        type_map[column] = 'str'
    check_box['type_map'] = type_map
    return df, check_box
150
+
151
def BUILD_L2_DF(results, dataset):
    """Assemble the detailed table for a single benchmark.

    Args:
        results: Mapping from model name to its record ('META' plus one dict
            of scores per benchmark).
        dataset: Name of the benchmark to tabulate.

    Returns:
        ``(df, check_box)`` where ``df`` holds the ``META_FIELDS`` columns
        followed by every score field of ``dataset`` (taken from the first
        model's record), ordered by descending 'Overall'; and ``check_box``
        is a dict with the keys 'essential', 'required', 'all' and
        'type_map' describing how the UI should present the columns.
    """
    res = defaultdict(list)
    first_record = list(results.values())[0]
    fields = list(first_record[dataset].keys())
    non_overall_fields = [name for name in fields if 'Overall' not in name]
    overall_fields = [name for name in fields if 'Overall' in name]
    # Score columns are emitted with the non-overall fields first.
    score_fields = non_overall_fields + overall_fields

    for item in results.values():
        meta = item['META']
        for k in META_FIELDS:
            if k == 'Method':
                name, url = meta['Method']
                cell = f'<a href="{url}">{name}</a>'
            elif k == 'Parameters (B)':
                param = meta['Parameters']
                cell = None if param == '' else param.replace('B', '')
            else:
                cell = meta[k]
            res[k].append(cell)

        for d in score_fields:
            res[d].append(item[dataset][d])

    # Ascending sort followed by a reversal puts the best 'Overall' on top.
    df = pd.DataFrame(res).sort_values('Overall').iloc[::-1]

    if dataset == 'MMBench-Video':
        highlighted = ['Perception', 'Reasoning']
    elif 'Video-MME' in dataset:
        highlighted = ['short', 'medium', 'long']
    else:
        highlighted = []

    # Any column not listed explicitly is presented as a number.
    type_map = defaultdict(lambda: 'number')
    type_map['Method'] = 'html'
    for column in ('Language Model', 'Vision Model', 'OpenSource', 'Verified', 'Frames'):
        type_map[column] = 'str'

    check_box = {
        'essential': ['Method', 'Parameters (B)', 'Language Model', 'Vision Model', 'Frames'],
        'required': overall_fields + highlighted,
        'all': score_fields,
        'type_map': type_map,
    }
    return df, check_box
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ numpy>=1.23.4
2
+ pandas>=1.5.3
3
+ gradio==4.15.0