yinanhe committed
Commit 791df9f
1 Parent(s): 63bede2
Files changed (4)
  1. app.py +240 -0
  2. constants.py +38 -0
  3. requirements.txt +3 -0
  4. result.csv +18 -0
app.py ADDED
@@ -0,0 +1,240 @@
+ __all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']
+
+ import gradio as gr
+ import pandas as pd
+ import json
+ import tempfile
+
+ from constants import *
+
+ global data_component, filter_component
+
+
+ def upload_file(files):
+     file_paths = [file.name for file in files]
+     return file_paths
+
+ # Append a new row (or update an existing revision's row) in the results CSV
+ # from an uploaded JSON of per-task scores.
+ def add_new_eval(
+     input_file,
+     model_name_textbox: str,
+     revision_name_textbox: str,
+     model_type: str,
+     model_link: str,
+     model_size: str,
+     LLM_type: str,
+     LLM_name_textbox: str,
+ ):
+     if input_file is None:
+         return "Error! Empty file!"
+
+     upload_data = json.loads(input_file)
+     csv_data = pd.read_csv(CSV_DIR)
+
+     if LLM_type == 'Other':
+         LLM_name = LLM_name_textbox
+     else:
+         LLM_name = LLM_type
+
+     if revision_name_textbox == '':
+         col = csv_data.shape[0]
+         model_name = model_name_textbox
+     else:
+         model_name = revision_name_textbox
+         model_name_list = csv_data['Model']
+         # strip the markdown link syntax to recover the bare model names
+         name_list = [name.split(']')[0][1:] for name in model_name_list]
+         if revision_name_textbox not in name_list:
+             col = csv_data.shape[0]
+         else:
+             col = name_list.index(revision_name_textbox)
+
+     # wrap the name in a markdown link when a URL is given
+     if model_link != '':
+         model_name = '[' + model_name + '](' + model_link + ')'
+
+     # add new data
+     new_data = [
+         model_type,
+         model_name,
+         LLM_name
+     ]
+     for key in TASK_INFO:
+         if key in upload_data:
+             new_data.append(upload_data[key])
+         else:
+             new_data.append(0)
+     csv_data.loc[col] = new_data
+     csv_data.to_csv(CSV_DIR, index=False)  # to_csv returns None, so don't rebind csv_data
+     return 0
+
+ def get_baseline_df():
+     df = pd.read_csv(CSV_DIR)
+     df = df.sort_values(by="Avg", ascending=False)
+     present_columns = MODEL_INFO + checkbox_group.value
+     df = df[present_columns]
+     return df
+
+ def get_all_df():
+     df = pd.read_csv(CSV_DIR)
+     df = df.sort_values(by="Avg", ascending=False)
+     return df
+
+
+ block = gr.Blocks()
+
+
+ with block:
+     gr.Markdown(
+         LEADERBORAD_INTRODUCTION
+     )
+     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem("📊 MVBench", elem_id="mvbench-tab-table", id=1):
+             with gr.Row():
+                 with gr.Accordion("Citation", open=False):
+                     citation_button = gr.Textbox(
+                         value=CITATION_BUTTON_TEXT,
+                         label=CITATION_BUTTON_LABEL,
+                         elem_id="citation-button",
+                     ).style(show_copy_button=True)
+
+             gr.Markdown(
+                 TABLE_INTRODUCTION
+             )
+
+             # selection for column part:
+             checkbox_group = gr.CheckboxGroup(
+                 choices=TASK_INFO,
+                 value=AVG_INFO,
+                 label="Evaluation Dimension",
+                 interactive=True,
+             )
+
+             # create the dataframe component
+             data_component = gr.components.Dataframe(
+                 value=get_baseline_df,
+                 headers=COLUMN_NAMES,
+                 type="pandas",
+                 datatype=DATA_TITILE_TYPE,
+                 interactive=False,
+                 visible=True,
+             )
+
+             def on_filter_model_size_method_change(selected_columns):
+                 updated_data = get_all_df()
+
+                 # keep the selected columns in TASK_INFO order
+                 selected_columns = [item for item in TASK_INFO if item in selected_columns]
+                 present_columns = MODEL_INFO + selected_columns
+                 updated_data = updated_data[present_columns]
+                 updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
+                 updated_headers = present_columns
+                 update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]
+
+                 filter_component = gr.components.Dataframe(
+                     value=updated_data,
+                     headers=updated_headers,
+                     type="pandas",
+                     datatype=update_datatype,
+                     interactive=False,
+                     visible=True,
+                 )
+
+                 return filter_component.value
+
+             checkbox_group.change(fn=on_filter_model_size_method_change, inputs=[checkbox_group], outputs=data_component)
+
+         # table 2
+         with gr.TabItem("📝 About", elem_id="mvbench-tab-table", id=2):
+             gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")
+
+         # table 3
+         with gr.TabItem("🚀 Submit here! ", elem_id="mvbench-tab-table", id=3):
+             gr.Markdown(LEADERBORAD_INTRODUCTION, elem_classes="markdown-text")
+
+             with gr.Row():
+                 gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
+
+             with gr.Row():
+                 gr.Markdown("# ✉️✨ Submit your model evaluation json file here!", elem_classes="markdown-text")
+
+             with gr.Row():
+                 with gr.Column():
+                     model_name_textbox = gr.Textbox(
+                         label="Model name", placeholder="LLaMA-7B"
+                     )
+                     revision_name_textbox = gr.Textbox(
+                         label="Revision Model Name", placeholder="LLaMA-7B"
+                     )
+                     model_type = gr.Dropdown(
+                         choices=[
+                             "LLM",
+                             "ImageLLM",
+                             "VideoLLM",
+                             "Other",
+                         ],
+                         label="Model type",
+                         multiselect=False,
+                         value="ImageLLM",
+                         interactive=True,
+                     )
+
+                 with gr.Column():
+                     LLM_type = gr.Dropdown(
+                         choices=["Vicuna-7B", "Flan-T5-XL", "LLaMA-7B", "InternLM-7B", "Other"],
+                         label="LLM type",
+                         multiselect=False,
+                         value="LLaMA-7B",
+                         interactive=True,
+                     )
+                     LLM_name_textbox = gr.Textbox(
+                         label="LLM model (for Other)",
+                         placeholder="LLaMA-13B"
+                     )
+                     model_link = gr.Textbox(
+                         label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
+                     )
+                     model_size = gr.Textbox(
+                         label="Model size", placeholder="7B(Input content format must be 'number+B' or '-')"
+                     )
+
+                 with gr.Column():
+                     input_file = gr.inputs.File(label="Click to Upload a json File", file_count="single", type='binary')
+                     submit_button = gr.Button("Submit Eval")
+
+                     submission_result = gr.Markdown()
+                     submit_button.click(
+                         add_new_eval,
+                         inputs=[
+                             input_file,
+                             model_name_textbox,
+                             revision_name_textbox,
+                             model_type,
+                             model_link,
+                             model_size,
+                             LLM_type,
+                             LLM_name_textbox,
+                         ],
+                     )
+
+     def refresh_data():
+         value1 = get_baseline_df()
+         return value1
+
+     with gr.Row():
+         data_run = gr.Button("Refresh")
+         data_run.click(
+             refresh_data, outputs=[data_component]
+         )
+
+     # block.load(get_baseline_df, outputs=data_title)
+
+ block.launch(server_name='0.0.0.0', server_port=10036)
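
For context, `add_new_eval` above treats the uploaded file as a JSON object keyed by `TASK_INFO` names, with any missing task scored 0. A minimal sketch of what such a submission file could look like, with placeholder scores rather than real results:

```python
# Hypothetical submission file for the Space above; the scores are
# placeholders. Keys must match entries in TASK_INFO from constants.py.
import json

example_submission = {
    "Avg": 30.0,             # placeholder overall score
    "Action Antonym": 35.0,  # placeholder per-task accuracy
    "Action Count": 25.0,
    # ... remaining TASK_INFO keys; tasks omitted here default to 0
}

with open("submission.json", "w") as f:
    json.dump(example_submission, f, indent=2)
```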
constants.py ADDED
@@ -0,0 +1,38 @@
+ # This .py file stores constants for the leaderboard.
+ MODEL_INFO = ["Type", "Model", "Language Model"]
+ TASK_INFO = ["Avg", "Action Antonym", "Action Count", "Action Localization", "Action Prediction", "Action Sequence", "Character Count", "Counterfactual Inference", "Egocentric Navigation", "Episodic Reasoning", "Fine grained Action", "Fine grained Pose", "Moving Attribute", "Moving Count", "Moving Direction", "Object Existence", "Object Interaction", "Object Shuffle", "Scene Transition", "State Change", "Unexpected Action"]
+
+ AVG_INFO = ["Avg"]
+
+ DATA_TITILE_TYPE = ['markdown', 'markdown', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
+
+ CSV_DIR = "./result.csv"
+
+ COLUMN_NAMES = MODEL_INFO + TASK_INFO
+
+
+ LEADERBORAD_INTRODUCTION = """# MVBench Leaderboard
+
+ Welcome to the leaderboard of MVBench! 🏆
+
+ We introduce a comprehensive Multi-modal Video understanding Benchmark, namely MVBench, which covers 20 challenging video tasks that cannot be effectively solved with a single frame. Specifically, we first introduce a novel static-to-dynamic method to define these temporal-related tasks. By transforming various static tasks into dynamic ones, we enable the systematic generation of video tasks that require a broad spectrum of temporal skills, ranging from perception to cognition. Then, guided by the task definition, we automatically convert public video annotations into multiple-choice QA to evaluate each task. On one hand, such a distinct paradigm allows us to build MVBench efficiently, without much manual intervention. On the other hand, it guarantees evaluation fairness with ground-truth video annotations, avoiding the biased scoring of LLMs.
+ """
+
+ SUBMIT_INTRODUCTION = """# Submitting to the MVBench Benchmark
+ """
+
+ TABLE_INTRODUCTION = """
+ """
+
+ LEADERBORAD_INFO = """
+ With the rapid development of Multi-modal Large Language Models (MLLMs), a number of diagnostic benchmarks have recently emerged to evaluate the comprehension capabilities of these models. However, most benchmarks predominantly assess spatial understanding in static image tasks, while overlooking temporal understanding in dynamic video tasks. To alleviate this issue, we introduce a comprehensive Multi-modal Video understanding Benchmark, namely MVBench, which covers 20 challenging video tasks that cannot be effectively solved with a single frame. Specifically, we first introduce a novel static-to-dynamic method to define these temporal-related tasks. By transforming various static tasks into dynamic ones, we enable the systematic generation of video tasks that require a broad spectrum of temporal skills, ranging from perception to cognition. Then, guided by the task definition, we automatically convert public video annotations into multiple-choice QA to evaluate each task. On one hand, such a distinct paradigm allows us to build MVBench efficiently, without much manual intervention. On the other hand, it guarantees evaluation fairness with ground-truth video annotations, avoiding the biased scoring of LLMs. Moreover, we further develop a robust video MLLM baseline, i.e., VideoChat2, by progressive multi-modal training with diverse instruction-tuning data. The extensive results on our MVBench reveal that the existing MLLMs are far from satisfactory in temporal understanding, while our VideoChat2 largely surpasses these leading models by over 15% on MVBench. All models and data are available at this https URL.
+ """
+
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+ CITATION_BUTTON_TEXT = r"""@article{li2023mvbench,
+   title={MVBench: A Comprehensive Multi-modal Video Understanding Benchmark},
+   author={Li, Kunchang and Wang, Yali and He, Yinan and Li, Yizhuo and Wang, Yi and Liu, Yi and Wang, Zun and Xu, Jilan and Chen, Guo and Luo, Ping and others},
+   journal={arXiv preprint arXiv:2311.17005},
+   year={2023}
+ }"""
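
Since the `gr.components.Dataframe` in `app.py` pairs `COLUMN_NAMES` with `DATA_TITILE_TYPE` positionally, the two lists must stay the same length. A small consistency check, as a sketch to run alongside `constants.py`:

```python
# Sanity-check the constants above: 3 markdown columns (Type, Model,
# Language Model) plus 21 numeric task columns = 24 entries in each list.
from constants import COLUMN_NAMES, DATA_TITILE_TYPE, MODEL_INFO, TASK_INFO

assert len(MODEL_INFO) == 3
assert len(TASK_INFO) == 21
assert len(COLUMN_NAMES) == len(DATA_TITILE_TYPE) == 24
print("constants are consistent")
```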
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ gradio
+ gradio_client
+ pandas
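
With these three unpinned dependencies, `pip install -r requirements.txt` followed by `python app.py` should be enough to run the Space locally. Note that `app.py` calls `gr.inputs.File` and `.style(...)`, both of which were removed in Gradio 4.0, so a pre-4.0 Gradio release is assumed here.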
result.csv ADDED
@@ -0,0 +1,18 @@
+ Type,Model,Language Model,Avg,Action Antonym,Action Count,Action Localization,Action Prediction,Action Sequence,Character Count,Counterfactual Inference,Egocentric Navigation,Episodic Reasoning,Fine grained Action,Fine grained Pose,Moving Attribute,Moving Count,Moving Direction,Object Existence,Object Interaction,Object Shuffle,Scene Transition,State Change,Unexpected Action
+ LLM,Random,NOLLM,28.0,33.3,33.3,25.0,25.0,25.0,33.3,30.9,25.0,20.0,25.0,25.0,33.3,25.0,25.0,33.3,25.0,33.3,25.0,33.3,25.0
+ ImageLLM,mPLUG-Owl-I,LLaMA-7B,29.4,44.5,34.5,24.0,20.0,25.0,37.0,37.0,25.5,21.0,27.0,24.0,31.5,22.0,23.0,36.0,24.0,34.0,34.5,40.0,23.5
+ ImageLLM,LLaMA-Adapter,LLaMA-7B,31.7,51.0,29.0,21.5,28.0,23.0,31.5,32.0,22.5,28.0,30.0,25.0,41.5,22.5,25.5,53.5,32.5,33.5,30.5,39.5,33.0
+ ImageLLM,BLIP2,FlanT5-XL,31.4,33.5,25.5,26.0,29.0,24.5,30.0,31.0,26.0,37.0,17.0,27.0,40.0,30.0,25.5,51.5,26.0,31.0,32.5,42.0,42.0
+ ImageLLM,Otter-I,MPT-7B,33.5,39.5,20.0,25.5,32.0,34.5,27.0,36.5,32.0,29.0,30.5,28.0,28.5,32.5,19.0,48.5,44.0,29.5,55.0,39.0,38.5
+ ImageLLM,MiniGPT-4,Vicuna-7B,18.8,26.0,32.5,12.0,18.0,16.0,29.5,3.0,19.0,9.9,21.5,26.0,8.0,15.5,11.5,29.5,25.5,13.0,9.5,34.0,16.0
+ ImageLLM,InstructBLIP,Vicuna-7B,32.5,46.0,42.5,23.0,16.5,20.0,30.0,38.0,25.5,30.5,24.5,25.5,40.5,26.5,22.0,51.0,26.0,37.5,46.5,32.0,46.0
+ ImageLLM,LLaVA,Vicuna-7B,36.0,63.0,34.0,20.5,39.5,28.0,36.0,42.0,27.0,26.5,30.5,25.0,38.5,20.5,23.0,53.0,41.0,41.5,45.0,47.0,39.0
+ VideoLLM,Otter-V,LLaMA-7B,26.8,27.5,26.0,23.5,23.0,23.0,22.0,19.5,23.5,19.0,27.0,22.0,18.0,28.5,24.5,53.0,28.0,33.0,27.5,38.5,29.5
+ VideoLLM,mPLUG-Owl-V,LLaMA-7B,29.7,34.0,31.5,23.0,28.0,22.0,31.0,29.5,26.0,20.5,29.0,24.0,40.0,27.0,27.0,40.5,27.0,31.5,29.0,44.0,29.0
+ VideoLLM,VideoChatGPT,Vicuna-7B,32.7,62.0,30.5,20.0,26.0,23.5,33.0,35.5,29.5,26.0,22.5,29.0,39.5,25.5,23.0,54.0,28.0,40.0,31.0,48.5,26.5
+ VideoLLM,VideoLLaMA,Vicuna-7B,34.1,51.0,34.0,22.5,25.5,27.5,40.0,37.0,30.0,21.0,29.0,32.5,32.5,22.5,22.5,48.0,40.5,38.0,43.0,45.5,39.0
+ VideoLLM,VideoChat,Vicuna-7B,35.5,56.0,35.0,27.0,26.5,33.5,41.0,36.0,23.5,23.5,33.5,26.5,42.5,20.5,25.5,53.0,40.5,30.0,48.5,46.0,40.5
+ VideoLLM,VideoChat2_text,Vicuna-7B,34.7,49.5,41.5,27.0,27.0,24.5,36.0,40.0,33.0,32.0,27.0,26.5,32.5,27.5,25.5,53.0,28.0,40.0,38.5,46.5,38.0
+ VideoLLM,VideoChat2,Vicuna-7B,51.1,83.5,39.0,23.0,47.5,66.0,36.5,65.5,35.0,40.5,49.5,49.0,58.5,42.0,23.0,58.0,71.5,42.5,88.5,44.0,60.0
+ VideoLLM,GPT-4V,GPT-4,43.7,72.0,39.0,40.5,63.5,55.5,52.0,11.0,31.0,59.0,46.5,47.5,22.5,12.0,12.0,18.5,59.0,29.5,83.5,45.0,73.5
+ ImageLLM,GiminiPro,Gimini,37.7,43.7,3.9,40.0,41.8,35.4,38.7,33.7,36.4,36.4,36.2,26.5,41.5,18.0,16.5,43.5,37.5,39.8,75.4,42.3,67.1