yinanhe committed
Commit 791df9f
1 Parent(s): 63bede2
Files changed (4)
  1. app.py +240 -0
  2. constants.py +38 -0
  3. requirements.txt +3 -0
  4. result.csv +18 -0
app.py ADDED
@@ -0,0 +1,240 @@
+ __all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']
+
+ import gradio as gr
+ import pandas as pd
+ import json
+ import tempfile
+
+ from constants import *
+
+ global data_component, filter_component
+
+
+ def upload_file(files):
+     file_paths = [file.name for file in files]
+     return file_paths
+
+ # Append a new row (or update an existing revision's row) in the results CSV
+ # from an uploaded JSON of per-task scores.
+ def add_new_eval(
+     input_file,
+     model_name_textbox: str,
+     revision_name_textbox: str,
+     model_type: str,
+     model_link: str,
+     model_size: str,
+     LLM_type: str,
+     LLM_name_textbox: str,
+ ):
+     if input_file is None:
+         return "Error! Empty file!"
+
+     upload_data = json.loads(input_file)
+     csv_data = pd.read_csv(CSV_DIR)
+
+     if LLM_type == 'Other':
+         LLM_name = LLM_name_textbox
+     else:
+         LLM_name = LLM_type
+
+     if revision_name_textbox == '':
+         col = csv_data.shape[0]
+         model_name = model_name_textbox
+     else:
+         model_name = revision_name_textbox
+         model_name_list = csv_data['Model']
+         # strip the markdown link syntax to recover the bare model names
+         name_list = [name.split(']')[0][1:] for name in model_name_list]
+         if revision_name_textbox not in name_list:
+             col = csv_data.shape[0]
+         else:
+             col = name_list.index(revision_name_textbox)
+
+     # wrap the name in a markdown link when a URL is given
+     if model_link != '':
+         model_name = '[' + model_name + '](' + model_link + ')'
+
+     # add new data
+     new_data = [
+         model_type,
+         model_name,
+         LLM_name
+     ]
+     for key in TASK_INFO:
+         if key in upload_data:
+             new_data.append(upload_data[key])
+         else:
+             new_data.append(0)
+     csv_data.loc[col] = new_data
+     csv_data.to_csv(CSV_DIR, index=False)  # to_csv returns None, so don't rebind csv_data
+     return 0
+
+ def get_baseline_df():
+     df = pd.read_csv(CSV_DIR)
+     df = df.sort_values(by="Avg", ascending=False)
+     present_columns = MODEL_INFO + checkbox_group.value
+     df = df[present_columns]
+     return df
+
+ def get_all_df():
+     df = pd.read_csv(CSV_DIR)
+     df = df.sort_values(by="Avg", ascending=False)
+     return df
+
+
+ block = gr.Blocks()
+
+
+ with block:
+     gr.Markdown(
+         LEADERBORAD_INTRODUCTION
+     )
+     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem("📊 MVBench", elem_id="mvbench-tab-table", id=1):
+             with gr.Row():
+                 with gr.Accordion("Citation", open=False):
+                     citation_button = gr.Textbox(
+                         value=CITATION_BUTTON_TEXT,
+                         label=CITATION_BUTTON_LABEL,
+                         elem_id="citation-button",
+                     ).style(show_copy_button=True)
+
+             gr.Markdown(
+                 TABLE_INTRODUCTION
+             )
+
+             # selection for column part:
+             checkbox_group = gr.CheckboxGroup(
+                 choices=TASK_INFO,
+                 value=AVG_INFO,
+                 label="Evaluation Dimension",
+                 interactive=True,
+             )
+
+             # create the dataframe component
+             data_component = gr.components.Dataframe(
+                 value=get_baseline_df,
+                 headers=COLUMN_NAMES,
+                 type="pandas",
+                 datatype=DATA_TITILE_TYPE,
+                 interactive=False,
+                 visible=True,
+             )
+
+             def on_filter_model_size_method_change(selected_columns):
+                 updated_data = get_all_df()
+
+                 # keep the selected columns in TASK_INFO order
+                 selected_columns = [item for item in TASK_INFO if item in selected_columns]
+                 present_columns = MODEL_INFO + selected_columns
+                 updated_data = updated_data[present_columns]
+                 updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
+                 updated_headers = present_columns
+                 update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]
+
+                 filter_component = gr.components.Dataframe(
+                     value=updated_data,
+                     headers=updated_headers,
+                     type="pandas",
+                     datatype=update_datatype,
+                     interactive=False,
+                     visible=True,
+                 )
+
+                 return filter_component.value
+
+             checkbox_group.change(fn=on_filter_model_size_method_change, inputs=[checkbox_group], outputs=data_component)
+
+         # table 2
+         with gr.TabItem("📝 About", elem_id="mvbench-tab-table", id=2):
+             gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")
+
+         # table 3
+         with gr.TabItem("🚀 Submit here! ", elem_id="mvbench-tab-table", id=3):
+             gr.Markdown(LEADERBORAD_INTRODUCTION, elem_classes="markdown-text")
+
+             with gr.Row():
+                 gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
+
+             with gr.Row():
+                 gr.Markdown("# ✉️✨ Submit your model evaluation json file here!", elem_classes="markdown-text")
+
+             with gr.Row():
+                 with gr.Column():
+                     model_name_textbox = gr.Textbox(
+                         label="Model name", placeholder="LLaMA-7B"
+                     )
+                     revision_name_textbox = gr.Textbox(
+                         label="Revision Model Name", placeholder="LLaMA-7B"
+                     )
+                     model_type = gr.Dropdown(
+                         choices=[
+                             "LLM",
+                             "ImageLLM",
+                             "VideoLLM",
+                             "Other",
+                         ],
+                         label="Model type",
+                         multiselect=False,
+                         value="ImageLLM",
+                         interactive=True,
+                     )
+
+                 with gr.Column():
+                     LLM_type = gr.Dropdown(
+                         choices=["Vicuna-7B", "Flan-T5-XL", "LLaMA-7B", "InternLM-7B", "Other"],
+                         label="LLM type",
+                         multiselect=False,
+                         value="LLaMA-7B",
+                         interactive=True,
+                     )
+                     LLM_name_textbox = gr.Textbox(
+                         label="LLM model (for Other)",
+                         placeholder="LLaMA-13B"
+                     )
+                     model_link = gr.Textbox(
+                         label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
+                     )
+                     model_size = gr.Textbox(
+                         label="Model size", placeholder="7B(Input content format must be 'number+B' or '-')"
+                     )
+
+                 with gr.Column():
+                     input_file = gr.inputs.File(label="Click to Upload a json File", file_count="single", type='binary')
+                     submit_button = gr.Button("Submit Eval")
+
+                     submission_result = gr.Markdown()
+                     submit_button.click(
+                         add_new_eval,
+                         inputs=[
+                             input_file,
+                             model_name_textbox,
+                             revision_name_textbox,
+                             model_type,
+                             model_link,
+                             model_size,
+                             LLM_type,
+                             LLM_name_textbox,
+                         ],
+                     )
+
+     def refresh_data():
+         value1 = get_baseline_df()
+         return value1
+
+     with gr.Row():
+         data_run = gr.Button("Refresh")
+         data_run.click(
+             refresh_data, outputs=[data_component]
+         )
+
+     # block.load(get_baseline_df, outputs=data_title)
+
+ block.launch(server_name='0.0.0.0', server_port=10036)
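
For context, `add_new_eval` above treats the uploaded file as a JSON object keyed by `TASK_INFO` names, with any missing task scored 0. A minimal sketch of what such a submission file could look like, with placeholder scores rather than real results:

```python
# Hypothetical submission file for the Space above; the scores are
# placeholders. Keys must match entries in TASK_INFO from constants.py.
import json

example_submission = {
    "Avg": 30.0,             # placeholder overall score
    "Action Antonym": 35.0,  # placeholder per-task accuracy
    "Action Count": 25.0,
    # ... remaining TASK_INFO keys; tasks omitted here default to 0
}

with open("submission.json", "w") as f:
    json.dump(example_submission, f, indent=2)
```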
constants.py ADDED
@@ -0,0 +1,38 @@
+ # This .py file stores constants for the leaderboard.
+ MODEL_INFO = ["Type", "Model", "Language Model"]
+ TASK_INFO = ["Avg", "Action Antonym", "Action Count", "Action Localization", "Action Prediction", "Action Sequence", "Character Count", "Counterfactual Inference", "Egocentric Navigation", "Episodic Reasoning", "Fine grained Action", "Fine grained Pose", "Moving Attribute", "Moving Count", "Moving Direction", "Object Existence", "Object Interaction", "Object Shuffle", "Scene Transition", "State Change", "Unexpected Action"]
+
+ AVG_INFO = ["Avg"]
+
+ DATA_TITILE_TYPE = ['markdown', 'markdown', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
+
+ CSV_DIR = "./result.csv"
+
+ COLUMN_NAMES = MODEL_INFO + TASK_INFO
+
+
+ LEADERBORAD_INTRODUCTION = """# MVBench Leaderboard
+
+ Welcome to the leaderboard of MVBench! 🏆
+
+ We introduce a comprehensive Multi-modal Video understanding Benchmark, namely MVBench, which covers 20 challenging video tasks that cannot be effectively solved with a single frame. Specifically, we first introduce a novel static-to-dynamic method to define these temporal-related tasks. By transforming various static tasks into dynamic ones, we enable the systematic generation of video tasks that require a broad spectrum of temporal skills, ranging from perception to cognition. Then, guided by the task definition, we automatically convert public video annotations into multiple-choice QA to evaluate each task. On one hand, such a distinct paradigm allows us to build MVBench efficiently, without much manual intervention. On the other hand, it guarantees evaluation fairness with ground-truth video annotations, avoiding the biased scoring of LLMs.
+ """
+
+ SUBMIT_INTRODUCTION = """# Submitting to the MVBench Benchmark
+ """
+
+ TABLE_INTRODUCTION = """
+ """
+
+ LEADERBORAD_INFO = """
+ With the rapid development of Multi-modal Large Language Models (MLLMs), a number of diagnostic benchmarks have recently emerged to evaluate the comprehension capabilities of these models. However, most benchmarks predominantly assess spatial understanding in static image tasks, while overlooking temporal understanding in dynamic video tasks. To alleviate this issue, we introduce a comprehensive Multi-modal Video understanding Benchmark, namely MVBench, which covers 20 challenging video tasks that cannot be effectively solved with a single frame. Specifically, we first introduce a novel static-to-dynamic method to define these temporal-related tasks. By transforming various static tasks into dynamic ones, we enable the systematic generation of video tasks that require a broad spectrum of temporal skills, ranging from perception to cognition. Then, guided by the task definition, we automatically convert public video annotations into multiple-choice QA to evaluate each task. On one hand, such a distinct paradigm allows us to build MVBench efficiently, without much manual intervention. On the other hand, it guarantees evaluation fairness with ground-truth video annotations, avoiding the biased scoring of LLMs. Moreover, we further develop a robust video MLLM baseline, i.e., VideoChat2, by progressive multi-modal training with diverse instruction-tuning data. The extensive results on our MVBench reveal that the existing MLLMs are far from satisfactory in temporal understanding, while our VideoChat2 largely surpasses these leading models by over 15% on MVBench. All models and data are available at this https URL.
+ """
+
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+ CITATION_BUTTON_TEXT = r"""@article{li2023mvbench,
+   title={MVBench: A Comprehensive Multi-modal Video Understanding Benchmark},
+   author={Li, Kunchang and Wang, Yali and He, Yinan and Li, Yizhuo and Wang, Yi and Liu, Yi and Wang, Zun and Xu, Jilan and Chen, Guo and Luo, Ping and others},
+   journal={arXiv preprint arXiv:2311.17005},
+   year={2023}
+ }"""
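
Since the `gr.components.Dataframe` in `app.py` pairs `COLUMN_NAMES` with `DATA_TITILE_TYPE` positionally, the two lists must stay the same length. A small consistency check, as a sketch to run alongside `constants.py`:

```python
# Sanity-check the constants above: 3 markdown columns (Type, Model,
# Language Model) plus 21 numeric task columns = 24 entries in each list.
from constants import COLUMN_NAMES, DATA_TITILE_TYPE, MODEL_INFO, TASK_INFO

assert len(MODEL_INFO) == 3
assert len(TASK_INFO) == 21
assert len(COLUMN_NAMES) == len(DATA_TITILE_TYPE) == 24
print("constants are consistent")
```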
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ gradio
+ gradio_client
+ pandas
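
With these three unpinned dependencies, `pip install -r requirements.txt` followed by `python app.py` should be enough to run the Space locally. Note that `app.py` calls `gr.inputs.File` and `.style(...)`, both of which were removed in Gradio 4.0, so a pre-4.0 Gradio release is assumed here.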
result.csv ADDED
@@ -0,0 +1,18 @@
+ Type,Model,Language Model,Avg,Action Antonym,Action Count,Action Localization,Action Prediction,Action Sequence,Character Count,Counterfactual Inference,Egocentric Navigation,Episodic Reasoning,Fine grained Action,Fine grained Pose,Moving Attribute,Moving Count,Moving Direction,Object Existence,Object Interaction,Object Shuffle,Scene Transition,State Change,Unexpected Action
+ LLM,Random,NOLLM,28.0,33.3,33.3,25.0,25.0,25.0,33.3,30.9,25.0,20.0,25.0,25.0,33.3,25.0,25.0,33.3,25.0,33.3,25.0,33.3,25.0
+ ImageLLM,mPLUG-Owl-I,LLaMA-7B,29.4,44.5,34.5,24.0,20.0,25.0,37.0,37.0,25.5,21.0,27.0,24.0,31.5,22.0,23.0,36.0,24.0,34.0,34.5,40.0,23.5
+ ImageLLM,LLaMA-Adapter,LLaMA-7B,31.7,51.0,29.0,21.5,28.0,23.0,31.5,32.0,22.5,28.0,30.0,25.0,41.5,22.5,25.5,53.5,32.5,33.5,30.5,39.5,33.0
+ ImageLLM,BLIP2,FlanT5-XL,31.4,33.5,25.5,26.0,29.0,24.5,30.0,31.0,26.0,37.0,17.0,27.0,40.0,30.0,25.5,51.5,26.0,31.0,32.5,42.0,42.0
+ ImageLLM,Otter-I,MPT-7B,33.5,39.5,20.0,25.5,32.0,34.5,27.0,36.5,32.0,29.0,30.5,28.0,28.5,32.5,19.0,48.5,44.0,29.5,55.0,39.0,38.5
+ ImageLLM,MiniGPT-4,Vicuna-7B,18.8,26.0,32.5,12.0,18.0,16.0,29.5,3.0,19.0,9.9,21.5,26.0,8.0,15.5,11.5,29.5,25.5,13.0,9.5,34.0,16.0
+ ImageLLM,InstructBLIP,Vicuna-7B,32.5,46.0,42.5,23.0,16.5,20.0,30.0,38.0,25.5,30.5,24.5,25.5,40.5,26.5,22.0,51.0,26.0,37.5,46.5,32.0,46.0
+ ImageLLM,LLaVA,Vicuna-7B,36.0,63.0,34.0,20.5,39.5,28.0,36.0,42.0,27.0,26.5,30.5,25.0,38.5,20.5,23.0,53.0,41.0,41.5,45.0,47.0,39.0
+ VideoLLM,Otter-V,LLaMA-7B,26.8,27.5,26.0,23.5,23.0,23.0,22.0,19.5,23.5,19.0,27.0,22.0,18.0,28.5,24.5,53.0,28.0,33.0,27.5,38.5,29.5
+ VideoLLM,mPLUG-Owl-V,LLaMA-7B,29.7,34.0,31.5,23.0,28.0,22.0,31.0,29.5,26.0,20.5,29.0,24.0,40.0,27.0,27.0,40.5,27.0,31.5,29.0,44.0,29.0
+ VideoLLM,VideoChatGPT,Vicuna-7B,32.7,62.0,30.5,20.0,26.0,23.5,33.0,35.5,29.5,26.0,22.5,29.0,39.5,25.5,23.0,54.0,28.0,40.0,31.0,48.5,26.5
+ VideoLLM,VideoLLaMA,Vicuna-7B,34.1,51.0,34.0,22.5,25.5,27.5,40.0,37.0,30.0,21.0,29.0,32.5,32.5,22.5,22.5,48.0,40.5,38.0,43.0,45.5,39.0
+ VideoLLM,VideoChat,Vicuna-7B,35.5,56.0,35.0,27.0,26.5,33.5,41.0,36.0,23.5,23.5,33.5,26.5,42.5,20.5,25.5,53.0,40.5,30.0,48.5,46.0,40.5
+ VideoLLM,VideoChat2_text,Vicuna-7B,34.7,49.5,41.5,27.0,27.0,24.5,36.0,40.0,33.0,32.0,27.0,26.5,32.5,27.5,25.5,53.0,28.0,40.0,38.5,46.5,38.0
+ VideoLLM,VideoChat2,Vicuna-7B,51.1,83.5,39.0,23.0,47.5,66.0,36.5,65.5,35.0,40.5,49.5,49.0,58.5,42.0,23.0,58.0,71.5,42.5,88.5,44.0,60.0
+ VideoLLM,GPT-4V,GPT-4,43.7,72.0,39.0,40.5,63.5,55.5,52.0,11.0,31.0,59.0,46.5,47.5,22.5,12.0,12.0,18.5,59.0,29.5,83.5,45.0,73.5
+ ImageLLM,GiminiPro,Gimini,37.7,43.7,3.9,40.0,41.8,35.4,38.7,33.7,36.4,36.4,36.2,26.5,41.5,18.0,16.5,43.5,37.5,39.8,75.4,42.3,67.1