Spaces:

opencompass
/

Open_LMM_Reasoning_Leaderboard

Running

App Files Files Community

KennyUTC committed on Dec 17, 2024

Commit

ad8152e

1 Parent(s): 76e34e0

update leaderboard

Browse files

Files changed (4) hide show

README.md +0 -34
app.py +165 -192
gen_table.py +182 -0
meta_data.py +26 -0

README.md CHANGED Viewed

@@ -9,37 +9,3 @@ pinned: true
 license: apache-2.0
 short_description: A Leaderboard that demonstrates LMM reasoning capabilities
 ---
-# Start the configuration
-Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
-Results files should have the following format and be stored as json files:
-```json
-{
-    "config": {
-        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
-        "model_name": "path of the model on the hub: org/model",
-        "model_sha": "revision on the hub",
-    },
-    "results": {
-        "task_name": {
-            "metric_name": score,
-        },
-        "task_name2": {
-            "metric_name": score,
-        }
-    }
-}
-```
-Request files are created automatically by this tool.
-If you encounter problem on the space, don't hesitate to restart it to remove the create eval-queue, eval-queue-bk, eval-results and eval-results-bk created folder.
-# Code logic for more complex edits
-You'll find
-- the main table' columns names and properties in `src/display/utils.py`
-- the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
-- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`

 license: apache-2.0
 short_description: A Leaderboard that demonstrates LMM reasoning capabilities
 ---

app.py CHANGED Viewed

@@ -1,204 +1,177 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
-import pandas as pd
-from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
-from src.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    TITLE,
-)
-from src.display.css_html_js import custom_css
-from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision
-)
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
-def restart_space():
-    API.restart_space(repo_id=REPO_ID)
-### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-def init_leaderboard(dataframe):
-    if dataframe is None or dataframe.empty:
-        raise ValueError("Leaderboard DataFrame is empty or None.")
-    return Leaderboard(
-        value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
-        ],
-        bool_checkboxgroup_label="Hide models",
-        interactive=False,
-    )
-demo = gr.Blocks(css=custom_css)
-with demo:
-    gr.HTML(TITLE)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
             with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
                     )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
                     )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
     with gr.Row():
-        with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
                 value=CITATION_BUTTON_TEXT,
                 label=CITATION_BUTTON_LABEL,
-                lines=20,
-                elem_id="citation-button",
-                show_copy_button=True,
-            )
-scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
-scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()

+import abc
 import gradio as gr
+from gen_table import *
+from meta_data import *
+# Extra <head> markup: on viewports at least 1536px wide, force the Gradio
+# container to take the full available width.
+head_style = """
+<style>
+@media (min-width: 1536px)
+{
+    .gradio-container {
+        min-width: var(--size-full) !important;
+    }
+}
+</style>
+"""
+# Tab titles for datasets whose display name differs from the dataset key.
+TAB_NAME_MAP = {
+    'MathVista': 'MathVista (Test Mini)',
+    'MathVerse': 'MathVerse (Vision Only)',
+}
+
+
+class _TabState:
+    """Plain attribute bag for the widgets and data of one dataset tab."""
+
+
+def _filter_and_rank(df, model_size, model_type):
+    """Return a filtered, re-ranked copy of ``df``.
+
+    Rows are kept when ``model_size_flag`` accepts their 'Param (B)' value for
+    the selected size buckets and ``model_type_flag`` accepts the row for the
+    selected model types. The 'Rank' column is then rewritten as 1..N in the
+    existing row order. The input frame is never modified.
+    """
+    # Work on copies so column assignment never targets a slice of the input.
+    df = df.copy()
+    df['flag'] = [model_size_flag(x, model_size) for x in df['Param (B)']]
+    df = df[df['flag']].copy()
+    df.pop('flag')
+    if len(df):
+        df['flag'] = [model_type_flag(row, model_type) for _, row in df.iterrows()]
+        df = df[df['flag']].copy()
+        df.pop('flag')
+    df['Rank'] = list(range(1, len(df) + 1))
+    return df
+
+
+with gr.Blocks(title="Math Leaderboard", head=head_style) as demo:
+    results = load_results()['results']
+    N_MODEL = len(results)
+    # Collect dataset names in first-seen order so the tab order is stable
+    # across restarts (iterating a set of strings is not). 'META' holds model
+    # metadata rather than a dataset.
+    DATASETS = list(dict.fromkeys(
+        d for m in results for d in results[m] if d != 'META'))
+    N_DATA = len(DATASETS)
+    # One state holder per dataset tab, indexed like DATASETS.
+    structs = [_TabState() for _ in range(N_DATA)]
+    gr.Markdown(LEADERBORAD_INTRODUCTION)
+    with gr.Tabs(elem_classes='tab-buttons') as tabs:
+        with gr.TabItem('🏅 LMM Math Leaderboard', elem_id='main', id=0):
+            # BUILD_L1_DF already returns the table from generate_table(results).
+            table, check_box = BUILD_L1_DF(results)
+            table['Rank'] = list(range(1, len(table) + 1))
+            type_map = check_box['type_map']
+            type_map['Rank'] = 'number'
+            checkbox_group = gr.CheckboxGroup(
+                choices=check_box['all'],
+                value=check_box['required'],
+                label='Evaluation Dimension',
+                interactive=True,
+            )
+            headers = ['Rank'] + check_box['essential'] + checkbox_group.value
             with gr.Row():
+                model_size = gr.CheckboxGroup(
+                    choices=MODEL_SIZE,
+                    value=MODEL_SIZE,
+                    label='Model Size',
+                    interactive=True
+                )
+                model_type = gr.CheckboxGroup(
+                    choices=MODEL_TYPE,
+                    value=MODEL_TYPE,
+                    label='Model Type',
+                    interactive=True
+                )
+            data_component = gr.components.DataFrame(
+                value=table[headers],
+                type='pandas',
+                datatype=[type_map[x] for x in headers],
+                interactive=False,
+                visible=True)
+
+            def filter_df(fields, model_size, model_type):
+                """Rebuild the main table from freshly loaded results.
+
+                Applies the size/type filters and shows 'Rank', the essential
+                columns and the selected ``fields``.
+                """
+                headers = ['Rank'] + check_box['essential'] + fields
+                fresh = generate_table(load_results()['results'])
+                df = _filter_and_rank(fresh, model_size, model_type)
+                return gr.components.DataFrame(
+                    value=df[headers],
+                    type='pandas',
+                    datatype=[type_map[x] for x in headers],
+                    interactive=False,
+                    visible=True)
+
+            for cbox in [checkbox_group, model_size, model_type]:
+                cbox.change(fn=filter_df, inputs=[checkbox_group, model_size, model_type], outputs=data_component)
+
+        def filter_df_l2(dataset_name, fields, model_size, model_type):
+            """Filter the table built at startup for the tab named ``dataset_name``.
+
+            The tab state is looked up through the hidden textbox value, so one
+            callback serves every dataset tab.
+            """
+            s = structs[DATASETS.index(dataset_name)]
+            headers = ['Rank'] + s.check_box['essential'] + fields
+            df = _filter_and_rank(s.table, model_size, model_type)
+            return gr.components.DataFrame(
+                value=df[headers],
+                type='pandas',
+                datatype=[s.type_map[x] for x in headers],
+                interactive=False,
+                visible=True)
+
+        for i, dataset in enumerate(DATASETS):
+            with gr.TabItem(
+                f'📊 {TAB_NAME_MAP.get(dataset, dataset)}', elem_id=dataset, id=i + 2):
+                s = structs[i]
+                s.table, s.check_box = BUILD_L2_DF(results, dataset)
+                s.type_map = s.check_box['type_map']
+                s.type_map['Rank'] = 'number'
+                s.checkbox_group = gr.CheckboxGroup(
+                    choices=s.check_box['all'],
+                    value=s.check_box['required'],
+                    label=f'{dataset} CheckBoxes',
+                    interactive=True,
+                )
+                s.headers = ['Rank'] + s.check_box['essential'] + s.checkbox_group.value
+                s.table['Rank'] = list(range(1, len(s.table) + 1))
+                with gr.Row():
+                    s.model_size = gr.CheckboxGroup(
+                        choices=MODEL_SIZE,
+                        value=MODEL_SIZE,
+                        label='Model Size',
+                        interactive=True
                     )
+                    s.model_type = gr.CheckboxGroup(
+                        choices=MODEL_TYPE,
+                        value=MODEL_TYPE,
+                        label='Model Type',
+                        interactive=True
                     )
+                s.data_component = gr.components.DataFrame(
+                    value=s.table[s.headers],
+                    type='pandas',
+                    datatype=[s.type_map[x] for x in s.headers],
+                    interactive=False,
+                    visible=True)
+                # Hidden textbox that passes the dataset name to the shared callback.
+                s.dataset = gr.Textbox(value=dataset, label=dataset, visible=False)
+                for cbox in [s.checkbox_group, s.model_size, s.model_type]:
+                    cbox.change(
+                        fn=filter_df_l2,
+                        inputs=[s.dataset, s.checkbox_group, s.model_size, s.model_type],
+                        outputs=s.data_component)
     with gr.Row():
+        with gr.Accordion('Citation', open=False):
             citation_button = gr.Textbox(
                 value=CITATION_BUTTON_TEXT,
                 label=CITATION_BUTTON_LABEL,
+                elem_id='citation-button')
+if __name__ == '__main__':
+    demo.launch(server_name='0.0.0.0')

gen_table.py ADDED Viewed

	@@ -0,0 +1,182 @@

+import copy as cp
+import json
+from collections import defaultdict
+from urllib.request import urlopen
+import gradio as gr
+import numpy as np
+import pandas as pd
+from meta_data import META_FIELDS, URL, DATASETS_ALL, DATASETS_ESS
+def listinstr(lst, s):
+    """Return True when at least one element of ``lst`` is contained in ``s``.
+
+    ``lst`` must be a ``list``; any other type trips the assertion.
+    """
+    assert isinstance(lst, list)
+    return any(needle in s for needle in lst)
+def upper_key(k):
+    """Convert an underscore-separated key into a space-separated label.
+
+    'ocr' maps to 'OCR'. A key containing '_' is split on '_', the first
+    character of each piece is upper-cased and the pieces are joined with
+    single spaces. Any other key is returned unchanged.
+    """
+    if k == 'ocr':
+        return 'OCR'
+    if '_' not in k:
+        return k
+    # Slice rather than index so empty pieces (from '__' or a leading/trailing
+    # '_') yield '' instead of raising IndexError.
+    return ' '.join(part[:1].upper() + part[1:] for part in k.split('_'))
+def load_results():
+    """Download the JSON document at ``URL`` and return it decoded."""
+    # Use the response as a context manager so the connection is closed even
+    # if reading or decoding fails.
+    with urlopen(URL) as response:
+        return json.loads(response.read())
+def nth_large(val, vals):
+    """Return the 1-based rank of ``val``: one plus the count of items in ``vals`` strictly greater than it."""
+    bigger = 0
+    for v in vals:
+        if v > val:
+            bigger += 1
+    return bigger + 1
+def model_size_flag(sz, FIELDS):
+    """Tell whether a model of size ``sz`` falls into one of the selected size buckets.
+
+    A missing size (``pd.isna``) matches only when 'Unknown' is in ``FIELDS``.
+    Otherwise ``sz`` is truncated with ``int`` and tested against the bucket
+    labels present in ``FIELDS``.
+    """
+    if pd.isna(sz):
+        return 'Unknown' in FIELDS
+    size = int(sz)
+    # (bucket label, whether the truncated size lies inside that bucket)
+    buckets = (
+        ('<4B', size < 4),
+        ('4B-10B', 4 <= size < 10),
+        ('10B-20B', 10 <= size < 20),
+        ('20B-40B', 20 <= size < 40),
+        ('>40B', size >= 40),
+    )
+    return any(label in FIELDS and inside for label, inside in buckets)
+def model_type_flag(line, FIELDS):
+    """Tell whether a table row matches one of the selected model types.
+
+    'OpenSource' selects rows whose 'OpenSource' value is 'Yes'; 'API' selects
+    rows whose 'OpenSource' value is 'No'. The row is only read for labels
+    that are present in ``FIELDS``.
+    """
+    expected = (('OpenSource', 'Yes'), ('API', 'No'))
+    for label, value in expected:
+        if label in FIELDS and line['OpenSource'] == value:
+            return True
+    return False
+def BUILD_L1_DF(results):
+    """Build the main leaderboard table together with its checkbox configuration.
+
+    Returns ``(df, check_box)``: ``df`` is ``generate_table(results)`` and
+    ``check_box`` carries the always-shown columns ('essential'), the
+    default-selected columns ('required'), every selectable column ('all') and
+    a column-name -> display-type mapping ('type_map').
+    """
+    # Any column not listed below falls back to the 'number' display type.
+    type_map = defaultdict(lambda: 'number')
+    for column in ('Method', 'Language Model', 'Vision Model', 'Org'):
+        type_map[column] = 'html'
+    for column in ('OpenSource', 'Verified'):
+        type_map[column] = 'str'
+    check_box = {
+        'essential': ['Method', 'Org', 'Param (B)', 'Language Model', 'Vision Model'],
+        # Edit this entry to change which datasets are selected by default.
+        'required': ['Overall'] + DATASETS_ESS,
+        'all': ['Overall'] + DATASETS_ALL,
+        'type_map': type_map,
+    }
+    return generate_table(results), check_box
+def BUILD_L2_DF(results, dataset):
+    """Build the detail table and checkbox configuration for one dataset.
+
+    Only models whose entry contains ``dataset`` are included. Returns
+    ``(df, check_box)`` where ``df`` holds the ``META_FIELDS`` columns followed
+    by the dataset's score fields (those containing 'Overall' first), sorted
+    in descending order of the overall score when one exists.
+
+    Raises ``AssertionError`` when no model reports ``dataset``.
+    """
+    sub = [v for v in results.values() if dataset in v]
+    assert len(sub), dataset
+    # Field names are taken from the first model that reports this dataset.
+    fields = list(sub[0][dataset].keys())
+    overall_fields = [x for x in fields if 'Overall' in x]
+    non_overall_fields = [x for x in fields if 'Overall' not in x]
+    all_fields = overall_fields + non_overall_fields
+    res = defaultdict(list)
+    for item in sub:
+        meta = item['META']
+        for k in META_FIELDS:
+            if k == 'Param (B)':
+                # Sizes are strings such as '7B'; an empty string means unknown.
+                param = meta['Parameters']
+                res[k].append(float(param.replace('B', '')) if param != '' else None)
+            elif k == 'Method':
+                name, url = meta['Method']
+                res[k].append(f'<a href="{url}">{name}</a>')
+            else:
+                # Render newlines and ' & ' separators as HTML line breaks.
+                s = meta[k].replace('\n', '<br>')
+                s = s.replace(' & ', '<br>')
+                res[k].append(s)
+        for d in all_fields:
+            res[d].append(float(item[dataset][d]))
+    df = pd.DataFrame(res)
+    # Default selection: the overall fields when there are any, otherwise the
+    # first five remaining fields.
+    required_fields = overall_fields if len(overall_fields) else non_overall_fields[:5]
+    # Sort on the plain 'Overall' column when present, else on the first
+    # overall-like field; with neither, keep the input order instead of
+    # raising KeyError.
+    if 'Overall' in df.columns:
+        sort_key = 'Overall'
+    elif overall_fields:
+        sort_key = overall_fields[0]
+    else:
+        sort_key = None
+    if sort_key is not None:
+        df = df.sort_values(sort_key)
+        df = df.iloc[::-1]
+    check_box = {}
+    check_box['essential'] = ['Method', 'Org', 'Param (B)', 'Language Model', 'Vision Model']
+    check_box['required'] = required_fields
+    check_box['all'] = all_fields
+    # Any column not listed below falls back to the 'number' display type.
+    type_map = defaultdict(lambda: 'number')
+    type_map['Method'] = 'html'
+    type_map['Language Model'] = type_map['Vision Model'] = type_map['Org'] = 'html'
+    type_map['OpenSource'] = type_map['Verified'] = 'str'
+    check_box['type_map'] = type_map
+    return df, check_box
+def generate_table(results):
+    """Build the main leaderboard table from the per-model ``results`` mapping.
+
+    Each row holds the ``META_FIELDS`` columns, the 'Overall' score of every
+    dataset in ``DATASETS_ALL`` rounded to one decimal (``None`` when the model
+    lacks that dataset) and an 'Overall' average over ``DATASETS_ESS``, which
+    is ``None`` unless every essential dataset is present.
+
+    Rows with an average come first, in descending order of it; the remaining
+    rows follow, ordered by their 'MathVista' score. An empty ``results``
+    mapping yields an empty frame that still has all expected columns.
+    """
+    res = defaultdict(list)
+    for item in results.values():
+        meta = item['META']
+        for k in META_FIELDS:
+            if k == 'Param (B)':
+                # Sizes are strings such as '7B'; an empty string means unknown.
+                param = meta['Parameters']
+                res[k].append(float(param.replace('B', '')) if param != '' else None)
+            elif k == 'Method':
+                name, url = meta['Method']
+                res[k].append(f'<a href="{url}">{name}</a>')
+            else:
+                # Render newlines and ' & ' separators as HTML line breaks.
+                s = meta[k].replace('\n', '<br>')
+                s = s.replace(' & ', '<br>')
+                res[k].append(s)
+        total = 0
+        complete = True
+        for d in DATASETS_ALL:
+            if d in item:
+                val = float(item[d]['Overall'])
+                val = float(f'{val:.1f}')
+            else:
+                val = None
+            res[d].append(val)
+            if d in DATASETS_ESS:
+                if val is None:
+                    # One missing essential dataset voids the average.
+                    complete = False
+                else:
+                    total += val
+        avg = float(f'{total / len(DATASETS_ESS):.1f}') if complete else None
+        res['Overall'].append(avg)
+    # Naming the columns keeps them present (and 'Overall' addressable) even
+    # when ``results`` is empty and ``res`` has no keys.
+    columns = list(META_FIELDS) + list(DATASETS_ALL) + ['Overall']
+    df = pd.DataFrame(res, columns=columns)
+    overall_isna = df[pd.isna(df['Overall'])]
+    overall_notna = df[~pd.isna(df['Overall'])]
+    overall_notna = overall_notna.sort_values('Overall')
+    overall_notna = overall_notna.iloc[::-1]
+    overall_isna = overall_isna.sort_values('MathVista')
+    overall_isna = overall_isna.iloc[::-1]
+    df = pd.concat([overall_notna, overall_isna])
+    return df

meta_data.py ADDED Viewed

	@@ -0,0 +1,26 @@

+# CONSTANTS-URL
+# Location of the JSON document that holds all leaderboard results.
+URL = "http://opencompass.openxlab.space/assets/MathLB.json"
+# CONSTANTS-CITATION
+# BibTeX entry shown in the citation box. The text starts right after the
+# opening quotes: this is a raw string, so a trailing backslash there would
+# not continue the line but would appear literally in the text.
+CITATION_BUTTON_TEXT = r"""@inproceedings{duan2024vlmevalkit,
+  title={Vlmevalkit: An open-source toolkit for evaluating large multi-modality models},
+  author={Duan, Haodong and Yang, Junming and Qiao, Yuxuan and Fang, Xinyu and Chen, Lin and Liu, Yuan and Dong, Xiaoyi and Zang, Yuhang and Zhang, Pan and Wang, Jiaqi and others},
+  booktitle={Proceedings of the 32nd ACM International Conference on Multimedia},
+  pages={11198--11201},
+  year={2024}
+}
+"""
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+# CONSTANTS-TEXT
+# Markdown rendered at the top of the page.
+LEADERBOARD_INTRODUCTION = """# Open LMM Reasoning Leaderboard
+This leaderboard aims at providing a comprehensive evaluation of the reasoning capabilities of LMMs. \
+Currently, it is a collection of evaluation results on multiple multi-modal mathematical reasoning benchmarks.
+"""
+# Misspelled name kept as an alias for existing importers.
+LEADERBORAD_INTRODUCTION = LEADERBOARD_INTRODUCTION
+# CONSTANTS-FIELDS
+# Every dataset that gets a column in the main table.
+DATASETS_ALL = ['MathVista', 'MathVision', 'MathVerse', 'DynaMath', 'MMMath', 'OlympiadBench']
+# Datasets averaged into the 'Overall' score and selected by default.
+DATASETS_ESS = ['MathVista', 'MathVision', 'MathVerse', 'DynaMath']
+# Metadata columns emitted for every model row.
+META_FIELDS = ['Method', 'Param (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified', 'Org']
+# Choices offered by the 'Model Size' and 'Model Type' filters.
+MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
+MODEL_TYPE = ['OpenSource', 'API']