qq-hzlh committed on
Commit fc71d05 · verified · 1 Parent(s): 3954c20

Upload 5 files

Files changed (5)
  1. app.py +188 -189
  2. detail_math_score.json +345 -0
  3. gen_table.py +218 -0
  4. meta_data.py +68 -0
  5. overall_math_score.json +155 -0
app.py CHANGED
@@ -1,204 +1,203 @@
 
1
  import gradio as gr
2
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
- import pandas as pd
4
- from apscheduler.schedulers.background import BackgroundScheduler
5
- from huggingface_hub import snapshot_download
6
-
7
- from src.about import (
8
- CITATION_BUTTON_LABEL,
9
- CITATION_BUTTON_TEXT,
10
- EVALUATION_QUEUE_TEXT,
11
- INTRODUCTION_TEXT,
12
- LLM_BENCHMARKS_TEXT,
13
- TITLE,
14
- )
15
- from src.display.css_html_js import custom_css
16
- from src.display.utils import (
17
- BENCHMARK_COLS,
18
- COLS,
19
- EVAL_COLS,
20
- EVAL_TYPES,
21
- AutoEvalColumn,
22
- ModelType,
23
- fields,
24
- WeightType,
25
- Precision
26
- )
27
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
- from src.submission.submit import add_new_eval
30
-
31
-
32
- def restart_space():
33
- API.restart_space(repo_id=REPO_ID)
34
-
35
- ### Space initialisation
36
- try:
37
- print(EVAL_REQUESTS_PATH)
38
- snapshot_download(
39
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
40
- )
41
- except Exception:
42
- restart_space()
43
- try:
44
- print(EVAL_RESULTS_PATH)
45
- snapshot_download(
46
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
- )
48
- except Exception:
49
- restart_space()
50
-
51
-
52
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
-
54
- (
55
- finished_eval_queue_df,
56
- running_eval_queue_df,
57
- pending_eval_queue_df,
58
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
-
60
- def init_leaderboard(dataframe):
61
- if dataframe is None or dataframe.empty:
62
- raise ValueError("Leaderboard DataFrame is empty or None.")
63
- return Leaderboard(
64
- value=dataframe,
65
- datatype=[c.type for c in fields(AutoEvalColumn)],
66
- select_columns=SelectColumns(
67
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
- label="Select Columns to Display:",
70
- ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
- filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
- ColumnFilter(
77
- AutoEvalColumn.params.name,
78
- type="slider",
79
- min=0.01,
80
- max=150,
81
- label="Select the number of parameters (B)",
82
- ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
- ],
87
- bool_checkboxgroup_label="Hide models",
88
- interactive=False,
89
- )
90
-
91
-
92
- demo = gr.Blocks(css=custom_css)
93
- with demo:
94
- gr.HTML(TITLE)
95
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
-
97
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
- leaderboard = init_leaderboard(LEADERBOARD_DF)
100
-
101
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
-
104
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
- with gr.Column():
106
- with gr.Row():
107
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
-
109
- with gr.Column():
110
- with gr.Accordion(
111
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
112
- open=False,
113
- ):
114
- with gr.Row():
115
- finished_eval_table = gr.components.Dataframe(
116
- value=finished_eval_queue_df,
117
- headers=EVAL_COLS,
118
- datatype=EVAL_TYPES,
119
- row_count=5,
120
- )
121
- with gr.Accordion(
122
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
123
- open=False,
124
- ):
125
- with gr.Row():
126
- running_eval_table = gr.components.Dataframe(
127
- value=running_eval_queue_df,
128
- headers=EVAL_COLS,
129
- datatype=EVAL_TYPES,
130
- row_count=5,
131
- )
132
-
133
- with gr.Accordion(
134
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
- open=False,
136
- ):
137
- with gr.Row():
138
- pending_eval_table = gr.components.Dataframe(
139
- value=pending_eval_queue_df,
140
- headers=EVAL_COLS,
141
- datatype=EVAL_TYPES,
142
- row_count=5,
143
- )
144
- with gr.Row():
145
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
146
 
147
  with gr.Row():
148
- with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
- label="Model type",
154
- multiselect=False,
155
- value=None,
156
- interactive=True,
157
  )
158
 
159
- with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
163
- multiselect=False,
164
- value="float16",
165
- interactive=True,
166
  )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
- multiselect=False,
171
- value="Original",
172
- interactive=True,
173
  )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
-
176
- submit_button = gr.Button("Submit Eval")
177
- submission_result = gr.Markdown()
178
- submit_button.click(
179
- add_new_eval,
180
- [
181
- model_name_textbox,
182
- base_model_name_textbox,
183
- revision_name_textbox,
184
- precision,
185
- weight_type,
186
- model_type,
187
- ],
188
- submission_result,
189
  )
190
 
191
  with gr.Row():
192
  with gr.Accordion("📙 Citation", open=False):
193
- citation_button = gr.Textbox(
194
- value=CITATION_BUTTON_TEXT,
195
- label=CITATION_BUTTON_LABEL,
196
- lines=20,
197
  elem_id="citation-button",
198
  show_copy_button=True,
199
  )
200
 
201
- scheduler = BackgroundScheduler()
202
- scheduler.add_job(restart_space, "interval", seconds=1800)
203
- scheduler.start()
204
- demo.queue(default_concurrency_limit=40).launch()
 
1
+ import abc
2
  import gradio as gr
3
 
4
+ from gen_table import *
5
+ from meta_data import *
6
+
7
+ # import pandas as pd
8
+ # pd.set_option('display.max_colwidth', 0)
9
+
10
+ head_style = """
11
+ <style>
12
+ @media (min-width: 1536px)
13
+ {
14
+ .gradio-container {
15
+ min-width: var(--size-full) !important;
16
+ }
17
+ }
18
+ </style>
19
+ """
20
+
21
+ with gr.Blocks(title="Open Agent Leaderboard", head=head_style) as demo:
22
+ struct = load_results(OVERALL_MATH_SCORE_FILE)
23
+ timestamp = struct['time']
24
+ EVAL_TIME = format_timestamp(timestamp)
25
+ results = struct['results']
26
+ N_MODEL = len(results)
27
+ N_DATA = len(results['IO'])
28
+ DATASETS = list(results['IO'])
29
+ DATASETS.remove('META')
30
+ print(DATASETS)
31
+
32
+
33
+
34
+ with gr.Tabs(elem_classes='tab-buttons') as tabs:
35
+ gr.Markdown(LEADERBORAD_INTRODUCTION.format(EVAL_TIME))
36
+
37
+ with gr.TabItem('🏅 Open Agent Overall Math Leaderboard', elem_id='math', id=0):
38
+ gr.Markdown(LEADERBOARD_MD['MATH_MAIN'])
39
+ check_box = BUILD_L1_DF(results, DEFAULT_MATH_BENCH)
40
+ table = generate_table(results, DEFAULT_MATH_BENCH)
41
+
42
+ type_map = check_box['type_map']
43
+ type_map['Rank'] = 'number'
44
+
45
+ checkbox_group = gr.CheckboxGroup(
46
+ choices=check_box['all'],
47
+ value=check_box['required'],
48
+ label='Evaluation Dimension',
49
+ interactive=True,
50
+ )
51
+
52
+ headers = ['Rank'] + check_box['essential'] + checkbox_group.value
53
+ data_component = gr.components.DataFrame(
54
+ value=table[headers],
55
+ type='pandas',
56
+ datatype=[type_map[x] for x in headers],
57
+ interactive=False,
58
+ wrap=True,
59
+ visible=True)
60
+
61
+ def filter_df(fields, *args):
62
+ # Get the essential columns plus the currently selected columns
63
+ headers = ['Rank'] + check_box['essential'] + fields
64
+ df = table.copy()
65
+
66
+ comp = gr.components.DataFrame(
67
+ value=table[headers], # only display the selected columns
68
+ type='pandas',
69
+ datatype=[type_map[x] for x in headers],
70
+ interactive=False,
71
+ wrap=True,
72
+ visible=True)
73
+ return comp
74
+
75
+ # The change event of checkbox_group only needs checkbox_group as its input
76
+ checkbox_group.change(
77
+ fn=filter_df,
78
+ inputs=[checkbox_group],
79
+ outputs=data_component
80
+ )
81
+
82
+ # detail math leaderboard
83
+ with gr.TabItem('🏅 Open Agent Detail Math Leaderboard', elem_id='math_detail', id=1):
84
+ gr.Markdown(LEADERBOARD_MD['MATH_DETAIL'])
85
+ struct_detail = load_results(DETAIL_MATH_SCORE_FILE)
86
+ timestamp = struct_detail['time']
87
+ EVAL_TIME = format_timestamp(timestamp)
88
+ results_detail = struct_detail['results']
89
+
90
+ table, check_box = BUILD_L2_DF(results_detail, DEFAULT_MATH_BENCH)
91
+ # table = generate_table_detail(results_detail, DEFAULT_MATH_BENCH)
92
+
93
+ type_map = check_box['type_map']
94
+ type_map['Rank'] = 'number'
95
+
96
+ checkbox_group = gr.CheckboxGroup(
97
+ choices=check_box['all'],
98
+ value=check_box['required'],
99
+ label='Evaluation Dimension',
100
+ interactive=True,
101
+ )
102
+
103
+ headers = ['Rank'] + checkbox_group.value
104
  with gr.Row():
105
+
106
+ algo_name = gr.CheckboxGroup(
107
+ choices=ALGORITHMS,
108
+ value=ALGORITHMS,
109
+ label='Algorithm',
110
+ interactive=True
111
  )
112
 
113
+ dataset_name = gr.CheckboxGroup(
114
+ choices=DATASETS,
115
+ value=DATASETS,
116
+ label='Datasets',
117
+ interactive=True
118
  )
119
+
120
+ llm_name = gr.CheckboxGroup(
121
+ choices=LLM,
122
+ value=LLM,
123
+ label='LLM',
124
+ interactive=True
125
  )
126
+
127
+ data_component = gr.components.DataFrame(
128
+ value=table[headers],
129
+ type='pandas',
130
+ datatype=[type_map[x] for x in headers],
131
+ interactive=False,
132
+ wrap=True,
133
+ visible=True)
134
+
135
+ def filter_df(fields, algos, datasets, llms):
136
+ headers = ['Rank'] + check_box['essential'] + fields
137
+ df = table.copy()
138
+
139
+ # Filter the data
140
+ df['flag'] = df.apply(lambda row: (
141
+ row['Algorithm'] in algos and
142
+ row['Dataset'] in datasets and
143
+ row['LLM'] in llms
144
+ ), axis=1)
145
+
146
+ df = df[df['flag']].copy()
147
+ df.pop('flag')
148
+
149
+ # Group by dataset, sort by Score within each group, and compute the rank
150
+ if 'Score' in df.columns:
151
+ # Create a temporary rank column
152
+ df['Rank'] = df.groupby('Dataset')['Score'].rank(method='first', ascending=False)
153
+
154
+ # Make sure the rank is an integer
155
+ df['Rank'] = df['Rank'].astype(int)
156
+
157
+ comp = gr.components.DataFrame(
158
+ value=df[headers],
159
+ type='pandas',
160
+ datatype=[type_map[x] for x in headers],
161
+ interactive=False,
162
+ wrap=True,
163
+ visible=True)
164
+ return comp
165
+
166
+ # Add change events for all checkbox groups
167
+ checkbox_group.change(
168
+ fn=filter_df,
169
+ inputs=[checkbox_group, algo_name, dataset_name, llm_name],
170
+ outputs=data_component
171
+ )
172
+
173
+ algo_name.change(
174
+ fn=filter_df,
175
+ inputs=[checkbox_group, algo_name, dataset_name, llm_name],
176
+ outputs=data_component
177
  )
178
+
179
+ dataset_name.change(
180
+ fn=filter_df,
181
+ inputs=[checkbox_group, algo_name, dataset_name, llm_name],
182
+ outputs=data_component
183
+ )
184
+
185
+ llm_name.change(
186
+ fn=filter_df,
187
+ inputs=[checkbox_group, algo_name, dataset_name, llm_name],
188
+ outputs=data_component
189
+ )
190
+
191
 
192
  with gr.Row():
193
  with gr.Accordion("📙 Citation", open=False):
194
+ gr.Textbox(
195
+ value=CITATION_BUTTON_TEXT, lines=7,
196
+ label="Copy the BibTeX snippet to cite this source",
 
197
  elem_id="citation-button",
198
  show_copy_button=True,
199
  )
200
 
201
+
202
+ if __name__ == '__main__':
203
+ demo.launch(server_name='0.0.0.0')
 
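For reference, the detail tab's `filter_df` above first drops rows whose Algorithm/Dataset/LLM are unselected and then recomputes `Rank` within each dataset. A minimal standalone sketch of that logic (toy data, not part of the commit):

```python
import pandas as pd

# Toy table with the same columns the detail leaderboard uses
table = pd.DataFrame({
    'Algorithm': ['IO', 'COT', 'IO', 'COT'],
    'Dataset':   ['gsm8k', 'gsm8k', 'AQuA', 'AQuA'],
    'LLM':       ['gpt-3.5-turbo'] * 4,
    'Score':     [37.83, 78.70, 38.98, 61.02],
})

# Keep only the selected algorithms/datasets/LLMs, then re-rank per dataset
algos, datasets, llms = ['IO', 'COT'], ['gsm8k'], ['gpt-3.5-turbo']
df = table[table['Algorithm'].isin(algos)
           & table['Dataset'].isin(datasets)
           & table['LLM'].isin(llms)].copy()
df['Rank'] = (df.groupby('Dataset')['Score']
                .rank(method='first', ascending=False)
                .astype(int))
print(df.sort_values(['Dataset', 'Rank']))
```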
detail_math_score.json ADDED
@@ -0,0 +1,345 @@
1
+ {
2
+ "time": "2025-01-09 17:13:45",
3
+ "results": {
4
+ "IO": {
5
+ "gpt-3.5-turbo": {
6
+ "META": {
7
+ "Algorithm": "IO",
8
+ "LLM": "gpt-3.5-turbo",
9
+ "Eval Date": "2025/01/07"
10
+ },
11
+ "gsm8k": {
12
+ "Score": 37.83,
13
+ "Pass rate": 99.92,
14
+ "X-shot": 8,
15
+ "Parameters": "",
16
+ "Samples": 1319,
17
+ "Total input tokens": 546990,
18
+ "Average input tokens": 415,
19
+ "Total output tokens": 39563,
20
+ "Average output tokens": 30,
21
+ "All tokens": 586553,
22
+ "Cost($)": 0.3328
23
+ },
24
+ "AQuA": {
25
+ "Score": 38.98,
26
+ "Pass rate": 100.00,
27
+ "X-shot": 0,
28
+ "Parameters": "",
29
+ "Samples": 254,
30
+ "Total input tokens": 25701,
31
+ "Average input tokens": 101,
32
+ "Total output tokens": 16770,
33
+ "Average output tokens": 66,
34
+ "All tokens": 42471,
35
+ "Cost($)": 0.0380
36
+ }
37
+ },
38
+ "Doubao-lite-32k": {
39
+ "META": {
40
+ "Algorithm": "IO",
41
+ "LLM": "Doubao-lite-32k",
42
+ "Eval Date": "2025/01/07"
43
+ },
44
+ "gsm8k": {
45
+ "Score": 72.02,
46
+ "Pass rate": 99.92,
47
+ "X-shot": 8,
48
+ "Parameters": "",
49
+ "Samples": 1319,
50
+ "Total input tokens": 617377,
51
+ "Average input tokens": 468,
52
+ "Total output tokens": 123106,
53
+ "Average output tokens": 93,
54
+ "All tokens": 740483,
55
+ "Cost($)": 0.0354
56
+ },
57
+ "AQuA": {
58
+ "Score": 79.13,
59
+ "Pass rate": 100.00,
60
+ "X-shot": 0,
61
+ "Parameters": "",
62
+ "Samples": 254,
63
+ "Total input tokens": 33058,
64
+ "Average input tokens": 130,
65
+ "Total output tokens": 54684,
66
+ "Average output tokens": 215,
67
+ "All tokens": 87742,
68
+ "Cost($)": 0.0058
69
+ }
70
+ }
71
+ },
72
+ "COT": {
73
+ "gpt-3.5-turbo": {
74
+ "META": {
75
+ "Algorithm": "COT",
76
+ "LLM": "gpt-3.5-turbo",
77
+ "Eval Date": "2025/01/07"
78
+ },
79
+ "gsm8k": {
80
+ "Score": 78.70,
81
+ "Pass rate": 100.00,
82
+ "X-shot": 8,
83
+ "Parameters": "",
84
+ "Samples": 1319,
85
+ "Total input tokens": 953242,
86
+ "Average input tokens": 723,
87
+ "Total output tokens": 134799,
88
+ "Average output tokens": 102,
89
+ "All tokens": 1088041,
90
+ "Cost($)": 0.6788
91
+ },
92
+ "AQuA": {
93
+ "Score": 61.02,
94
+ "Pass rate": 93.70,
95
+ "X-shot": 0,
96
+ "Parameters": "",
97
+ "Samples": 254,
98
+ "Total input tokens": 25447,
99
+ "Average input tokens": 100,
100
+ "Total output tokens": 55346,
101
+ "Average output tokens": 218,
102
+ "All tokens": 80793,
103
+ "Cost($)": 0.0957
104
+ }
105
+ },
106
+ "Doubao-lite-32k": {
107
+ "META": {
108
+ "Algorithm": "COT",
109
+ "LLM": "Doubao-lite-32k",
110
+ "Eval Date": "2025/01/07"
111
+ },
112
+ "gsm8k": {
113
+ "Score": 89.31,
114
+ "Pass rate": 100.00,
115
+ "X-shot": 8,
116
+ "Parameters": "",
117
+ "Samples": 1319,
118
+ "Total input tokens": 1042095,
119
+ "Average input tokens": 790,
120
+ "Total output tokens": 159725,
121
+ "Average output tokens": 121,
122
+ "All tokens": 1201820,
123
+ "Cost($)": 0.0557
124
+ },
125
+ "AQuA": {
126
+ "Score": 82.68,
127
+ "Pass rate": 97.24,
128
+ "X-shot": 0,
129
+ "Parameters": "",
130
+ "Samples": 254,
131
+ "Total input tokens": 27978,
132
+ "Average input tokens": 110,
133
+ "Total output tokens": 66599,
134
+ "Average output tokens": 262,
135
+ "All tokens": 94577,
136
+ "Cost($)": 0.0066
137
+ }
138
+ }
139
+ },
140
+ "SC-COT": {
141
+ "gpt-3.5-turbo": {
142
+ "META": {
143
+ "Algorithm": "SC-COT",
144
+ "LLM": "gpt-3.5-turbo",
145
+ "Eval Date": "2025/01/07"
146
+ },
147
+ "gsm8k": {
148
+ "Score": 80.06,
149
+ "Pass rate": 99.62,
150
+ "X-shot": 8,
151
+ "Parameters": "temperature=1, num_path=5",
152
+ "Samples": 1319,
153
+ "Total input tokens": 5260319,
154
+ "Average input tokens": 3988,
155
+ "Total output tokens": 1595016,
156
+ "Average output tokens": 1209,
157
+ "All tokens": 6855335,
158
+ "Cost($)": 5.0227
159
+ },
160
+ "AQuA": {
161
+ "Score": 67.32,
162
+ "Pass rate": 100.00,
163
+ "X-shot": 0,
164
+ "Parameters": "temperature=1, path_num=5",
165
+ "Samples": 254,
166
+ "Total input tokens": 219241,
167
+ "Average input tokens": 863,
168
+ "Total output tokens": 359629,
169
+ "Average output tokens": 1416,
170
+ "All tokens": 578870,
171
+ "Cost($)": 0.6491
172
+ }
173
+ },
174
+ "Doubao-lite-32k": {
175
+ "META": {
176
+ "Algorithm": "SC-COT",
177
+ "LLM": "Doubao-lite-32k",
178
+ "Eval Date": "2025/01/07"
179
+ },
180
+ "gsm8k": {
181
+ "Score": 88.63,
182
+ "Pass rate": 99.77,
183
+ "X-shot": 8,
184
+ "Parameters": "temperature=1, num_path=5",
185
+ "Samples": 1319,
186
+ "Total input tokens": 1150443,
187
+ "Average input tokens": 872,
188
+ "Total output tokens": 1295750,
189
+ "Average output tokens": 982,
190
+ "All tokens": 2446193,
191
+ "Cost($)": 0.1533
192
+ },
193
+ "AQuA": {
194
+ "Score": 83.46,
195
+ "Pass rate": 97.24,
196
+ "X-shot": 0,
197
+ "Parameters": "temperature=1, num_path=5",
198
+ "Samples": 254,
199
+ "Total input tokens": 259804,
200
+ "Average input tokens": 1023,
201
+ "Total output tokens": 369741,
202
+ "Average output tokens": 1456,
203
+ "All tokens": 629545,
204
+ "Cost($)": 0.0409
205
+ }
206
+ }
207
+ },
208
+ "POT": {
209
+ "gpt-3.5-turbo": {
210
+ "META": {
211
+ "Algorithm": "POT",
212
+ "LLM": "gpt-3.5-turbo",
213
+ "Eval Date": "2025/01/07"
214
+ },
215
+ "gsm8k": {
216
+ "Score": 76.88,
217
+ "Pass rate": 99.24,
218
+ "X-shot": 8,
219
+ "Parameters": "",
220
+ "Samples": 1319,
221
+ "Total input tokens": 1090418,
222
+ "Average input tokens": 827,
223
+ "Total output tokens": 96662,
224
+ "Average output tokens": 73,
225
+ "All tokens": 1187080,
226
+ "Cost($)": 0.6902
227
+ },
228
+ "AQuA": {
229
+ "Score": 51.97,
230
+ "Pass rate": 92.91,
231
+ "X-shot": 0,
232
+ "Parameters": "",
233
+ "Samples": 254,
234
+ "Total input tokens": 223438,
235
+ "Average input tokens": 880,
236
+ "Total output tokens": 29323,
237
+ "Average output tokens": 115,
238
+ "All tokens": 252761,
239
+ "Cost($)": 0.1557
240
+ }
241
+ },
242
+ "Doubao-lite-32k": {
243
+ "META": {
244
+ "Algorithm": "POT",
245
+ "LLM": "Doubao-lite-32k",
246
+ "Eval Date": "2025/01/07"
247
+ },
248
+ "gsm8k": {
249
+ "Score": 79.15,
250
+ "Pass rate": 92.65,
251
+ "X-shot": 8,
252
+ "Parameters": "",
253
+ "Samples": 1319,
254
+ "Total input tokens": 1170038,
255
+ "Average input tokens": 887,
256
+ "Total output tokens": 116987,
257
+ "Average output tokens": 89,
258
+ "All tokens": 1287025,
259
+ "Cost($)": 0.0575
260
+ },
261
+ "AQuA": {
262
+ "Score": 52.36,
263
+ "Pass rate": 82.28,
264
+ "X-shot": 0,
265
+ "Parameters": "",
266
+ "Samples": 254,
267
+ "Total input tokens": 256721,
268
+ "Average input tokens": 1011,
269
+ "Total output tokens": 44729,
270
+ "Average output tokens": 176,
271
+ "All tokens": 301450,
272
+ "Cost($)": 0.0142
273
+ }
274
+ }
275
+ },
276
+ "ReAct-Pro": {
277
+ "gpt-3.5-turbo": {
278
+ "META": {
279
+ "Algorithm": "ReAct-Pro",
280
+ "LLM": "gpt-3.5-turbo",
281
+ "Eval Date": "2025/01/07"
282
+ },
283
+ "gsm8k": {
284
+ "Score": 74.91,
285
+ "Pass rate": 99.39,
286
+ "X-shot": 8,
287
+ "Parameters": "max_steps=10",
288
+ "Samples": 1319,
289
+ "Total input tokens": 6506164,
290
+ "Average input tokens": 4933,
291
+ "Total output tokens": 140122,
292
+ "Average output tokens": 106,
293
+ "All tokens": 6646286,
294
+ "Cost($)": 3.4633
295
+ },
296
+ "AQuA": {
297
+ "Score": 64.57,
298
+ "Pass rate": 98.03,
299
+ "X-shot": 0,
300
+ "Parameters": "max_steps=10",
301
+ "Samples": 254,
302
+ "Total input tokens": 862614,
303
+ "Average input tokens": 3396,
304
+ "Total output tokens": 40973,
305
+ "Average output tokens": 161,
306
+ "All tokens": 903587,
307
+ "Cost($)": 0.4928
308
+ }
309
+ },
310
+ "Doubao-lite-32k": {
311
+ "META": {
312
+ "Algorithm": "ReAct-Pro",
313
+ "LLM": "Doubao-lite-32k",
314
+ "Eval Date": "2025/01/07"
315
+ },
316
+ "gsm8k": {
317
+ "Score": 85.60,
318
+ "Pass rate": 99.62,
319
+ "X-shot": 8,
320
+ "Parameters": "max_steps=10",
321
+ "Samples": 1319,
322
+ "Total input tokens": 5862016,
323
+ "Average input tokens": 4444,
324
+ "Total output tokens": 136623,
325
+ "Average output tokens": 104,
326
+ "All tokens": 5998639,
327
+ "Cost($)": 0.2513
328
+ },
329
+ "AQuA": {
330
+ "Score": 77.56,
331
+ "Pass rate": 96.06,
332
+ "X-shot": 0,
333
+ "Parameters": "max_steps=10",
334
+ "Samples": 254,
335
+ "Total input tokens": 977890,
336
+ "Average input tokens": 3850,
337
+ "Total output tokens": 54951,
338
+ "Average output tokens": 216,
339
+ "All tokens": 1032841,
340
+ "Cost($)": 0.0446
341
+ }
342
+ }
343
+ }
344
+ }
345
+ }
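The file nests scores as algorithm → LLM → dataset, with a `META` block per LLM. A quick, illustrative way to pull a single metric out of it:

```python
import json

with open("detail_math_score.json") as f:
    data = json.load(f)

# algorithm -> LLM -> dataset -> metric
score = data["results"]["COT"]["gpt-3.5-turbo"]["gsm8k"]["Score"]
print(score)  # 78.7
```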
gen_table.py ADDED
@@ -0,0 +1,218 @@
1
+ import copy as cp
2
+ import json
3
+ from collections import defaultdict
4
+ from urllib.request import urlopen
5
+
6
+ import gradio as gr
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ from meta_data import OVERALL_MATH_SCORE_FILE, DEFAULT_MATH_BENCH, META_FIELDS
11
+
12
+
13
+ def listinstr(lst, s):
14
+ assert isinstance(lst, list)
15
+ for item in lst:
16
+ if item in s:
17
+ return True
18
+ return False
19
+
20
+
21
+ def load_results(file_name=OVERALL_MATH_SCORE_FILE):
22
+ data = json.loads(open(file_name, "r").read())
23
+ return data
24
+
25
+ def format_timestamp(timestamp):
26
+ date = timestamp[:10]
27
+ time = timestamp[11:13] + ':' + timestamp[14:16] + ':' + timestamp[17:19]
28
+ return date + ' ' + time
29
+
30
+ def nth_large(val, vals):
31
+ return sum([1 for v in vals if v > val]) + 1
32
+
33
+ def BUILD_L1_DF(results, fields):
34
+ check_box = {}
35
+ check_box['essential'] = ['Algorithm', 'LLM', 'Eval Date']
36
+ # revise here to set the default datasets
37
+ check_box['required'] = ['Avg Score'] + [item for f in fields for item in (f'{f}-Score', f'{f}-Cost($)')]
38
+ check_box['avg'] = ['Avg Score']
39
+ check_box['all'] = check_box['avg'] + [item for f in fields for item in (f'{f}-Score', f'{f}-Cost($)')]
40
+ type_map = defaultdict(lambda: 'number')
41
+ type_map['Algorithm'] = 'html'
42
+ type_map['LLM'] = type_map['Vision Model'] = 'html'
43
+ type_map['Eval Date'] = 'str'
44
+ check_box['type_map'] = type_map
45
+
46
+ # df = generate_table(results, fields)
47
+ return check_box
48
+
49
+
50
+ def BUILD_L2_DF(results, fields):
51
+ res = defaultdict(list)
52
+
53
+ # Iterate over each algorithm and its corresponding models
54
+ for algo_name, algo_data in results.items():
55
+ for model_name, model_data in algo_data.items():
56
+ # Get META information
57
+ meta = model_data['META']
58
+
59
+ # Create a record for each dataset
60
+ for dataset in fields:
61
+ if dataset not in model_data:
62
+ continue
63
+
64
+ # Add metadata
65
+ for k, v in meta.items():
66
+ res[k].append(v)
67
+
68
+ # Add dataset name
69
+ res['Dataset'].append(dataset)
70
+
71
+ # Get dataset data
72
+ dataset_data = model_data[dataset]
73
+
74
+ # Add all fields
75
+ for field, value in dataset_data.items():
76
+ res[field].append(value)
77
+
78
+ # Create DataFrame
79
+ df = pd.DataFrame(res)
80
+
81
+ # Sort by Dataset and Score in descending order
82
+ df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
83
+
84
+ # Add rank for each dataset separately
85
+ df['Rank'] = df.groupby('Dataset').cumcount() + 1
86
+
87
+ # Rearrange column order
88
+ columns = ['Rank', 'Algorithm', 'Dataset', 'LLM', 'Eval Date', 'Score', 'Pass rate', 'X-shot', 'Parameters']
89
+ remaining_columns = [col for col in df.columns if col not in columns]
90
+ df = df[columns + remaining_columns]
91
+
92
+ # Set checkbox configuration
93
+ check_box = {}
94
+ check_box['essential'] = ['Algorithm', 'Dataset', 'LLM', 'Eval Date']
95
+ check_box['required'] = check_box['essential'] + ['Score', 'Pass rate', 'X-shot', 'Parameters', 'Samples', 'All tokens', 'Cost($)']
96
+ check_box['all'] = ['Score', 'Pass rate', 'X-shot', 'Parameters', 'Samples', 'Total input tokens', 'Average input tokens', 'Total output tokens', 'Average output tokens', 'All tokens', 'Cost($)']
97
+ type_map = defaultdict(lambda: 'number')
98
+ type_map['Algorithm'] = 'html'
99
+ type_map['LLM'] = type_map['Vision Model'] = 'html'
100
+ type_map['Eval Date'] = 'str'
101
+ type_map['Dataset'] = 'str'
102
+ type_map['Parameters'] = 'str'
103
+ type_map['All tokens'] = 'number'
104
+ type_map['Cost($)'] = 'number'
105
+ check_box['type_map'] = type_map
106
+
107
+
108
+ return df, check_box
109
+
110
+
111
+ def generate_table(results, fields):
112
+ res = defaultdict(list)
113
+ for i, m in enumerate(results):
114
+ item = results[m]
115
+ meta = item['META']
116
+ for k in META_FIELDS:
117
+ res[k].append(meta[k])
118
+ scores, costs = [], []
119
+ for d in fields:
120
+ if d in item.keys():
121
+ res[d+"-Score"].append(item[d]["Score"])
122
+ res[d+"-Cost($)"].append(item[d]["Cost($)"])
123
+ scores.append(item[d]["Score"])
124
+ costs.append(item[d]["Cost($)"])
125
+ else:
126
+ res[d+"-Score"].append(None)
127
+ res[d+"-Cost($)"].append(None)
128
+ scores.append(None)
129
+ costs.append(None)
130
+
131
+ res['Avg Score'].append(round(np.mean(scores), 2) if None not in scores else None)
132
+
133
+ df = pd.DataFrame(res)
134
+
135
+ # Sort by Avg Score and assign rank
136
+ valid = df[~pd.isna(df['Avg Score'])].copy()
137
+ missing = df[pd.isna(df['Avg Score'])].copy()
138
+
139
+ # Assign rank to valid rows (using integer type)
140
+ valid = valid.sort_values('Avg Score', ascending=False)
141
+ valid['Rank'] = pd.Series(range(1, len(valid) + 1)[::-1], dtype=int)
142
+
143
+ # Assign last rank to missing rows (using integer type)
144
+ if not missing.empty:
145
+ missing['Rank'] = pd.Series([len(valid) + 1] * len(missing), dtype=int)
146
+
147
+ # Merge and sort by Rank
148
+ df = pd.concat([valid, missing])
149
+ df = df.sort_values('Rank')
150
+
151
+ # Rearrange column order to ensure Rank is the first column
152
+ columns = ['Rank', 'Algorithm', 'LLM', 'Eval Date', 'Avg Score'] # Fixed column order
153
+ for d in fields:
154
+ columns.extend([f"{d}-Score", f"{d}-Cost($)"]) # Add dataset-related columns
155
+
156
+ # Ensure all columns exist and reorder
157
+ existing_columns = [col for col in columns if col in df.columns]
158
+ remaining_columns = [col for col in df.columns if col not in columns]
159
+ df = df[existing_columns + remaining_columns] # Reorder columns
160
+
161
+ # Sort by Score in descending order
162
+ df = df.sort_values(['Avg Score'], ascending=[False])
163
+
164
+ # Add rank for each dataset separately
165
+ df['Rank'] = range(1, len(df) + 1)
166
+
167
+ # Rearrange column order
168
+ columns = ['Rank', 'Algorithm', 'LLM', 'Eval Date', 'Avg Score']
169
+ remaining_columns = [col for col in df.columns if col not in columns]
170
+ df = df[columns + remaining_columns]
171
+ return df
172
+
173
+
174
+
175
+
176
+ def generate_table_detail(results, fields):
177
+ res = defaultdict(list)
178
+
179
+ # Iterate over each algorithm and its corresponding models
180
+ for algo_name, algo_data in results.items():
181
+ for model_name, model_data in algo_data.items():
182
+ # Get META information
183
+ meta = model_data['META']
184
+
185
+ # Create a record for each dataset
186
+ for dataset in fields:
187
+ if dataset not in model_data:
188
+ continue
189
+
190
+ # Add metadata
191
+ for k, v in meta.items():
192
+ res[k].append(v)
193
+
194
+ # Add dataset name
195
+ res['Dataset'].append(dataset)
196
+
197
+ # Get dataset data
198
+ dataset_data = model_data[dataset]
199
+
200
+ # Add all fields
201
+ for field, value in dataset_data.items():
202
+ res[field].append(value)
203
+
204
+ # Create DataFrame
205
+ df = pd.DataFrame(res)
206
+
207
+ # Sort by Dataset and Score in descending order
208
+ df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
209
+
210
+ # Add rank for each dataset separately
211
+ df['Rank'] = df.groupby('Dataset').cumcount() + 1
212
+
213
+ # Rearrange column order
214
+ columns = ['Rank', 'Dataset', 'Algorithm', 'LLM', 'Eval Date', 'Score', 'Pass rate', 'X-shot', 'Parameters']
215
+ remaining_columns = [col for col in df.columns if col not in columns]
216
+ df = df[columns + remaining_columns]
217
+
218
+ return df
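Taken together, `gen_table.py` turns the two JSON files into the DataFrames the app renders. An illustrative driver, assuming the JSON files sit next to the script (not part of the commit):

```python
from gen_table import load_results, generate_table, BUILD_L1_DF, BUILD_L2_DF
from meta_data import (OVERALL_MATH_SCORE_FILE, DETAIL_MATH_SCORE_FILE,
                       DEFAULT_MATH_BENCH)

# Main leaderboard: one row per algorithm/LLM pair, ranked by Avg Score
overall = load_results(OVERALL_MATH_SCORE_FILE)['results']
check_box = BUILD_L1_DF(overall, DEFAULT_MATH_BENCH)   # column configuration only
main_table = generate_table(overall, DEFAULT_MATH_BENCH)

# Detail leaderboard: one row per algorithm/LLM/dataset, ranked within each dataset
detail = load_results(DETAIL_MATH_SCORE_FILE)['results']
detail_table, detail_check_box = BUILD_L2_DF(detail, DEFAULT_MATH_BENCH)

print(main_table[['Rank', 'Algorithm', 'LLM', 'Avg Score']].head())
print(detail_table[['Rank', 'Dataset', 'Algorithm', 'Score']].head())
```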
meta_data.py ADDED
@@ -0,0 +1,68 @@
1
+ # CONSTANTS-URL
2
+ URL = "http://opencompass.openxlab.space/assets/OpenVLM.json"
3
+ OVERALL_MATH_SCORE_FILE = "overall_math_score.json"
4
+ DETAIL_MATH_SCORE_FILE = "detail_math_score.json"
5
+ # CONSTANTS-TEXT
6
+ LEADERBORAD_INTRODUCTION = """# Open Agent Leaderboard
7
+ ### Welcome to the Open Agent Leaderboard! We share the evaluation results of open agents: COT, SC-COT, POT, ReAct, etc. The agents are implemented with the open-source framework [*OmAgent*](https://github.com/om-ai-lab/OmAgent).
8
+
9
+ This leaderboard was last updated: {}.
10
+
11
+ To add your own agent to the leaderboard, please create a PR in [*OmAgent*](https://github.com/om-ai-lab/OmAgent); we will then help with the evaluation and update the leaderboard. For any questions or concerns, please feel free to contact us.
12
+ """
13
+
14
+ DEFAULT_MATH_BENCH = [
15
+ 'gsm8k', 'AQuA'
16
+ ]
17
+ # The README file for each benchmark
18
+ LEADERBOARD_MD = {}
19
+
20
+ LEADERBOARD_MD['MATH_MAIN'] = f"""
21
+ ## Math task main Evaluation Results
22
+
23
+ - Metrics:
24
+ - Avg Score: The average score across all math benchmarks (normalized to 0 - 100, the higher the better).
25
+ - Rank: The rank based on Avg Score (the lower the better).
26
+ - Score: The evaluation score on each math benchmark (the higher the better).
27
+ - Cost: The cost on each math benchmark (the lower the better).
28
+
29
+ - By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}, sorted in descending order of Avg Score.
30
+ """
31
+
32
+ LEADERBOARD_MD['MATH_DETAIL'] = f"""
33
+ ## Math task detail Evaluation Results
34
+
35
+ - By default, we present the overall evaluation results based on {', '.join(DEFAULT_MATH_BENCH)}
36
+ - default parameters: temperature=0.0
37
+ - LLM prices:
38
+ - gpt-3.5-turbo:
39
+ - 0.0005$/1K tokens (input)
40
+ - 0.0015$/1K tokens (output)
41
+ - Doubao-lite-32k (1 USD = 7.3249 CNY):
42
+ - 0.00004096$/1K tokens (input)
43
+ - 0.0001$/1K tokens (output)
44
+ - ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
45
+ """
46
+
47
+ META_FIELDS = [
48
+ 'Algorithm', 'LLM', 'Eval Date'
49
+ ]
50
+
51
+ DATASETS = [
52
+ 'gsm8k', 'AQuA'
53
+ ]
54
+
55
+ LLM = [
56
+ 'Doubao-lite-32k', 'gpt-3.5-turbo'
57
+ ]
58
+
59
+ ALGORITHMS = [
60
+ 'IO', 'COT', 'SC-COT', 'POT', 'ReAct-Pro' # must match the 'Algorithm' values in detail_math_score.json
61
+ ]
62
+
63
+ CITATION_BUTTON_TEXT = r"""@article{zhang2024omagent,
64
+ title={OmAgent: A Multi-modal Agent Framework for Complex Video Understanding with Task Divide-and-Conquer},
65
+ author={Zhang, Lu and Zhao, Tiancheng and Ying, Heting and Ma, Yibo and Lee, Kyusong},
66
+ journal={arXiv preprint arXiv:2406.16620},
67
+ year={2024}
68
+ }"""
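The prices listed in `LEADERBOARD_MD['MATH_DETAIL']` are quoted per 1K tokens; a quick sanity check against the IO / gpt-3.5-turbo / gsm8k entry in detail_math_score.json (546,990 input and 39,563 output tokens, reported Cost($) = 0.3328), illustrative only:

```python
# gpt-3.5-turbo prices, per 1K tokens (see the price list above)
INPUT_PRICE_PER_1K = 0.0005
OUTPUT_PRICE_PER_1K = 0.0015

# Token counts for IO / gpt-3.5-turbo / gsm8k from detail_math_score.json
input_tokens, output_tokens = 546_990, 39_563

cost = (input_tokens / 1000 * INPUT_PRICE_PER_1K
        + output_tokens / 1000 * OUTPUT_PRICE_PER_1K)
print(round(cost, 4))  # 0.3328 -- matches the reported Cost($)
```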
overall_math_score.json ADDED
@@ -0,0 +1,155 @@
1
+ {
2
+ "time": "2025-01-09 17:13:45",
3
+ "results": {
4
+ "IO": {
5
+ "META": {
6
+ "Algorithm": "IO",
7
+ "LLM": "gpt-3.5-turbo",
8
+ "Eval Date": "2025/01/07"
9
+ },
10
+ "gsm8k": {
11
+ "Score": 37.83,
12
+ "Cost($)": 0.3328
13
+ },
14
+ "AQuA": {
15
+ "Score": 38.98,
16
+ "Cost($)": 0.0380
17
+ }
18
+ },
19
+ "COT": {
20
+ "META": {
21
+ "Algorithm": "COT",
22
+ "LLM": "gpt-3.5-turbo",
23
+ "Eval Date": "2025/01/07"
24
+ },
25
+ "gsm8k": {
26
+ "Score": 78.70,
27
+ "Cost($)": 0.6788
28
+ },
29
+ "AQuA": {
30
+ "Score": 61.02,
31
+ "Cost($)": 0.0957
32
+ }
33
+ },
34
+ "SC-COT": {
35
+ "META": {
36
+ "Algorithm": "SC-COT",
37
+ "LLM": "gpt-3.5-turbo",
38
+ "Eval Date": "2025/01/07"
39
+ },
40
+ "gsm8k": {
41
+ "Score": 80.06,
42
+ "Cost($)": 5.0227
43
+ },
44
+ "AQuA": {
45
+ "Score": 67.32,
46
+ "Cost($)": 0.6491
47
+ }
48
+ },
49
+ "POT": {
50
+ "META": {
51
+ "Algorithm": "POT",
52
+ "LLM": "gpt-3.5-turbo",
53
+ "Eval Date": "2025/01/07"
54
+ },
55
+ "gsm8k": {
56
+ "Score": 76.88,
57
+ "Cost($)": 0.6902
58
+ },
59
+ "AQuA": {
60
+ "Score": 51.97,
61
+ "Cost($)": 0.1557
62
+ }
63
+ },
64
+ "ReAct-Pro*": {
65
+ "META": {
66
+ "Algorithm": "ReAct-Pro*",
67
+ "LLM": "gpt-3.5-turbo",
68
+ "Eval Date": "2025/01/07"
69
+ },
70
+ "gsm8k": {
71
+ "Score": 74.91,
72
+ "Cost($)": 3.4633
73
+ },
74
+ "AQuA": {
75
+ "Score": 64.57,
76
+ "Cost($)": 0.4928
77
+ }
78
+ },
79
+ "IO-Doubao": {
80
+ "META": {
81
+ "Algorithm": "IO",
82
+ "LLM": "Doubao-lite-32k",
83
+ "Eval Date": "2025/01/07"
84
+ },
85
+ "gsm8k": {
86
+ "Score": 72.02,
87
+ "Cost($)": 0.0354
88
+ },
89
+ "AQuA": {
90
+ "Score": 79.13,
91
+ "Cost($)": 0.0058
92
+ }
93
+ },
94
+ "COT-Doubao": {
95
+ "META": {
96
+ "Algorithm": "COT",
97
+ "LLM": "Doubao-lite-32k",
98
+ "Eval Date": "2025/01/07"
99
+ },
100
+ "gsm8k": {
101
+ "Score": 89.31,
102
+ "Cost($)": 0.0557
103
+ },
104
+ "AQuA": {
105
+ "Score": 82.68,
106
+ "Cost($)": 0.0066
107
+ }
108
+ },
109
+ "SC-COT-Doubao": {
110
+ "META": {
111
+ "Algorithm": "SC-COT",
112
+ "LLM": "Doubao-lite-32k",
113
+ "Eval Date": "2025/01/07"
114
+ },
115
+ "gsm8k": {
116
+ "Score": 88.63,
117
+ "Cost($)": 0.1533
118
+ },
119
+ "AQuA": {
120
+ "Score": 83.46,
121
+ "Cost($)": 0.0409
122
+ }
123
+ },
124
+ "POT-Doubao": {
125
+ "META": {
126
+ "Algorithm": "POT",
127
+ "LLM": "Doubao-lite-32k",
128
+ "Eval Date": "2025/01/07"
129
+ },
130
+ "gsm8k": {
131
+ "Score": 79.15,
132
+ "Cost($)": 0.0575
133
+ },
134
+ "AQuA": {
135
+ "Score": 52.36,
136
+ "Cost($)": 0.0142
137
+ }
138
+ },
139
+ "ReAct-Pro-Doubao": {
140
+ "META": {
141
+ "Algorithm": "ReAct-Pro",
142
+ "LLM": "Doubao-lite-32k",
143
+ "Eval Date": "2025/01/07"
144
+ },
145
+ "gsm8k": {
146
+ "Score": 85.60,
147
+ "Cost($)": 0.2513
148
+ },
149
+ "AQuA": {
150
+ "Score": 77.56,
151
+ "Cost($)": 0.0446
152
+ }
153
+ }
154
+ }
155
+ }