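# de-arena
#
# NOTE: the lines originally found here ("Sleeping", "File size: 23,794 Bytes")
# are hosting-page residue rather than Python; they are kept as a comment so
# that the module can be parsed.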

import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns, SearchColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
    SUB_TITLE,
    EXTERNAL_LINKS,
    COMING_SOON_TEXT
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    EVAL_COLS,
    EVAL_TYPES,
    AutoEvalColumn,
    ModelType,
    fields,
    WeightType,
    Precision
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_model_leaderboard_df
from src.submission.submit import add_new_eval


def restart_space():
    """Restart the Space identified by ``REPO_ID`` through ``API.restart_space``.

    Called as the fallback when a dataset snapshot download fails during
    start-up, and registered as the periodic job of the background scheduler
    at the bottom of this file.
    """
    space_id = REPO_ID
    API.restart_space(repo_id=space_id)

### Space initialisation
# Download the request-queue dataset and then the results dataset into their
# local directories. Any failure triggers a Space restart request; the script
# itself does not exit at that point.
for _dataset_repo, _local_dir in (
    (QUEUE_REPO, EVAL_REQUESTS_PATH),
    (RESULTS_REPO, EVAL_RESULTS_PATH),
):
    try:
        print(_local_dir)
        snapshot_download(
            repo_id=_dataset_repo,
            local_dir=_local_dir,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except Exception:
        restart_space()


# Leaderboard table built from the downloaded results and requests.
LEADERBOARD_DF = get_leaderboard_df(
    EVAL_RESULTS_PATH,
    EVAL_REQUESTS_PATH,
    COLS,
    BENCHMARK_COLS,
)

# Evaluation queue, returned as three tables: finished, running and pending.
_queue_tables = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = _queue_tables


def init_leaderboard(dataframe):
    """Build a non-interactive ``Leaderboard`` component showing *dataframe*.

    Column selection and column filters are switched off (both are passed as
    ``None``); the only control configured is a search box keyed on the model
    name column.

    Args:
        dataframe: Table to display; must not be ``None`` or empty.

    Returns:
        The constructed ``Leaderboard`` component.

    Raises:
        ValueError: If *dataframe* is ``None`` or empty.
    """
    has_data = dataframe is not None and not dataframe.empty
    if not has_data:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Declared type of every AutoEvalColumn field, in field order.
    column_types = [column.type for column in fields(AutoEvalColumn)]

    # Search is restricted to the model name; no secondary columns.
    search_box = SearchColumns(
        primary_column=AutoEvalColumn.model.name,
        secondary_columns=[],
        placeholder="Search by the model name",
        label="Searching",
    )

    # Names of the fields flagged as hidden.
    hidden_names = [column.name for column in fields(AutoEvalColumn) if column.hidden]

    return Leaderboard(
        value=dataframe,
        datatype=column_types,
        select_columns=None,
        search_columns=search_box,
        hide_columns=hidden_names,
        filter_columns=None,
        interactive=False,
    )

# Results file passed to get_model_leaderboard_df by every leaderboard tab
# below. Earlier snapshots are kept commented out for reference; only the
# last (uncommented) assignment is active.
# model_result_path = "./src/results/models_2024-10-07-14:50:12.666068.jsonl"
# model_result_path = "./src/results/models_2024-10-08-03:10:26.811832.jsonl"
# model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
# model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
model_result_path = "./src/results/models_2024-10-09-05:17:38.810960.json"
# model_leaderboard_df = get_model_leaderboard_df(model_result_path)


def overall_leaderboard(dataframe):
    """Wrap *dataframe* in a read-only ``Leaderboard`` with model-name search.

    Args:
        dataframe: Table to display; must not be ``None`` or empty.

    Returns:
        The constructed ``Leaderboard`` component.

    Raises:
        ValueError: If *dataframe* is ``None`` or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # NOTE(review): datatype and hide_columns are derived from every
    # AutoEvalColumn field rather than from the columns present in
    # *dataframe*, while the callers in this file request a column subset —
    # confirm the two stay aligned.
    leaderboard_options = {
        "value": dataframe,
        "datatype": [column.type for column in fields(AutoEvalColumn)],
        "select_columns": None,
        "search_columns": SearchColumns(
            primary_column=AutoEvalColumn.model.name,
            secondary_columns=[],
            placeholder="Search by the model name",
            label="Searching",
        ),
        "hide_columns": [column.name for column in fields(AutoEvalColumn) if column.hidden],
        "filter_columns": None,
        "interactive": False,
    }
    return Leaderboard(**leaderboard_options)
    


# Top-level Gradio UI: static page header followed by a set of tabs.
demo = gr.Blocks(css=custom_css)
with demo:
    # Header content; all four texts are imported from src.about.
    gr.HTML(TITLE)
    gr.HTML(SUB_TITLE)
    gr.HTML(EXTERNAL_LINKS)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    # Container for the top-level tabs declared below (ids 0-6).
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        
        with gr.TabItem("🏅 Overview", elem_id="llm-benchmark-tab-table", id=0):
            # NOTE(review): the model count and date in this text are
            # hard-coded, not derived from the results file — keep them in
            # sync with model_result_path by hand.
            DESCRIPTION_TEXT = """
            Total #models: 52 (Last updated: 2024-10-08)
            
            This page provides a comprehensive overview of model ranks across various dimensions. Models are sorted based on their averaged rank across all dimensions. 
            (Some missing values are due to the slow or problematic model responses, and we will update the leaderboard once we have the complete results.)
            """
            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")

            # Model name followed by one rank column per dimension. The
            # chemistry rank is not part of this view, and rank_col is left
            # empty here.
            leaderboard = overall_leaderboard(
                get_model_leaderboard_df(
                    model_result_path,
                    benchmark_cols=[
                        AutoEvalColumn.model.name,
                        AutoEvalColumn.rank_overall.name,
                        AutoEvalColumn.rank_math_algebra.name,
                        AutoEvalColumn.rank_math_geometry.name,
                        AutoEvalColumn.rank_math_probability.name,
                        AutoEvalColumn.rank_reason_logical.name,
                        AutoEvalColumn.rank_reason_social.name,
                    ],
                    rank_col=[],
                )
            )
            
        with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
            # Introductory text rendered above the table.
            DESCRIPTION_TEXT = """
            Overall dimension measures the comprehensive performance of LLMs across diverse tasks. 
            We start with diverse questions from the widely-used [MT-Bench](https://arxiv.org/abs/2306.05685), 
            covering a wide range of domains, including writing, roleplay, extraction, reasoning, math, coding, knowledge I (STEM), and knowledge II (humanities/social science).
            """
            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")

            # Rank, model, score and sd for the overall dimension, followed by
            # the model metadata columns; the overall rank is passed as rank_col.
            leaderboard = overall_leaderboard(
                get_model_leaderboard_df(
                    model_result_path,
                    benchmark_cols=[
                        AutoEvalColumn.rank_overall.name,
                        AutoEvalColumn.model.name,
                        AutoEvalColumn.score_overall.name,
                        AutoEvalColumn.sd_overall.name,
                        AutoEvalColumn.license.name,
                        AutoEvalColumn.organization.name,
                        AutoEvalColumn.knowledge_cutoff.name,
                    ],
                    rank_col=[AutoEvalColumn.rank_overall.name],
                )
            )
            
        with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
            # Introductory text rendered above the math sub-tabs.
            DESCRIPTION_TEXT = """
            Algebra, Geometry, and Probability are the current three main math domains in the leaderboard. 
            To mitigate the potential impact of data contamination, we have carefully selected the datasets from various sources.
            We prioritize **recent math datasets** and focus on **college and beyond level** math questions. 
            The current datasets include
            [MATH](https://arxiv.org/abs/2103.03874), 
            [MATH-500](https://github.com/openai/prm800k/tree/main/prm800k/math_splits), 
            [Omni](https://omni-math.github.io/), 
            [MathQA](https://arxiv.org/abs/1905.13319), 
            [MathBench](https://arxiv.org/abs/2405.12209), 
            [SciBench](https://arxiv.org/abs/2307.10635), and more!
            
            We plan to include more math domains, such as calculus, number theory, and more in the future.
            """
            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")

            # (tab label, elem_id, rank column, score column, sd column) for
            # each math sub-tab; the position in the tuple is the sub-tab id.
            math_subtabs = (
                (
                    "🧮 Algebra",
                    "algebra_subtab",
                    AutoEvalColumn.rank_math_algebra.name,
                    AutoEvalColumn.score_math_algebra.name,
                    AutoEvalColumn.sd_math_algebra.name,
                ),
                (
                    "📐 Geometry",
                    "geometry_subtab",
                    AutoEvalColumn.rank_math_geometry.name,
                    AutoEvalColumn.score_math_geometry.name,
                    AutoEvalColumn.sd_math_geometry.name,
                ),
                (
                    "📊 Probability",
                    "prob_subtab",
                    AutoEvalColumn.rank_math_probability.name,
                    AutoEvalColumn.score_math_probability.name,
                    AutoEvalColumn.sd_math_probability.name,
                ),
            )
            # Every sub-tab shows the same layout: its own rank/score/sd
            # columns around the model name, then the model metadata columns.
            for subtab_id, (subtab_label, subtab_elem_id, rank_name, score_name, sd_name) in enumerate(math_subtabs):
                with gr.TabItem(subtab_label, elem_id=subtab_elem_id, id=subtab_id, elem_classes="subtab"):
                    leaderboard = overall_leaderboard(
                        get_model_leaderboard_df(
                            model_result_path,
                            benchmark_cols=[
                                rank_name,
                                AutoEvalColumn.model.name,
                                score_name,
                                sd_name,
                                AutoEvalColumn.license.name,
                                AutoEvalColumn.organization.name,
                                AutoEvalColumn.knowledge_cutoff.name,
                            ],
                            rank_col=[rank_name],
                        )
                    )
                
        # NOTE(review): elem_id "reasonong-tab-table" looks like a typo for
        # "reasoning-tab-table"; it is left unchanged because stylesheets
        # outside this file may target it — confirm before renaming.
        with gr.TabItem("🧠 Reasoning", elem_id="reasonong-tab-table", id=3):
            # Introductory text rendered above the reasoning sub-tabs.
            DESCRIPTION_TEXT = """
            Reasoning is a broad domain for evaluating LLMs, but traditional tasks like commonsense reasoning have become less effective in differentiating modern LLMs. 
            We now present two challenging types of reasoning: logical reasoning and social reasoning, both of which present more meaningful and sophisticated ways to assess LLM performance.
            
            For logical reasoning, we leverage datasets from sources such as
            [BIG-Bench Hard (BBH)](https://arxiv.org/abs/2210.09261),
            [FOLIO](https://arxiv.org/abs/2209.00840),
            [LogiQA2.0](https://github.com/csitfun/LogiQA2.0),
            [PrOntoQA](https://arxiv.org/abs/2210.01240), and
            [ReClor](https://arxiv.org/abs/2002.04326).
            These cover a range of tasks including deductive reasoning, object counting and tracking, pattern recognition, 
            temporal reasoning, first-order logic reasoning, etc.
            For social reasoning, we collect datasets from
            [MMToM-QA (Text-only)](https://arxiv.org/abs/2401.08743),
            [BigToM](https://arxiv.org/abs/2306.15448),
            [Adv-CSFB](https://arxiv.org/abs/2305.14763),
            [SocialIQA](https://arxiv.org/abs/1904.09728),
            [NormBank](https://arxiv.org/abs/2305.17008), covering challenging social reasoning tasks, 
            such as social commonsense reasoning, social normative reasoning, Theory of Mind (ToM) reasoning, etc.
            
            """
            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")

            # (tab label, elem_id, rank column, score column, sd column) for
            # each reasoning sub-tab; the position in the tuple is the sub-tab id.
            reasoning_subtabs = (
                (
                    "🧩 Logical",
                    "logical_subtab",
                    AutoEvalColumn.rank_reason_logical.name,
                    AutoEvalColumn.score_reason_logical.name,
                    AutoEvalColumn.sd_reason_logical.name,
                ),
                (
                    "🗣️ Social",
                    "social_subtab",
                    AutoEvalColumn.rank_reason_social.name,
                    AutoEvalColumn.score_reason_social.name,
                    AutoEvalColumn.sd_reason_social.name,
                ),
            )
            # Both sub-tabs share one layout: their own rank/score/sd columns
            # around the model name, then the model metadata columns.
            for subtab_id, (subtab_label, subtab_elem_id, rank_name, score_name, sd_name) in enumerate(reasoning_subtabs):
                with gr.TabItem(subtab_label, elem_id=subtab_elem_id, id=subtab_id, elem_classes="subtab"):
                    leaderboard = overall_leaderboard(
                        get_model_leaderboard_df(
                            model_result_path,
                            benchmark_cols=[
                                rank_name,
                                AutoEvalColumn.model.name,
                                score_name,
                                sd_name,
                                AutoEvalColumn.license.name,
                                AutoEvalColumn.organization.name,
                                AutoEvalColumn.knowledge_cutoff.name,
                            ],
                            rank_col=[rank_name],
                        )
                    )

        with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
            # Introductory text rendered above the science sub-tabs.
            CURRENT_TEXT = """
            The science domain is a critical area for evaluating LLMs. 
            We are working on adding several tasks on scientific domains to the leaderboard. The forthcoming ones are biology, chemistry, and physics. 
            We have diversely and aggressively collected recent science datasets, including but not limited to
            [GPQA](https://arxiv.org/abs/2311.12022),
            [JEEBench](https://aclanthology.org/2023.emnlp-main.468/),
            [MMLU-Pro](https://arxiv.org/abs/2406.01574),
            [OlympiadBench](https://arxiv.org/abs/2402.14008),
            [SciBench](https://arxiv.org/abs/2307.10635),
            [SciEval](https://arxiv.org/abs/2308.13149).
            """
            gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")

            # Chemistry is the only science sub-tab with a table so far.
            with gr.TabItem("🧪 Chemistry", elem_id="chemistry_subtab", id=0, elem_classes="subtab"):
                leaderboard = overall_leaderboard(
                    get_model_leaderboard_df(
                        model_result_path,
                        benchmark_cols=[
                            AutoEvalColumn.rank_chemistry.name,
                            AutoEvalColumn.model.name,
                            AutoEvalColumn.score_chemistry.name,
                            # No sd column is requested for chemistry.
                            AutoEvalColumn.license.name,
                            AutoEvalColumn.organization.name,
                            AutoEvalColumn.knowledge_cutoff.name,
                        ],
                        rank_col=[AutoEvalColumn.rank_chemistry.name],
                    )
                )

            # Placeholder sub-tabs: only a "Coming soon" heading is rendered.
            with gr.TabItem("⚛️ Physics", elem_id="physics_subtab", id=1, elem_classes="subtab"):
                CURRENT_TEXT = """
                # Coming soon!
                """
                gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")

            with gr.TabItem("🧬 Biology", elem_id="biology_subtab", id=2, elem_classes="subtab"):
                CURRENT_TEXT = """
                # Coming soon!
                """
                gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")


        # Placeholder tab: renders static "Coming soon" text only, no table.
        with gr.TabItem("</> Coding", elem_id="coding-table", id=5):
            CURRENT_TEXT = """
            # Coming soon!
            We are working on adding more tasks in coding domains to the leaderboard. 
            The forthcoming ones focus on Python, Java, and C++, with plans to expand to more languages. 
            We collect a variety of recent coding datasets, including 
            [HumanEval](https://huggingface.co/datasets/openai/openai_humaneval), 
            [MBPP](https://huggingface.co/datasets/google-research-datasets/mbpp), 
            [HumanEvalFix](https://huggingface.co/datasets/bigcode/humanevalpack), 
            [newly crawled LeetCode data](https://leetcode.com/problemset/), 
            filtered code-related queries from [Arena-Hard-Auto](https://github.com/lmarena/arena-hard-auto) and more!
            Our efforts also include synthesizing new code-related queries to ensure diversity!
            """
            gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")




        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=6):
            # Static project, team and contact information.
            ABOUT_TEXT = """
            # About Us
            
            [Decentralized Arena](https://de-arena.maitrix.org/) is an open-source project that automates and scales the evaluation of large language models (LLMs) across various fine-grained dimensions,
            developed by researchers from UCSD, CMU, MBZUAI, [Maitrix.org](https://maitrix.org/) and [LLM360](https://www.llm360.ai/). 
            
            Stay tuned for more updates and new features!
            
            ## Team members
            Yanbin Yin, [Zhen Wang](https://zhenwang9102.github.io/), [Kun Zhou](https://lancelot39.github.io/), Xiangdong Zhang,
            [Shibo Hao](https://ber666.github.io/), [Yi Gu](https://www.yigu.page/), Jieyuan Liu, Somanshu Singla, [Tianyang Liu](https://leolty.github.io/),
            [Eric P. Xing](https://www.cs.cmu.edu/~epxing/), [Zhengzhong Liu](https://hunterhector.github.io/), [Haojian Jin](https://www.haojianj.in/),
            [Zhiting Hu](https://zhiting.ucsd.edu/)
            
            ## Contact Us
            - Follow us on X, [Maitrix.org](https://twitter.com/MaitrixOrg) and [LLM360](https://twitter.com/llm360)
            
            """
            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")


        '''
        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

                with gr.Column():
                    with gr.Accordion(
                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            finished_eval_table = gr.components.Dataframe(
                                value=finished_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
                    with gr.Accordion(
                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            running_eval_table = gr.components.Dataframe(
                                value=running_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )

                    with gr.Accordion(
                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            pending_eval_table = gr.components.Dataframe(
                                value=pending_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
            with gr.Row():
                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                    model_type = gr.Dropdown(
                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                        label="Model type",
                        multiselect=False,
                        value=None,
                        interactive=True,
                    )

                with gr.Column():
                    precision = gr.Dropdown(
                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
                        label="Precision",
                        multiselect=False,
                        value="float16",
                        interactive=True,
                    )
                    weight_type = gr.Dropdown(
                        choices=[i.value.name for i in WeightType],
                        label="Weights type",
                        multiselect=False,
                        value="Original",
                        interactive=True,
                    )
                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

            submit_button = gr.Button("Submit Eval")
            submission_result = gr.Markdown()
            submit_button.click(
                add_new_eval,
                [
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    precision,
                    weight_type,
                    model_type,
                ],
                submission_result,
            )
        '''
        
    # Collapsible citation box placed after the tabs; closed by default.
    with gr.Row(), gr.Accordion("📙 Citation", open=False):
        citation_box_options = {
            "value": CITATION_BUTTON_TEXT,
            "label": CITATION_BUTTON_LABEL,
            "lines": 20,
            "elem_id": "citation-button",
            "show_copy_button": True,
        }
        citation_button = gr.Textbox(**citation_box_options)

# Schedule a Space restart every 30 minutes (1800 s) in the background.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=30 * 60)
scheduler.start()

# Enable the request queue with a default concurrency limit of 40, then
# launch the app on the object returned by queue().
queued_demo = demo.queue(default_concurrency_limit=40)
queued_demo.launch()