import gradio as gr from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns, SearchColumns import pandas as pd from apscheduler.schedulers.background import BackgroundScheduler from huggingface_hub import snapshot_download from src.about import ( CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE, SUB_TITLE, EXTERNAL_LINKS, COMING_SOON_TEXT ) from src.display.css_html_js import custom_css from src.display.utils import ( BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, ModelType, fields, WeightType, Precision ) from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_model_leaderboard_df from src.submission.submit import add_new_eval def restart_space(): API.restart_space(repo_id=REPO_ID) ### Space initialisation try: print(EVAL_REQUESTS_PATH) snapshot_download( repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN ) except Exception: restart_space() try: print(EVAL_RESULTS_PATH) snapshot_download( repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN ) except Exception: restart_space() LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS) ( finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df, ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS) def init_leaderboard(dataframe): if dataframe is None or dataframe.empty: raise ValueError("Leaderboard DataFrame is empty or None.") return Leaderboard( value=dataframe, datatype=[c.type for c in fields(AutoEvalColumn)], select_columns=None, # SelectColumns( # default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default], # cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden], # label="Select Columns to Display:", # ), # search_columns=None, # search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name], search_columns=SearchColumns(primary_column=AutoEvalColumn.model.name, secondary_columns=[], placeholder="Search by the model name", label="Searching"), hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden], filter_columns=None, # [ # ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"), # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"), # ColumnFilter( # AutoEvalColumn.params.name, # type="slider", # min=0.01, # max=150, # label="Select the number of parameters (B)", # ), # ColumnFilter( # AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True # ), # ], # bool_checkboxgroup_label="Hide models", interactive=False, ) # model_result_path = "./src/results/models_2024-10-20-23:34:57.242641.json" # model_result_path = "./src/results/models_2024-10-24-08:08:59.127307.json" model_result_path = "./src/results/models_2024-11-08-08:36:00.464224.json" # model_leaderboard_df = get_model_leaderboard_df(model_result_path) def overall_leaderboard(dataframe): if dataframe is None or dataframe.empty: raise ValueError("Leaderboard DataFrame is empty or None.") return Leaderboard( value=dataframe, datatype=[c.type for c in fields(AutoEvalColumn)], select_columns=None, search_columns=SearchColumns(primary_column=AutoEvalColumn.model.name, secondary_columns=[], placeholder="Search by the model name", label="Searching"), hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden], filter_columns=None, interactive=False, ) # Your leaderboard name TITLE = """

๐Ÿ… Decentralized Arena Leaderboard

""" SUB_TITLE = """

Automated, Robust, and Transparent LLM Evaluation for Numerous Dimensions

""" # GitHub | EXTERNAL_LINKS = """ """ # What does your leaderboard evaluate? INTRODUCTION_TEXT = """ **Decentralized Arena** automates and scales "Chatbot Arena" for LLM evaluation across various fine-grained dimensions (e.g., math โ€“ algebra, geometry, probability; logical reasoning, social reasoning, biology, chemistry, โ€ฆ). The evaluation is decentralized and democratic, with all LLMs participating in evaluating others. It achieves a 95\% correlation with Chatbot Arena's overall rankings, while being fully transparent and reproducible. """ demo = gr.Blocks(css=custom_css) with demo: gr.HTML(TITLE) gr.HTML(SUB_TITLE) gr.HTML(EXTERNAL_LINKS) # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") # gr.HTML('

This is a larger text using HTML in Markdown.

') INTRODUCTION_TEXT_FONT_SIZE = 16 INTRODUCTION_TEXT = ( f'

' 'Decentralized Arena automates, scales, and accelerates Chatbot Arena ' 'for large language model (LLM) evaluation across diverse, fine-grained dimensions, ' 'such as mathematics (algebra, geometry, probability), logical reasoning, social reasoning, science (chemistry, physics, biology), or any user-defined dimensions. ' 'The evaluation is decentralized and democratic, with all participating LLMs assessing each other to ensure unbiased and fair results. ' 'With a 95% correlation to Chatbot Arena\'s overall rankings, the system is fully transparent and reproducible.' '

' f'

' 'We actively invite model developers to participate and expedite their benchmarking efforts ' 'and encourage data stakeholders to freely define and evaluate dimensions of interest for their own objectives.' '

' ) gr.HTML(INTRODUCTION_TEXT) ''' TEXT = ( f'

' '' '

' ) gr.HTML(TEXT) ''' with gr.Tabs(elem_classes="tab-buttons") as tabs: with gr.TabItem("๐Ÿ… Overview", elem_id="llm-benchmark-tab-table", id=0): # DESCRIPTION_TEXT = """ # Total #models: 57 (Last updated: 2024-10-21) # This page prvovides a comprehensive overview of model ranks across various dimensions, based on their averaged ranks. # (Missing values are due to the slow or problemtic model responses to be fixed soom.) # """ # gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text") TEXT = ( f'

' # 'Total #models: 57 (Last updated: 2024-10-21)' 'Total #models: 62 (Last updated: 2024-11-08)' '

' f'

' 'This page prvovides a comprehensive overview of model ranks across various dimensions, based on their averaged ranks or scores.' '(Missing values are due to the slow or problemtic model responses to be fixed soom.)' '

' # '

' # 'We present ' # '

' ) gr.HTML(TEXT) with gr.TabItem("โญ Sort by Rank", elem_id="overall_sort_by_rank_subtab", id=0, elem_classes="subtab"): leaderboard = overall_leaderboard( get_model_leaderboard_df( model_result_path, benchmark_cols=[ # AutoEvalColumn.rank_overall.name, AutoEvalColumn.model.name, AutoEvalColumn.rank_math_algebra.name, AutoEvalColumn.rank_math_geometry.name, AutoEvalColumn.rank_math_probability.name, AutoEvalColumn.rank_reason_logical.name, AutoEvalColumn.rank_reason_social.name, AutoEvalColumn.rank_chemistry.name, AutoEvalColumn.rank_biology.name, AutoEvalColumn.rank_physics.name, AutoEvalColumn.rank_overall.name, # AutoEvalColumn.rank_cpp.name, ], rank_col=['sort_by_rank', 1, 8], ) ) with gr.TabItem("โญ Sort by Score", elem_id="overall_sort_by_score_subtab", id=1, elem_classes="subtab"): leaderboard = overall_leaderboard( get_model_leaderboard_df( model_result_path, benchmark_cols=[ # AutoEvalColumn.rank_overall.name, AutoEvalColumn.model.name, # AutoEvalColumn.license.name, # AutoEvalColumn.organization.name, # AutoEvalColumn.knowledge_cutoff.name, AutoEvalColumn.score_math_algebra.name, AutoEvalColumn.score_math_geometry.name, AutoEvalColumn.score_math_probability.name, AutoEvalColumn.score_reason_logical.name, AutoEvalColumn.score_reason_social.name, AutoEvalColumn.score_chemistry.name, AutoEvalColumn.score_biology.name, AutoEvalColumn.score_physics.name, AutoEvalColumn.score_overall.name, # AutoEvalColumn.score_cpp.name, # AutoEvalColumn.rank_overall.name, # AutoEvalColumn.rank_math_algebra.name, # AutoEvalColumn.rank_math_geometry.name, # AutoEvalColumn.rank_math_probability.name, # AutoEvalColumn.rank_reason_logical.name, # AutoEvalColumn.rank_reason_social.name, # AutoEvalColumn.rank_chemistry.name, # AutoEvalColumn.rank_cpp.name, ], rank_col=['sort_by_score', 1, 8], ) ) with gr.TabItem("๐Ÿ”ข Math", elem_id="math-tab-table", id=2): # DESCRIPTION_TEXT=""" # Algebra, Geometry, and Probability are the current three main math domains in the leaderboard. # To mitigate the potential impact of data contimination, we have carefully selected the datasets from various sources. # We prioritize **recent math datasets** and focus on **college and beyond level** math questions. # The current datasets include # [MATH](https://arxiv.org/abs/2103.03874), # [MATH-500](https://github.com/openai/prm800k/tree/main/prm800k/math_splits), # [Omni](https://omni-math.github.io/), # [MathQA](https://arxiv.org/abs/1905.13319), # [MathBench](https://arxiv.org/abs/2405.12209), # [SciBench](https://arxiv.org/abs/2307.10635), and more! # We plan to include more math domains, such as calculus, number theory, and more in the future. # """ # gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text") TEXT = ( f'

' 'Algebra, Geometry, and Probability are the current three main math domains in the leaderboard. ' 'To mitigate the potential impact of data contimination, we have carefully selected the datasets from various sources. ' 'We prioritize recent math datasets and focus on college and beyond level math questions. ' 'The current datasets include' 'MATH, ' 'MATH-500, ' 'Omni, ' 'MathQA, ' 'MathBench, ' 'SciBench, and more! ' '

' f'

' 'We plan to include more math domains, such as calculus, number theory, and more in the future. ' '

' # '

' # 'We present ' # '

' ) gr.HTML(TEXT) # leaderboard = init_leaderboard(LEADERBOARD_DF) with gr.TabItem("๐Ÿ† Overview", elem_id="math_overview_subtab", id=0, elem_classes="subtab"): with gr.TabItem("โญ Sort by Rank", elem_id="math_overview_sort_by_rank_subtab", id=0, elem_classes="subtab"): leaderboard = overall_leaderboard( get_model_leaderboard_df( model_result_path, benchmark_cols=[ AutoEvalColumn.model.name, # AutoEvalColumn.license.name, # AutoEvalColumn.organization.name, # AutoEvalColumn.knowledge_cutoff.name, # AutoEvalColumn.score_math_algebra.name, # AutoEvalColumn.score_math_geometry.name, # AutoEvalColumn.score_math_probability.name, AutoEvalColumn.rank_math_algebra.name, AutoEvalColumn.rank_math_geometry.name, AutoEvalColumn.rank_math_probability.name, ], rank_col=['sort_by_rank', 1, 4, 'Math'], ) ) with gr.TabItem("โญ Sort by Score", elem_id="math_overview_sort_by_score_subtab", id=1, elem_classes="subtab"): leaderboard = overall_leaderboard( get_model_leaderboard_df( model_result_path, benchmark_cols=[ AutoEvalColumn.model.name, # AutoEvalColumn.license.name, # AutoEvalColumn.organization.name, # AutoEvalColumn.knowledge_cutoff.name, AutoEvalColumn.score_math_algebra.name, AutoEvalColumn.score_math_geometry.name, AutoEvalColumn.score_math_probability.name, # AutoEvalColumn.rank_math_algebra.name, # AutoEvalColumn.rank_math_geometry.name, # AutoEvalColumn.rank_math_probability.name, ], rank_col=['sort_by_score', 1, 4, 'Math'], ) ) with gr.TabItem("๐Ÿงฎ Algebra", elem_id="algebra_subtab", id=1, elem_classes="subtab"): leaderboard = overall_leaderboard( get_model_leaderboard_df( model_result_path, benchmark_cols=[ AutoEvalColumn.rank_math_algebra.name, AutoEvalColumn.model.name, AutoEvalColumn.score_math_algebra.name, # AutoEvalColumn.sd_math_algebra.name, AutoEvalColumn.license.name, AutoEvalColumn.organization.name, AutoEvalColumn.knowledge_cutoff.name, ], rank_col=[AutoEvalColumn.rank_math_algebra.name], ) ) with gr.TabItem("๐Ÿ“ Geometry", elem_id="geometry_subtab", id=2, elem_classes="subtab"): leaderboard = overall_leaderboard( get_model_leaderboard_df( model_result_path, benchmark_cols=[ AutoEvalColumn.rank_math_geometry.name, AutoEvalColumn.model.name, AutoEvalColumn.score_math_geometry.name, # AutoEvalColumn.sd_math_geometry.name, AutoEvalColumn.license.name, AutoEvalColumn.organization.name, AutoEvalColumn.knowledge_cutoff.name, ], rank_col=[AutoEvalColumn.rank_math_geometry.name], ) ) with gr.TabItem("๐Ÿ“Š Probability", elem_id="prob_subtab", id=3, elem_classes="subtab"): leaderboard = overall_leaderboard( get_model_leaderboard_df( model_result_path, benchmark_cols=[ AutoEvalColumn.rank_math_probability.name, AutoEvalColumn.model.name, AutoEvalColumn.score_math_probability.name, # AutoEvalColumn.sd_math_probability.name, AutoEvalColumn.license.name, AutoEvalColumn.organization.name, AutoEvalColumn.knowledge_cutoff.name, ], rank_col=[AutoEvalColumn.rank_math_probability.name], ) ) # with gr.TabItem("Sort_by_rank", elem_id="math_sort_by_rank_subtab", id=4, elem_classes="subtab"): # leaderboard = overall_leaderboard( # get_model_leaderboard_df( # model_result_path, # benchmark_cols=[ # AutoEvalColumn.model.name, # AutoEvalColumn.rank_math_algebra.name, # AutoEvalColumn.rank_math_geometry.name, # AutoEvalColumn.rank_math_probability.name, # ], # rank_col=[], # ) # ) with gr.TabItem("๐Ÿง  Reasoning", elem_id="reasonong-tab-table", id=3): DESCRIPTION_TEXT = """ Reasoning is a broad domain for evaluating LLMs, but traditional tasks like commonsense reasoning have become less effective in differentiating modern LLMs. We now present two challenging types of reasoning: logical reasoning and social reasoning, both of which present more meaningful and sophisticated ways to assess LLM performance. For logical reasoning, we leverage datasets from sources such as [BIG-Bench Hard (BBH)](https://arxiv.org/abs/2210.09261), [FOLIO](https://arxiv.org/abs/2209.00840), [LogiQA2.0](https://github.com/csitfun/LogiQA2.0), [PrOntoQA](https://arxiv.org/abs/2210.01240), [ReClor](https://arxiv.org/abs/2002.04326), These cover a range of tasks including deductive reasoning, object counting and tracking, pattern recognition, temporal reasoning, first-order logic reaosning, etc. For social reasoning, we collect datasets from [MMToM-QA (Text-only)](https://arxiv.org/abs/2401.08743), [BigToM](https://arxiv.org/abs/2306.15448), [Adv-CSFB](https://arxiv.org/abs/2305.14763), [SocialIQA](https://arxiv.org/abs/1904.09728), [NormBank](https://arxiv.org/abs/2305.17008), covering challenging social reasoning tasks, such as social commonsense reasoning, social normative reasoning, Theory of Mind (ToM) reasoning, etc. More fine-grained types of reasoning, such as symbolic, analogical, counterfactual reasoning, are planned to be added in the future. """ gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text") with gr.TabItem("๐Ÿ† Overview", elem_id="reasoning_overview_subtab", id=0, elem_classes="subtab"): with gr.TabItem("โญ Sort by Rank", elem_id="reasoning_overview_sort_by_rank_subtab", id=0, elem_classes="subtab"): leaderboard = overall_leaderboard( get_model_leaderboard_df( model_result_path, benchmark_cols=[ AutoEvalColumn.model.name, # AutoEvalColumn.license.name, # AutoEvalColumn.organization.name, # AutoEvalColumn.knowledge_cutoff.name, AutoEvalColumn.rank_reason_logical.name, AutoEvalColumn.rank_reason_social.name, ], rank_col=['sort_by_rank', 1, 3, 'Reasoning'], ) ) with gr.TabItem("โญ Sort by Score", elem_id="reasoning_overview_sort_by_score_subtab", id=1, elem_classes="subtab"): leaderboard = overall_leaderboard( get_model_leaderboard_df( model_result_path, benchmark_cols=[ AutoEvalColumn.model.name, # AutoEvalColumn.license.name, # AutoEvalColumn.organization.name, # AutoEvalColumn.knowledge_cutoff.name, AutoEvalColumn.score_reason_logical.name, AutoEvalColumn.score_reason_social.name, ], rank_col=['sort_by_score', 1, 3, 'Reasoning'], ) ) with gr.TabItem("๐Ÿงฉ Logical", elem_id="logical_subtab", id=1, elem_classes="subtab"): leaderboard = overall_leaderboard( get_model_leaderboard_df( model_result_path, benchmark_cols=[ AutoEvalColumn.rank_reason_logical.name, AutoEvalColumn.model.name, AutoEvalColumn.score_reason_logical.name, # AutoEvalColumn.sd_reason_logical.name, AutoEvalColumn.license.name, AutoEvalColumn.organization.name, AutoEvalColumn.knowledge_cutoff.name, ], rank_col=[AutoEvalColumn.rank_reason_logical.name], ) ) with gr.TabItem("๐Ÿ—ฃ๏ธ Social", elem_id="social_subtab", id=2, elem_classes="subtab"): leaderboard = overall_leaderboard( get_model_leaderboard_df( model_result_path, benchmark_cols=[ AutoEvalColumn.rank_reason_social.name, AutoEvalColumn.model.name, AutoEvalColumn.score_reason_social.name, # AutoEvalColumn.sd_reason_social.name, AutoEvalColumn.license.name, AutoEvalColumn.organization.name, AutoEvalColumn.knowledge_cutoff.name, ], rank_col=[AutoEvalColumn.rank_reason_social.name], ) ) # with gr.TabItem("Sort_by_rank", elem_id="reasoning_sort_by_rank_subtab", id=3, elem_classes="subtab"): # leaderboard = overall_leaderboard( # get_model_leaderboard_df( # model_result_path, # benchmark_cols=[ # AutoEvalColumn.model.name, # AutoEvalColumn.rank_reason_logical.name, # AutoEvalColumn.rank_reason_social.name, # ], # rank_col=[], # ) # ) with gr.TabItem("๐Ÿ”ฌ Science", elem_id="science-table", id=4): CURRENT_TEXT = """ Scientific tasks are crucial for evaluating LLMs, requiring both domain-specific knowledge and reasoning capabilities. We are adding several fine-grained scientific domains to the leaderboard. The forthcoming ones are biology, chemistry, and physics. We have diversely and aggressively collected recent scientific datasets, including but not limited to [GPQA](https://arxiv.org/abs/2311.12022), [JEEBench](https://aclanthology.org/2023.emnlp-main.468/), [MMLU-Pro](https://arxiv.org/abs/2406.01574), [OlympiadBench](https://arxiv.org/abs/2402.14008), [SciBench](https://arxiv.org/abs/2307.10635), [SciEval](https://arxiv.org/abs/2308.13149). """ gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text") with gr.TabItem("๐Ÿ† Overview", elem_id="science_overview_subtab", id=0, elem_classes="subtab"): with gr.TabItem("โญ Sort by Rank", elem_id="science_overview_sort_by_rank_subtab", id=0, elem_classes="subtab"): leaderboard = overall_leaderboard( get_model_leaderboard_df( model_result_path, benchmark_cols=[ AutoEvalColumn.model.name, # AutoEvalColumn.license.name, # AutoEvalColumn.organization.name, # AutoEvalColumn.knowledge_cutoff.name, AutoEvalColumn.rank_chemistry.name, AutoEvalColumn.rank_biology.name, AutoEvalColumn.rank_physics.name, ], rank_col=['sort_by_rank', 1, 4, 'Science'], ) ) with gr.TabItem("โญ Sort by Score", elem_id="science_overview_sort_by_score_subtab", id=1, elem_classes="subtab"): leaderboard = overall_leaderboard( get_model_leaderboard_df( model_result_path, benchmark_cols=[ AutoEvalColumn.model.name, # AutoEvalColumn.license.name, # AutoEvalColumn.organization.name, # AutoEvalColumn.knowledge_cutoff.name, AutoEvalColumn.score_chemistry.name, AutoEvalColumn.score_biology.name, AutoEvalColumn.score_physics.name, ], rank_col=['sort_by_score', 1, 4, 'Science'], # two numbers are index to select the columns to average and sort ) ) with gr.TabItem("๐Ÿงช Chemistry", elem_id="chemistry_subtab", id=1, elem_classes="subtab"): leaderboard = overall_leaderboard( get_model_leaderboard_df( model_result_path, benchmark_cols=[ AutoEvalColumn.rank_chemistry.name, AutoEvalColumn.model.name, AutoEvalColumn.score_chemistry.name, # AutoEvalColumn.sd_reason_social.name, AutoEvalColumn.license.name, AutoEvalColumn.organization.name, AutoEvalColumn.knowledge_cutoff.name, ], rank_col=[AutoEvalColumn.rank_chemistry.name], ) ) with gr.TabItem("๐Ÿงฌ Biology", elem_id="biology_subtab", id=3, elem_classes="subtab"): # CURRENT_TEXT = """ # # Coming soon! # """ # gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text") leaderboard = overall_leaderboard( get_model_leaderboard_df( model_result_path, benchmark_cols=[ AutoEvalColumn.rank_biology.name, AutoEvalColumn.model.name, AutoEvalColumn.score_biology.name, # AutoEvalColumn.sd_reason_social.name, AutoEvalColumn.license.name, AutoEvalColumn.organization.name, AutoEvalColumn.knowledge_cutoff.name, ], rank_col=[AutoEvalColumn.rank_biology.name], ) ) with gr.TabItem("โš›๏ธ Physics", elem_id="physics_subtab", id=2, elem_classes="subtab"): # CURRENT_TEXT = """ # # Coming soon! # """ # gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text") leaderboard = overall_leaderboard( get_model_leaderboard_df( model_result_path, benchmark_cols=[ AutoEvalColumn.rank_physics.name, AutoEvalColumn.model.name, AutoEvalColumn.score_physics.name, # AutoEvalColumn.sd_reason_social.name, AutoEvalColumn.license.name, AutoEvalColumn.organization.name, AutoEvalColumn.knowledge_cutoff.name, ], rank_col=[AutoEvalColumn.rank_physics.name], ) ) with gr.TabItem(" Coding", elem_id="coding-table", id=5): CURRENT_TEXT = """ We are working on adding more fine-grained tasks in coding domains to the leaderboard. The forthcoming ones focus on Python, Java, and C++, with plans to expand to more languages. We collect a variety of recent coding datasets, including [HumanEval](https://huggingface.co/datasets/openai/openai_humaneval), [MBPP](https://huggingface.co/datasets/google-research-datasets/mbpp), [HumanEvalFix](https://huggingface.co/datasets/bigcode/humanevalpack), [newly crawled LeetCode data](https://leetcode.com/problemset/), filtered code-related queries from [Arena-Hard-Auto](https://github.com/lmarena/arena-hard-auto) and more! Our efforts also include synthesizing new code-related queries to ensure diversity! """ gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text") with gr.TabItem("โž• C++", elem_id="cpp_subtab", id=0, elem_classes="subtab"): leaderboard = overall_leaderboard( get_model_leaderboard_df( model_result_path, benchmark_cols=[ AutoEvalColumn.rank_cpp.name, AutoEvalColumn.model.name, AutoEvalColumn.score_cpp.name, # AutoEvalColumn.sd_cpp.name, AutoEvalColumn.license.name, AutoEvalColumn.organization.name, AutoEvalColumn.knowledge_cutoff.name, ], rank_col=[AutoEvalColumn.rank_cpp.name], ) ) with gr.TabItem("๐Ÿ Python", elem_id="python_subtab", id=1, elem_classes="subtab"): CURRENT_TEXT = """ # Coming soon! """ gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text") with gr.TabItem("โ˜• Java", elem_id="java_subtab", id=2, elem_classes="subtab"): CURRENT_TEXT = """ # Coming soon! """ gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text") with gr.TabItem("๐ŸŽฏ Mixed", elem_id="llm-benchmark-tab-table", id=1): DESCRIPTION_TEXT = """ Overall dimension measures the comprehensive performance of LLMs across diverse tasks. We start with diverse questions from the widely-used [MT-Bench](https://arxiv.org/abs/2306.05685), coving a wide range of domains, including writing, roleplay, extraction, reasoning, math, coding, knowledge I (STEM), and knowledge II (humanities/social science). """ gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text") with gr.TabItem("MT-Bench", elem_id="mt-bench_subtab", id=0, elem_classes="subtab"): leaderboard = overall_leaderboard( get_model_leaderboard_df( model_result_path, benchmark_cols=[ AutoEvalColumn.rank_overall.name, AutoEvalColumn.model.name, AutoEvalColumn.score_overall.name, # AutoEvalColumn.sd_overall.name, AutoEvalColumn.license.name, AutoEvalColumn.organization.name, AutoEvalColumn.knowledge_cutoff.name, ], rank_col=[AutoEvalColumn.rank_overall.name], )) with gr.TabItem("๐Ÿ“ About", elem_id="llm-benchmark-tab-table", id=6): ABOUT_TEXT = """ # About Us [Decentralized Arena](https://de-arena.maitrix.org/) is an open-source project that automates and scales the evaluation of large language models (LLMs) across various fine-grained dimensions, developed by reseachers from UCSD, CMU, MBZUAI, [Maitrix.org](https://maitrix.org/) and [LLM360](https://www.llm360.ai/). Stay tuned for more updates and new features! ## Team members Yanbin Yin, [Zhen Wang](https://zhenwang9102.github.io/), [Kun Zhou](https://lancelot39.github.io/), Xiangdong Zhang, [Shibo Hao](https://ber666.github.io/), [Yi Gu](https://www.yigu.page/), [Jieyuan Liu](https://www.linkedin.com/in/jieyuan-liu/), [Somanshu Singla](https://www.linkedin.com/in/somanshu-singla-105636214/), [Tianyang Liu](https://leolty.github.io/), [Eric P. Xing](https://www.cs.cmu.edu/~epxing/), [Zhengzhong Liu](https://hunterhector.github.io/), [Haojian Jin](https://www.haojianj.in/), [Zhiting Hu](https://zhiting.ucsd.edu/) ## Contact Us - Follow us on X, [Maitrix.org](https://twitter.com/MaitrixOrg) and [LLM360](https://twitter.com/llm360) - Email us at [Zhen Wang](mailto:zhenwang9102@gmail.com), [Kun Zhou](mailto:franciskunzhou@gmail.com) and [Zhiting Hu](mailto:zhitinghu@gmail.com) """ gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text") ''' with gr.TabItem("๐Ÿš€ Submit here! ", elem_id="llm-benchmark-tab-table", id=3): with gr.Column(): with gr.Row(): gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text") with gr.Column(): with gr.Accordion( f"โœ… Finished Evaluations ({len(finished_eval_queue_df)})", open=False, ): with gr.Row(): finished_eval_table = gr.components.Dataframe( value=finished_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5, ) with gr.Accordion( f"๐Ÿ”„ Running Evaluation Queue ({len(running_eval_queue_df)})", open=False, ): with gr.Row(): running_eval_table = gr.components.Dataframe( value=running_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5, ) with gr.Accordion( f"โณ Pending Evaluation Queue ({len(pending_eval_queue_df)})", open=False, ): with gr.Row(): pending_eval_table = gr.components.Dataframe( value=pending_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5, ) with gr.Row(): gr.Markdown("# โœ‰๏ธโœจ Submit your model here!", elem_classes="markdown-text") with gr.Row(): with gr.Column(): model_name_textbox = gr.Textbox(label="Model name") revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main") model_type = gr.Dropdown( choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown], label="Model type", multiselect=False, value=None, interactive=True, ) with gr.Column(): precision = gr.Dropdown( choices=[i.value.name for i in Precision if i != Precision.Unknown], label="Precision", multiselect=False, value="float16", interactive=True, ) weight_type = gr.Dropdown( choices=[i.value.name for i in WeightType], label="Weights type", multiselect=False, value="Original", interactive=True, ) base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)") submit_button = gr.Button("Submit Eval") submission_result = gr.Markdown() submit_button.click( add_new_eval, [ model_name_textbox, base_model_name_textbox, revision_name_textbox, precision, weight_type, model_type, ], submission_result, ) ''' with gr.Row(): with gr.Accordion("๐Ÿ“™ Citation", open=False): citation_button = gr.Textbox( value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True, ) scheduler = BackgroundScheduler() scheduler.add_job(restart_space, "interval", seconds=1800) scheduler.start() demo.queue(default_concurrency_limit=40).launch()