de-arena / app.py
yzabc007's picture
Update space
04e5831
raw
history blame
27.1 kB
import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns, SearchColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
SUB_TITLE,
EXTERNAL_LINKS,
COMING_SOON_TEXT
)
from src.display.css_html_js import custom_css
from src.display.utils import (
BENCHMARK_COLS,
COLS,
EVAL_COLS,
EVAL_TYPES,
AutoEvalColumn,
ModelType,
fields,
WeightType,
Precision
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_model_leaderboard_df
from src.submission.submit import add_new_eval
def restart_space():
    """Ask the hub client to restart the Space identified by ``REPO_ID``.

    Used both as the fallback when the initial dataset download fails and as
    the periodic job registered with the background scheduler.
    """
    # API comes from src.envs; presumably a huggingface_hub HfApi instance -- TODO confirm.
    API.restart_space(repo_id=REPO_ID)
### Space initialisation
def _sync_dataset(repo_id, local_dir):
    """Download a dataset repo snapshot into ``local_dir``.

    On any failure the error is printed (so it shows up in the Space logs)
    and a restart of the Space is requested, matching the original
    best-effort behaviour; the exception is not re-raised.

    Args:
        repo_id: Identifier of the dataset repository to download.
        local_dir: Local directory the snapshot is written to.
    """
    print(local_dir)
    try:
        snapshot_download(
            repo_id=repo_id,
            local_dir=local_dir,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except Exception as err:
        # Report the cause before restarting instead of swallowing it silently.
        print(f"Could not download dataset '{repo_id}': {err!r}. Restarting space.")
        restart_space()


# Same order as before: evaluation requests first, then evaluation results.
_sync_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
_sync_dataset(RESULTS_REPO, EVAL_RESULTS_PATH)

LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)

(
    finished_eval_queue_df,
    running_eval_queue_df,
    pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
def init_leaderboard(dataframe):
    """Wrap ``dataframe`` in a read-only ``Leaderboard`` component.

    Column selection and column filters are disabled (both are passed as
    ``None``); the only interactive control is a search box on the model
    name column.

    Args:
        dataframe: pandas DataFrame with the rows to display.

    Returns:
        The configured ``Leaderboard`` component.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or has no rows.
    """
    has_rows = dataframe is not None and not dataframe.empty
    if not has_rows:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Built in the same order the keyword arguments were evaluated originally.
    column_types = [column.type for column in fields(AutoEvalColumn)]
    model_search = SearchColumns(
        primary_column=AutoEvalColumn.model.name,
        secondary_columns=[],
        placeholder="Search by the model name",
        label="Searching",
    )
    hidden_columns = [column.name for column in fields(AutoEvalColumn) if column.hidden]

    return Leaderboard(
        value=dataframe,
        datatype=column_types,
        select_columns=None,
        search_columns=model_search,
        hide_columns=hidden_columns,
        filter_columns=None,
        interactive=False,
    )
# Results file handed to get_model_leaderboard_df() by every leaderboard tab
# built below. The commented-out entries are other timestamped result files
# (presumably earlier snapshots, kept for reference -- TODO confirm).
# model_result_path = "./src/results/models_2024-10-07-14:50:12.666068.jsonl"
# model_result_path = "./src/results/models_2024-10-08-03:10:26.811832.jsonl"
# model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
# model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
# model_result_path = "./src/results/models_2024-10-09-05:17:38.810960.json"
# model_result_path = "./src/results/models_2024-10-09-06:22:21.122422.json"
# model_result_path = "./src/results/models_2024-10-10-06:18:54.263527.json"
model_result_path = "./src/results/models_2024-10-18-14:06:13.588399.json"
# model_leaderboard_df = get_model_leaderboard_df(model_result_path)
def overall_leaderboard(dataframe):
    """Create the non-interactive ``Leaderboard`` table used by every tab.

    Args:
        dataframe: pandas DataFrame holding the rows to render.

    Returns:
        A ``Leaderboard`` component with a model-name search box and with
        column selection and filtering turned off.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    if dataframe is None:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    if dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Dict insertion order mirrors the original keyword-argument order.
    settings = {
        "value": dataframe,
        "datatype": [col.type for col in fields(AutoEvalColumn)],
        "select_columns": None,
        "search_columns": SearchColumns(
            primary_column=AutoEvalColumn.model.name,
            secondary_columns=[],
            placeholder="Search by the model name",
            label="Searching",
        ),
        "hide_columns": [col.name for col in fields(AutoEvalColumn) if col.hidden],
        "filter_columns": None,
        "interactive": False,
    }
    return Leaderboard(**settings)
def _dimension_leaderboard(rank_col, score_col, sd_col=None):
    """Render the leaderboard table for a single evaluation dimension.

    Every per-dimension tab shows the same layout: rank, model, score,
    optionally the ``sd_*`` column, then license, organization and knowledge
    cutoff. The rows are loaded from ``model_result_path``.

    Args:
        rank_col: ``AutoEvalColumn`` field with the dimension's rank. It is the
            first displayed column and is also passed to the loader as
            ``rank_col``.
        score_col: ``AutoEvalColumn`` field with the dimension's score.
        sd_col: Optional ``AutoEvalColumn`` ``sd_*`` field (presumably the
            score's standard deviation); left out of the table when ``None``.

    Returns:
        The ``Leaderboard`` component produced by ``overall_leaderboard``.
    """
    benchmark_cols = [
        rank_col.name,
        AutoEvalColumn.model.name,
        score_col.name,
    ]
    if sd_col is not None:
        benchmark_cols.append(sd_col.name)
    benchmark_cols += [
        AutoEvalColumn.license.name,
        AutoEvalColumn.organization.name,
        AutoEvalColumn.knowledge_cutoff.name,
    ]
    return overall_leaderboard(
        get_model_leaderboard_df(
            model_result_path,
            benchmark_cols=benchmark_cols,
            rank_col=[rank_col.name],
        )
    )


# Page layout: header HTML, introduction, one tab per evaluation domain (with
# sub-tabs per dimension), an About tab, and a collapsible citation box.
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.HTML(SUB_TITLE)
    gr.HTML(EXTERNAL_LINKS)

    # NOTE: this rebinding shadows the INTRODUCTION_TEXT imported from
    # src.about; the introduction shown on the page is the HTML built here.
    INTRODUCTION_TEXT_FONT_SIZE = 16
    INTRODUCTION_TEXT = (
        f'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
        '<strong>Decentralized Arena</strong> automates, scales, and accelerates "<a href="https://lmarena.ai/">Chatbot Arena</a>" '
        'for large language model (LLM) evaluation across diverse, fine-grained dimensions, '
        'such as mathematics (algebra, geometry, probability), logical reasoning, social reasoning, science (chemistry, physics, biology), or any user-defined dimensions. '
        'The evaluation is decentralized and democratic, with all participating LLMs assessing each other to ensure unbiased and fair results. '
        'With a 95% correlation to Chatbot Arena\'s overall rankings, the system is fully transparent and reproducible.'
        '</p>'
        f'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
        'We actively invite <b>model developers</b> to participate and expedite their benchmarking efforts '
        'and encourage <b>data stakeholders</b> to freely define and evaluate dimensions of interest for their own objectives.'
        '</p>'
    )
    gr.HTML(INTRODUCTION_TEXT)

    # NOTE(review): several tabs share elem_id="llm-benchmark-tab-table" and the
    # Reasoning tab's id is spelled "reasonong-tab-table". Both are left as-is
    # because custom_css may target these ids -- confirm before renaming.
    with gr.Tabs(elem_classes="tab-buttons") as tabs:

        with gr.TabItem("🏅 Overview", elem_id="llm-benchmark-tab-table", id=0):
            DESCRIPTION_TEXT = """
            Total #models: 53 (Last updated: 2024-10-09)
            This page provides a comprehensive overview of model ranks across various dimensions, based on their averaged ranks.
            (Missing values are due to the slow or problematic model responses to be fixed soon.)
            """
            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")

            # The overview mixes rank columns from several dimensions and
            # passes no rank column to the loader, so it is built explicitly.
            leaderboard = overall_leaderboard(
                get_model_leaderboard_df(
                    model_result_path,
                    benchmark_cols=[
                        AutoEvalColumn.model.name,
                        AutoEvalColumn.rank_overall.name,
                        # AutoEvalColumn.rank_math_algebra.name,
                        # AutoEvalColumn.rank_math_geometry.name,
                        AutoEvalColumn.rank_math_probability.name,
                        AutoEvalColumn.rank_reason_logical.name,
                        # AutoEvalColumn.rank_reason_social.name,
                        AutoEvalColumn.rank_chemistry.name,
                        # AutoEvalColumn.rank_cpp.name,
                    ],
                    rank_col=[],
                )
            )

        with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
            DESCRIPTION_TEXT = """
            Overall dimension measures the comprehensive performance of LLMs across diverse tasks.
            We start with diverse questions from the widely-used [MT-Bench](https://arxiv.org/abs/2306.05685),
            covering a wide range of domains, including writing, roleplay, extraction, reasoning, math, coding, knowledge I (STEM), and knowledge II (humanities/social science).
            """
            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")

            leaderboard = _dimension_leaderboard(
                AutoEvalColumn.rank_overall,
                AutoEvalColumn.score_overall,
                AutoEvalColumn.sd_overall,
            )

        with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
            DESCRIPTION_TEXT = """
            Algebra, Geometry, and Probability are the current three main math domains in the leaderboard.
            To mitigate the potential impact of data contamination, we have carefully selected the datasets from various sources.
            We prioritize **recent math datasets** and focus on **college and beyond level** math questions.
            The current datasets include
            [MATH](https://arxiv.org/abs/2103.03874),
            [MATH-500](https://github.com/openai/prm800k/tree/main/prm800k/math_splits),
            [Omni](https://omni-math.github.io/),
            [MathQA](https://arxiv.org/abs/1905.13319),
            [MathBench](https://arxiv.org/abs/2405.12209),
            [SciBench](https://arxiv.org/abs/2307.10635), and more!
            We plan to include more math domains, such as calculus, number theory, and more in the future.
            """
            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")

            with gr.TabItem("🧮 Algebra", elem_id="algebra_subtab", id=0, elem_classes="subtab"):
                leaderboard = _dimension_leaderboard(
                    AutoEvalColumn.rank_math_algebra,
                    AutoEvalColumn.score_math_algebra,
                    AutoEvalColumn.sd_math_algebra,
                )

            with gr.TabItem("📐 Geometry", elem_id="geometry_subtab", id=1, elem_classes="subtab"):
                leaderboard = _dimension_leaderboard(
                    AutoEvalColumn.rank_math_geometry,
                    AutoEvalColumn.score_math_geometry,
                    AutoEvalColumn.sd_math_geometry,
                )

            with gr.TabItem("📊 Probability", elem_id="prob_subtab", id=2, elem_classes="subtab"):
                # The sd column is not displayed for this dimension.
                leaderboard = _dimension_leaderboard(
                    AutoEvalColumn.rank_math_probability,
                    AutoEvalColumn.score_math_probability,
                )

        with gr.TabItem("🧠 Reasoning", elem_id="reasonong-tab-table", id=3):
            DESCRIPTION_TEXT = """
            Reasoning is a broad domain for evaluating LLMs, but traditional tasks like commonsense reasoning have become less effective in differentiating modern LLMs.
            We now present two challenging types of reasoning: logical reasoning and social reasoning, both of which present more meaningful and sophisticated ways to assess LLM performance.
            For logical reasoning, we leverage datasets from sources such as
            [BIG-Bench Hard (BBH)](https://arxiv.org/abs/2210.09261),
            [FOLIO](https://arxiv.org/abs/2209.00840),
            [LogiQA2.0](https://github.com/csitfun/LogiQA2.0),
            [PrOntoQA](https://arxiv.org/abs/2210.01240),
            [ReClor](https://arxiv.org/abs/2002.04326).
            These cover a range of tasks including deductive reasoning, object counting and tracking, pattern recognition,
            temporal reasoning, first-order logic reasoning, etc.
            For social reasoning, we collect datasets from
            [MMToM-QA (Text-only)](https://arxiv.org/abs/2401.08743),
            [BigToM](https://arxiv.org/abs/2306.15448),
            [Adv-CSFB](https://arxiv.org/abs/2305.14763),
            [SocialIQA](https://arxiv.org/abs/1904.09728),
            [NormBank](https://arxiv.org/abs/2305.17008), covering challenging social reasoning tasks,
            such as social commonsense reasoning, social normative reasoning, Theory of Mind (ToM) reasoning, etc.
            More fine-grained types of reasoning, such as symbolic, analogical, counterfactual reasoning, are planned to be added in the future.
            """
            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")

            with gr.TabItem("🧩 Logical", elem_id="logical_subtab", id=0, elem_classes="subtab"):
                leaderboard = _dimension_leaderboard(
                    AutoEvalColumn.rank_reason_logical,
                    AutoEvalColumn.score_reason_logical,
                    AutoEvalColumn.sd_reason_logical,
                )

            with gr.TabItem("🗣️ Social", elem_id="social_subtab", id=1, elem_classes="subtab"):
                leaderboard = _dimension_leaderboard(
                    AutoEvalColumn.rank_reason_social,
                    AutoEvalColumn.score_reason_social,
                    AutoEvalColumn.sd_reason_social,
                )

        with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
            CURRENT_TEXT = """
            Scientific tasks are crucial for evaluating LLMs, requiring both domain-specific knowledge and reasoning capabilities.
            We are adding several fine-grained scientific domains to the leaderboard. The forthcoming ones are biology, chemistry, and physics.
            We have diversely and aggressively collected recent scientific datasets, including but not limited to
            [GPQA](https://arxiv.org/abs/2311.12022),
            [JEEBench](https://aclanthology.org/2023.emnlp-main.468/),
            [MMLU-Pro](https://arxiv.org/abs/2406.01574),
            [OlympiadBench](https://arxiv.org/abs/2402.14008),
            [SciBench](https://arxiv.org/abs/2307.10635),
            [SciEval](https://arxiv.org/abs/2308.13149).
            """
            gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")

            with gr.TabItem("🧪 Chemistry", elem_id="chemistry_subtab", id=0, elem_classes="subtab"):
                # The sd column is not displayed for this dimension.
                leaderboard = _dimension_leaderboard(
                    AutoEvalColumn.rank_chemistry,
                    AutoEvalColumn.score_chemistry,
                )

            with gr.TabItem("⚛️ Physics", elem_id="physics_subtab", id=1, elem_classes="subtab"):
                CURRENT_TEXT = """
                # Coming soon!
                """
                gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")

            with gr.TabItem("🧬 Biology", elem_id="biology_subtab", id=2, elem_classes="subtab"):
                CURRENT_TEXT = """
                # Coming soon!
                """
                gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")

        with gr.TabItem("</> Coding", elem_id="coding-table", id=5):
            CURRENT_TEXT = """
            We are working on adding more fine-grained tasks in coding domains to the leaderboard.
            The forthcoming ones focus on Python, Java, and C++, with plans to expand to more languages.
            We collect a variety of recent coding datasets, including
            [HumanEval](https://huggingface.co/datasets/openai/openai_humaneval),
            [MBPP](https://huggingface.co/datasets/google-research-datasets/mbpp),
            [HumanEvalFix](https://huggingface.co/datasets/bigcode/humanevalpack),
            [newly crawled LeetCode data](https://leetcode.com/problemset/),
            filtered code-related queries from [Arena-Hard-Auto](https://github.com/lmarena/arena-hard-auto) and more!
            Our efforts also include synthesizing new code-related queries to ensure diversity!
            """
            gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")

            with gr.TabItem("➕ C++", elem_id="cpp_subtab", id=0, elem_classes="subtab"):
                # The sd column is not displayed for this dimension.
                leaderboard = _dimension_leaderboard(
                    AutoEvalColumn.rank_cpp,
                    AutoEvalColumn.score_cpp,
                )

            with gr.TabItem("🐍 Python", elem_id="python_subtab", id=1, elem_classes="subtab"):
                CURRENT_TEXT = """
                # Coming soon!
                """
                gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")

            with gr.TabItem("☕ Java", elem_id="java_subtab", id=2, elem_classes="subtab"):
                CURRENT_TEXT = """
                # Coming soon!
                """
                gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")

        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=6):
            ABOUT_TEXT = """
            # About Us
            [Decentralized Arena](https://de-arena.maitrix.org/) is an open-source project that automates and scales the evaluation of large language models (LLMs) across various fine-grained dimensions,
            developed by researchers from UCSD, CMU, MBZUAI, [Maitrix.org](https://maitrix.org/) and [LLM360](https://www.llm360.ai/).
            Stay tuned for more updates and new features!
            ## Team members
            Yanbin Yin, [Zhen Wang](https://zhenwang9102.github.io/), [Kun Zhou](https://lancelot39.github.io/), Xiangdong Zhang,
            [Shibo Hao](https://ber666.github.io/), [Yi Gu](https://www.yigu.page/), [Jieyuan Liu](https://www.linkedin.com/in/jieyuan-liu/), [Somanshu Singla](https://www.linkedin.com/in/somanshu-singla-105636214/), [Tianyang Liu](https://leolty.github.io/),
            [Eric P. Xing](https://www.cs.cmu.edu/~epxing/), [Zhengzhong Liu](https://hunterhector.github.io/), [Haojian Jin](https://www.haojianj.in/),
            [Zhiting Hu](https://zhiting.ucsd.edu/)
            ## Contact Us
            - Follow us on X, [Maitrix.org](https://twitter.com/MaitrixOrg) and [LLM360](https://twitter.com/llm360)
            - Email us at [Zhen Wang](mailto:zhenwang9102@gmail.com), [Kun Zhou](mailto:franciskunzhou@gmail.com) and [Zhiting Hu](mailto:zhitinghu@gmail.com)
            """
            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")

        # The submission tab is disabled: it is kept as an unused string
        # literal so it can be restored later. Nothing below runs.
        '''
        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
                with gr.Column():
                    with gr.Accordion(
                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            finished_eval_table = gr.components.Dataframe(
                                value=finished_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
                    with gr.Accordion(
                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            running_eval_table = gr.components.Dataframe(
                                value=running_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
                    with gr.Accordion(
                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            pending_eval_table = gr.components.Dataframe(
                                value=pending_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
            with gr.Row():
                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                    model_type = gr.Dropdown(
                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                        label="Model type",
                        multiselect=False,
                        value=None,
                        interactive=True,
                    )
                with gr.Column():
                    precision = gr.Dropdown(
                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
                        label="Precision",
                        multiselect=False,
                        value="float16",
                        interactive=True,
                    )
                    weight_type = gr.Dropdown(
                        choices=[i.value.name for i in WeightType],
                        label="Weights type",
                        multiselect=False,
                        value="Original",
                        interactive=True,
                    )
                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
            submit_button = gr.Button("Submit Eval")
            submission_result = gr.Markdown()
            submit_button.click(
                add_new_eval,
                [
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    precision,
                    weight_type,
                    model_type,
                ],
                submission_result,
            )
        '''

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )
# Restart the Space every 1800 seconds from a background thread.
scheduler = BackgroundScheduler()
scheduler.add_job(func=restart_space, trigger="interval", seconds=1800)
scheduler.start()

# Enable the request queue, then serve the UI.
queued_demo = demo.queue(default_concurrency_limit=40)
queued_demo.launch()