|
""" |
|
Live monitor of the website statistics and leaderboard. |
|
|
|
Dependency: |
|
sudo apt install pkg-config libicu-dev |
|
pip install pytz gradio gdown plotly polyglot pyicu pycld2 tabulate |
|
""" |
|
|
|
import argparse |
|
import ast |
|
import pickle |
|
import os |
|
import threading |
|
import time |
|
|
|
import gradio as gr |
|
import numpy as np |
|
import pandas as pd |
|
|
|
|
|
# Shared placeholder lists filled by the tab builders below and presumably
# refreshed by a live updater outside this file — TODO confirm.
# basic_component_values: [md0, plot_1, md1, md2, md3, md4] (see build_basic_stats_tab)
basic_component_values = [None] * 6
# leader_component_values: [md, p1, p2, p3, p4] (see build_leaderboard_tab)
leader_component_values = [None] * 5
|
|
|
|
|
def make_leaderboard_md(elo_results):
    """Return the static leaderboard header markdown (title + project links).

    ``elo_results`` is accepted for signature parity with
    ``make_leaderboard_md_live`` but is not used here.
    """
    # Plain literal: the original used an f-string with no placeholders.
    return """
# π GenAI-Arena Leaderboard
| [Code](https://huggingface.co/spaces/TIGER-Lab/GenAI-Arena/tree/main) | [Dataset](https://huggingface.co/datasets/TIGER-Lab/GenAI-Bench) | [Twitter](https://twitter.com/TianleLI123/status/1757245259149422752) |

"""
|
|
|
|
|
def make_leaderboard_md_live(elo_results):
    """Render the live leaderboard markdown: title, timestamp, and table.

    ``elo_results`` must provide ``last_updated_datetime`` and
    ``leaderboard_table`` entries.
    """
    updated = elo_results["last_updated_datetime"]
    table = elo_results["leaderboard_table"]
    return f"""
# Leaderboard
Last updated: {updated}
{table}
"""
|
|
|
|
|
def model_hyperlink(model_name, link):
    """Wrap *model_name* in a dotted-underline anchor pointing at *link*."""
    anchor_style = "color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"
    return f'<a target="_blank" href="{link}" style="{anchor_style}">{model_name}</a>'
|
|
|
|
|
def load_leaderboard_table_csv(filename, add_hyperlink=True):
    """Load the per-model metadata CSV into a DataFrame.

    Columns containing "Arena Elo rating" are coerced to ints ("-" becomes
    NaN). When *add_hyperlink* is true, the "Model" column is rewritten as a
    markdown anchor using the "Link" column.
    """
    df = pd.read_csv(filename)

    def _parse_rating(value):
        # "-" marks a model without a rating.
        return np.nan if value == "-" else int(value)

    for column in df.columns:
        if "Arena Elo rating" in column:
            df[column] = df[column].apply(_parse_rating)
        if add_hyperlink and column == "Model":
            df[column] = df.apply(
                lambda r: model_hyperlink(r[column], r["Link"]), axis=1
            )
    return df
|
|
|
|
|
|
|
def build_basic_stats_tab():
    """Lay out the basic-stats tab and return its components.

    Returns [md0, plot_1, md1, md2, md3, md4]; the same ordering is seeded
    into the module-level ``basic_component_values`` (slot 1 is None because
    it holds the plot, not markdown). Presumably a live updater elsewhere
    pushes fresh values into these components — TODO confirm.
    """
    empty = "Loading ..."
    # Reset the shared placeholder list in place so existing references see it.
    basic_component_values[:] = [empty, None, empty, empty, empty, empty]

    md0 = gr.Markdown(empty)
    gr.Markdown("#### Figure 1: Number of model calls and votes")
    plot_1 = gr.Plot(show_label=False)
    # Two rows of two markdown panels (2x2 grid of stats).
    with gr.Row():
        with gr.Column():
            md1 = gr.Markdown(empty)
        with gr.Column():
            md2 = gr.Markdown(empty)
    with gr.Row():
        with gr.Column():
            md3 = gr.Markdown(empty)
        with gr.Column():
            md4 = gr.Markdown(empty)
    return [md0, plot_1, md1, md2, md3, md4]
|
|
|
|
|
def get_full_table(anony_arena_df, full_arena_df, model_table_df):
    """Build the rows of the full leaderboard table.

    Args:
        anony_arena_df: per-model Elo stats from anonymous battles, indexed by
            model key, with "rating", "rating_q975", "rating_q025" columns.
        full_arena_df: same schema plus "num_battles", from all battles.
        model_table_df: model metadata with "key", "Model", "Organization",
            "License" columns.

    Returns:
        One row per model:
        [rank, model, anony Elo, anony 95% CI, full Elo, full 95% CI,
         votes, organization, license], sorted by anony Elo descending with
        unrated models last.
    """

    def _elo_entries(arena_df, model_key, include_battles=False):
        # Rating + "+upper/-lower" CI for one model, or NaN/"N/A" placeholders
        # when the model has no entry in this arena.
        if model_key in arena_df.index:
            idx = arena_df.index.get_loc(model_key)
            rating = arena_df.iloc[idx]["rating"]
            upper_diff = round(arena_df.iloc[idx]["rating_q975"] - rating)
            lower_diff = round(rating - arena_df.iloc[idx]["rating_q025"])
            entries = [round(rating), f"+{upper_diff}/-{lower_diff}"]
            if include_battles:
                entries.append(round(arena_df.iloc[idx]["num_battles"]))
        else:
            entries = [np.nan, "N/A"]
            if include_battles:
                entries.append(np.nan)
        return entries

    values = []
    for i in range(len(model_table_df)):
        model_key = model_table_df.iloc[i]["key"]
        row = [model_table_df.iloc[i]["Model"]]
        row += _elo_entries(anony_arena_df, model_key)
        row += _elo_entries(full_arena_df, model_key, include_battles=True)
        row.append(model_table_df.iloc[i]["Organization"])
        row.append(model_table_df.iloc[i]["License"])
        values.append(row)

    # Sort by anony Elo descending; models without a rating sink to the bottom.
    values.sort(key=lambda x: -x[1] if not np.isnan(x[1]) else 1e9)

    # Prepend 1-based rank after sorting.
    for rank, row in enumerate(values, start=1):
        row.insert(0, rank)
    return values
|
|
|
|
|
def get_arena_table(arena_df, model_table_df):
    """Build the rows of the anonymous-arena leaderboard table.

    Args:
        arena_df: per-model Elo stats indexed by model key, with "rating",
            "rating_q975", "rating_q025", "num_battles" columns.
        model_table_df: model metadata with "key", "Model", "Organization",
            "License" columns.

    Returns:
        One row per model, sorted by rating descending:
        [rank, model, Elo, 95% CI, votes, organization, license].
    """
    arena_df = arena_df.sort_values(by=["rating"], ascending=False)
    values = []
    for i in range(len(arena_df)):
        model_key = arena_df.index[i]
        # One metadata lookup per model (was three boolean-mask scans).
        model_row = model_table_df[model_table_df["key"] == model_key]

        rating = arena_df.iloc[i]["rating"]
        upper_diff = round(arena_df.iloc[i]["rating_q975"] - rating)
        lower_diff = round(rating - arena_df.iloc[i]["rating_q025"])

        # Debug print of each arena row removed.
        values.append([
            i + 1,  # rank (arena_df already sorted by rating)
            model_row["Model"].values[0],
            round(rating),
            f"+{upper_diff}/-{lower_diff}",
            round(arena_df.iloc[i]["num_battles"]),
            model_row["Organization"].values[0],
            model_row["License"].values[0],
        ])
    return values
|
|
|
def make_arena_leaderboard_md(elo_results):
    """Markdown blurb above the anonymous-arena table (counts + timestamp)."""
    table_df = elo_results["leaderboard_table_df"]
    updated = elo_results["last_updated_datetime"]
    model_count = len(table_df)
    # Each battle contributes two num_battles entries, hence the halving.
    vote_count = sum(table_df["num_battles"]) // 2

    return f"""


Total #models: **{model_count}**(anonymous). Total #votes: **{vote_count}**. Last updated: {updated}.
(Note: Only anonymous votes are considered here. Check the full leaderboard for all votes.)

Contribute the votes π³οΈ at [GenAI-Arena](https://huggingface.co/spaces/TIGER-Lab/GenAI-Arena)!

If you want to see more models, please help us [add them](https://huggingface.co/spaces/TIGER-Lab/GenAI-Arena/tree/main?tab=readme-ov-file#-contributing-).
"""
|
|
|
def make_full_leaderboard_md(elo_results):
    """Markdown blurb above the full leaderboard table (counts + timestamp)."""
    table_df = elo_results["leaderboard_table_df"]
    updated = elo_results["last_updated_datetime"]
    model_count = len(table_df)
    # Each battle contributes two num_battles entries, hence the halving.
    vote_count = sum(table_df["num_battles"]) // 2

    return f"""
Total #models: **{model_count}**(full:anonymous+open). Total #votes: **{vote_count}**. Last updated: {updated}.

Contribute your vote π³οΈ at [vision-arena](https://huggingface.co/spaces/WildVision/vision-arena)!
"""
|
|
|
def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=True):
    """Build the leaderboard tab UI.

    Args:
        elo_results_file: path to a pickle holding {"anony": ..., "full": ...}
            elo-result dicts, or None to render a "Loading ..." placeholder.
        leaderboard_table_file: path to the per-model metadata CSV (key,
            Model, Organization, License, Link); falsy to skip the tables.
        show_plot: when True, render the four anony-arena figures.

    Returns:
        [md_1]: the top markdown component, kept for live refreshing via
        ``leader_component_values``.
    """
    if elo_results_file is None:
        md = "Loading ..."
        p1 = p2 = p3 = p4 = None
    else:
        with open(elo_results_file, "rb") as fin:
            elo_results = pickle.load(fin)

        anony_elo_results = elo_results["anony"]
        full_elo_results = elo_results["full"]
        anony_arena_df = anony_elo_results["leaderboard_table_df"]
        full_arena_df = full_elo_results["leaderboard_table_df"]
        p1 = anony_elo_results["win_fraction_heatmap"]
        p2 = anony_elo_results["battle_count_heatmap"]
        p3 = anony_elo_results["bootstrap_elo_rating"]
        p4 = anony_elo_results["average_win_rate_bar"]

        md = make_leaderboard_md(anony_elo_results)

    md_1 = gr.Markdown(md, elem_id="leaderboard_markdown")

    # NOTE(review): if elo_results_file is None while leaderboard_table_file
    # is set, anony_arena_df below is unbound and raises NameError — confirm
    # callers never pass that combination.
    if leaderboard_table_file:
        model_table_df = load_leaderboard_table_csv(leaderboard_table_file)
        with gr.Tabs() as tabs:
            arena_table_vals = get_arena_table(anony_arena_df, model_table_df)
            with gr.Tab("Arena Elo", id=0):
                md = make_arena_leaderboard_md(anony_elo_results)
                gr.Markdown(md, elem_id="leaderboard_markdown")
                gr.Dataframe(
                    headers=[
                        "Rank",
                        "π€ Model",
                        "β Arena Elo",
                        "π 95% CI",
                        "π³οΈ Votes",
                        "Organization",
                        "License",
                    ],
                    datatype=[
                        "str",
                        "markdown",
                        "number",
                        "str",
                        "number",
                        "str",
                        "str",
                    ],
                    value=arena_table_vals,
                    elem_id="arena_leaderboard_dataframe",
                    height=700,
                    column_widths=[30, 70, 30, 30, 30, 70, 100],
                    wrap=True,
                )
            with gr.Tab("Full Leaderboard", id=1):
                md = make_full_leaderboard_md(full_elo_results)
                gr.Markdown(md, elem_id="leaderboard_markdown")
                full_table_vals = get_full_table(anony_arena_df, full_arena_df, model_table_df)
                gr.Dataframe(
                    headers=[
                        "Rank",
                        "π€ Model",
                        "β Arena Elo (anony)",
                        "π 95% CI",
                        "β Arena Elo (full)",
                        "π 95% CI",
                        "π³οΈ Votes",
                        "Organization",
                        "License",
                    ],
                    datatype=["str", "markdown", "number", "str", "number", "str", "number", "str", "str"],
                    value=full_table_vals,
                    elem_id="full_leaderboard_dataframe",
                    column_widths=[30, 70, 30, 30, 30, 30, 30, 70, 100],
                    height=700,
                    wrap=True,
                )

        gr.Markdown(
            # Fixed typo: "fruquently" -> "frequently".
            """ ## We are still collecting more votes on more models. The ranking will be updated very frequently. Please stay tuned!
""",
            elem_id="leaderboard_markdown",
        )

    if show_plot:
        win_fraction_heatmap = anony_elo_results["win_fraction_heatmap"]
        battle_count_heatmap = anony_elo_results["battle_count_heatmap"]
        bootstrap_elo_rating = anony_elo_results["bootstrap_elo_rating"]
        average_win_rate_bar = anony_elo_results["average_win_rate_bar"]
        # 2x2 grid of the four anony-arena figures.
        with gr.Row():
            with gr.Column():
                gr.Markdown(
                    "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles"
                )
                plot_1 = gr.Plot(win_fraction_heatmap, show_label=False)
            with gr.Column():
                gr.Markdown(
                    "#### Figure 2: Battle Count for Each Combination of Models (without Ties)"
                )
                plot_2 = gr.Plot(battle_count_heatmap, show_label=False)
        with gr.Row():
            with gr.Column():
                gr.Markdown(
                    "#### Figure 3: Bootstrap of Elo Estimates (1000 Rounds of Random Sampling)"
                )
                plot_3 = gr.Plot(bootstrap_elo_rating, show_label=False)
            with gr.Column():
                gr.Markdown(
                    "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)"
                )
                plot_4 = gr.Plot(average_win_rate_bar, show_label=False)

    # Seed the shared placeholder list for live refreshing.
    leader_component_values[:] = [md, p1, p2, p3, p4]

    from .utils import acknowledgment_md

    gr.Markdown(acknowledgment_md)

    return [md_1]