import os
import pickle
import pandas as pd
import numpy as np
import gradio as gr
from datetime import datetime
from huggingface_hub import HfApi
from apscheduler.schedulers.background import BackgroundScheduler
import plotly.graph_objects as go
from utils import (
KEY_TO_CATEGORY_NAME,
CAT_NAME_TO_EXPLANATION,
download_latest_data_from_space,
get_constants,
update_release_date_mapping,
format_data,
get_trendlines,
find_crossover_point,
sigmoid_transition,
apply_template,
)
###################
### Initialize scheduler
###################
# def restart_space():
# HfApi(token=os.getenv("HF_TOKEN", None)).restart_space(
# repo_id="m-ric/llm-race-to-the-top"
# )
# print(f"Space restarted on {datetime.now()}")
# # restart the space every day at 9am
# scheduler = BackgroundScheduler()
# scheduler.add_job(restart_space, "cron", day_of_week="mon-sun", hour=7, minute=0)
# scheduler.start()
###################
### Load Data
###################
# gather ELO data
latest_elo_file_local = download_latest_data_from_space(
    repo_id="lmsys/chatbot-arena-leaderboard", file_type="pkl"
)

# NOTE(security): pickle.load can execute arbitrary code while deserializing.
# The file is fetched from a remote Space, so this is only safe as long as
# that Space is trusted.
with open(latest_elo_file_local, "rb") as fin:
    elo_results = pickle.load(fin)

# TO-DO: need to also include vision
elo_results = elo_results["text"]

# Keep one leaderboard table per known category, skipping categories that
# are absent from the downloaded results.
arena_dfs = {
    KEY_TO_CATEGORY_NAME[key]: elo_results[key]["leaderboard_table_df"]
    for key in KEY_TO_CATEGORY_NAME
    if key in elo_results
}

# gather open llm leaderboard data
latest_leaderboard_file_local = download_latest_data_from_space(
    repo_id="lmsys/chatbot-arena-leaderboard", file_type="csv"
)
leaderboard_df = pd.read_csv(latest_leaderboard_file_local)

# load release date mapping data
release_date_mapping = pd.read_json("release_date_mapping.json", orient="records")

###################
### Prepare Data
###################

# update release date mapping with new models
# check for new models in ELO data (the known keys are collected once into a
# set instead of rebuilding a list for every candidate model)
known_model_keys = set(release_date_mapping["key"])
new_model_keys_to_add = [
    model
    for model in arena_dfs["Overall"].index.to_list()
    if model not in known_model_keys
]
if new_model_keys_to_add:
    release_date_mapping = update_release_date_mapping(
        new_model_keys_to_add, leaderboard_df, release_date_mapping
    )

# merge leaderboard data with ELO data, then add release dates
merged_dfs = {}
for category, arena_df in arena_dfs.items():
    ranked = (
        pd.merge(arena_df, leaderboard_df, left_index=True, right_on="key")
        .sort_values("rating", ascending=False)
        .reset_index(drop=True)
    )
    merged_dfs[category] = pd.merge(
        ranked, release_date_mapping[["key", "Release Date"]], on="key"
    )

# format dataframes
merged_dfs = {category: format_data(frame) for category, frame in merged_dfs.items()}

# get constants
min_elo_score, max_elo_score, _ = get_constants(merged_dfs)
date_updated = elo_results["full"]["last_updated_datetime"].split(" ")[0]

# Work on an explicit copy of the filtered rows so the column assignment
# below does not write into a slice of merged_dfs["Overall"].
ratings_df = merged_dfs["Overall"]
ratings_df = ratings_df.loc[ratings_df["Release Date"].notna()].copy()
# Fold the two spellings of the same organization into one.
ratings_df["Organization"] = ratings_df["Organization"].replace(
    "DeepSeek AI", "DeepSeek"
)
###################
### Build and Plot Data
###################
def get_data_split(dfs, set_name):
    """Return an independent, freshly indexed copy of one named frame.

    Args:
        dfs (dict): Mapping from set name to pandas.DataFrame.
        set_name: Key of the frame to extract from ``dfs``.

    Returns:
        pandas.DataFrame: Deep copy of ``dfs[set_name]`` with a 0..n-1 index.
    """
    snapshot = dfs[set_name].copy(deep=True)
    # Discard the previous index rather than keeping it as a column.
    snapshot.reset_index(drop=True, inplace=True)
    return snapshot
def clean_df_for_display(df):
    """Build the table shown to users from a merged ratings frame.

    Selects the display columns, renames ``rating`` to ``ELO Score`` and
    ``MT-bench (score)`` to ``MT-Bench``, converts ``Release Date`` to
    strings, and orders the rows by descending ELO score.

    Args:
        df (pandas.DataFrame): Frame containing at least the display columns.

    Returns:
        pandas.DataFrame: New frame with a 0..n-1 index; ``df`` is not modified.
    """
    display_columns = [
        "Model",
        "rating",
        "MMLU",
        "MT-bench (score)",
        "Release Date",
        "Organization",
        "License",
        "Link",
    ]
    renamed_headers = {"rating": "ELO Score", "MT-bench (score)": "MT-Bench"}

    table = df[display_columns].rename(columns=renamed_headers)
    table["Release Date"] = table["Release Date"].astype(str)
    table = table.sort_values("ELO Score", ascending=False)
    return table.reset_index(drop=True)
def format_data(df):
    """
    Formats the given DataFrame by performing the following operations:
    - Converts the 'License' column values to 'Proprietary LLM' if they are in PROPRIETARY_LICENSES, otherwise 'Open LLM'.
    - Converts the 'Release Date' column to datetime format.
    - Adds a new 'Month-Year' column by extracting the month and year from the 'Release Date' column.
    - Rounds the 'rating' column to the nearest integer.
    - Resets the index of the DataFrame.

    The input frame is left untouched: all changes are applied to a copy.

    Args:
        df (pandas.DataFrame): The DataFrame to be formatted.

    Returns:
        pandas.DataFrame: The formatted DataFrame.
    """
    # Both spellings are listed on purpose: the misspelled one is matched too.
    PROPRIETARY_LICENSES = ("Proprietary", "Proprietory")

    # Copy first so the caller's frame does not silently gain or lose columns.
    formatted = df.copy()
    formatted["License"] = formatted["License"].apply(
        lambda x: "Proprietary LLM" if x in PROPRIETARY_LICENSES else "Open LLM"
    )
    formatted["Release Date"] = pd.to_datetime(formatted["Release Date"])
    formatted["Month-Year"] = formatted["Release Date"].dt.to_period("M")
    formatted["rating"] = formatted["rating"].round()
    return formatted.reset_index(drop=True)
# Country flags are pairs of Unicode regional-indicator symbols. They are
# written as escapes so they cannot be corrupted again if the file is ever
# re-saved with the wrong encoding.
_FLAG_US = "\U0001F1FA\U0001F1F8"  # United States
_FLAG_CN = "\U0001F1E8\U0001F1F3"  # China
_FLAG_FR = "\U0001F1EB\U0001F1F7"  # France
_FLAG_IL = "\U0001F1EE\U0001F1F1"  # Israel

# Organization name -> (plot color, country flag).
# Organizations missing from this mapping are looked up with a default at the
# point of use.
org_info = {
    "OpenAI": ("#00A67E", _FLAG_US),  # Teal
    "Google": ("#4285F4", _FLAG_US),  # Google Blue
    "xAI": ("black", _FLAG_US),  # Black
    "Anthropic": ("#cc785c", _FLAG_US),  # Brown (as requested)
    "Meta": ("#0064E0", _FLAG_US),  # Facebook Blue
    "Alibaba": ("#6958cf", _FLAG_CN),
    "DeepSeek": ("#9900CC", _FLAG_CN),  # Purple
    "01 AI": ("#11871e", _FLAG_CN),  # Bright Green
    "DeepSeek AI": ("#9900CC", _FLAG_CN),  # Purple (same as "DeepSeek")
    "Mistral": ("#ff7000", _FLAG_FR),  # Mistral Orange (as requested)
    "AI21 Labs": ("#1E90FF", _FLAG_IL),  # Dodger Blue
    "Reka AI": ("#FFC300", _FLAG_US),
    "Zhipu AI": ("#FFC300", _FLAG_CN),  # NOTE(review): same color as Reka AI - confirm intended
    "Nvidia": ("#76B900", _FLAG_US),
}
def _org_frontier(org_data, current_date):
    """Build the running-best rating curve for a single organization.

    Args:
        org_data (pandas.DataFrame): Rows of one organization; needs the
            "Release Date" (datetime) and "rating" columns.
        current_date (pandas.Timestamp): Date up to which the last level of
            the curve is extended.

    Returns:
        tuple: ``(x_values, y_values, best_models)`` where the first two are
        the curve coordinates and ``best_models`` lists the rows that set a
        new best rating. All three are empty when no row has a usable rating.
    """
    # Keep exactly one row per release date: the highest-rated one. Doing it
    # explicitly avoids depending on the incoming row order.
    daily_best = (
        org_data.dropna(subset=["Release Date", "rating"])
        .sort_values("rating", ascending=False, kind="stable")
        .drop_duplicates("Release Date")
        .sort_values("Release Date")
    )

    x_values = []
    y_values = []
    best_models = []
    current_best = -np.inf
    for _, row in daily_best.iterrows():
        if not row["rating"] > current_best:
            continue
        if x_values:
            # Create smooth transition from the previous best to the new one
            transition_days = (row["Release Date"] - x_values[-1]).days
            transition_points = pd.date_range(
                x_values[-1],
                row["Release Date"],
                periods=max(100, transition_days),
            )
            x_values.extend(transition_points)
            transition_y = current_best + (
                row["rating"] - current_best
            ) * sigmoid_transition(
                np.linspace(-6, 6, len(transition_points)), 0, k=1
            )
            y_values.extend(transition_y)
        x_values.append(row["Release Date"])
        y_values.append(row["rating"])
        current_best = row["rating"]
        best_models.append(row)

    # Extend the line to the current date
    if x_values and x_values[-1] < current_date:
        x_values.append(current_date)
        y_values.append(current_best)
    return x_values, y_values, best_models


def make_figure(original_df, start_time_gradio, speak_french):
    """Plot, for every organization, its best rating over time.

    Each organization gets a line following its best rating so far and a
    marker for every model that raised that best. Organizations are numbered
    in the legend by descending best rating.

    Args:
        original_df (pandas.DataFrame): Ratings with at least the
            "Organization", "rating", "Release Date" and "Model" columns.
            It is copied, not modified.
        start_time_gradio: Left edge of the x-axis, interpreted as a Unix
            timestamp in seconds.
        speak_french (bool): Use French titles when true, English otherwise.

    Returns:
        tuple: ``(fig, df)`` - the plotly figure and the copy of the input
        with "Release Date" converted to datetimes.
    """
    fig = go.Figure()
    start_date = pd.to_datetime(start_time_gradio, unit="s")
    # Taken once, before the loop, so it is defined even when there is
    # nothing to plot and is identical for every organization.
    current_date = pd.Timestamp.now()

    df = original_df.copy(deep=True)
    df["Release Date"] = pd.to_datetime(df["Release Date"])

    orgs_by_best_rating = (
        df.groupby("Organization")["rating"]
        .max()
        .sort_values(ascending=False)
        .index.tolist()
    )
    for rank, org in enumerate(orgs_by_best_rating, start=1):
        org_data = df[df["Organization"] == org]
        x_values, y_values, best_models = _org_frontier(org_data, current_date)
        if not best_models:
            # No usable rating for this organization: nothing to draw.
            continue

        # Get org color and flag
        color, flag = org_info.get(org, ("#808080", ""))

        # Add line plot
        fig.add_trace(
            go.Scatter(
                x=x_values,
                y=y_values,
                mode="lines",
                name=f"{rank}. {org} {flag}",
                line=dict(color=color, width=2),
                hoverinfo="skip",
            )
        )

        # Add scatter plot for best model points
        best_models_df = pd.DataFrame(best_models)
        fig.add_trace(
            go.Scatter(
                x=best_models_df["Release Date"],
                y=best_models_df["rating"],
                mode="markers",
                name=org,
                showlegend=False,
                marker=dict(color=color, size=8, symbol="circle"),
                text=best_models_df["Model"],
                # <br> is how plotly hover text expresses a line break.
                hovertemplate="%{text}<br>Date: %{x}<br>ELO Score: %{y:.2f}",
            )
        )

    # Update layout
    if speak_french:
        fig.update_layout(
            title="La course au classement",
            yaxis_title="Score ELO",
            legend_title="Classement en Novembre 2024",
        )
    else:
        fig.update_layout(
            yaxis_title="ELO score on Chatbot Arena",
            legend_title="Ranking as of November 2024",
            title="The race for the best LLM",
        )

    print("START TIME:", start_date)
    margin = 30
    fig.update_layout(
        xaxis_title="Date",
        hovermode="closest",
        xaxis_range=[start_date, current_date],  # Extend x-axis for labels
        # NOTE(review): the lower bound is min + margin, which hides the
        # lowest-rated models in the window - confirm this is intended and
        # not meant to be min - margin.
        yaxis_range=[
            df.loc[df["Release Date"] >= start_date]["rating"].min() + margin,
            df["rating"].max() + margin,
        ],
    )
    apply_template(fig, annotation_text="Aymeric Roucher", height=600)
    fig.update_xaxes(
        tickformat="%m-%Y",
    )
    return fig, df
def filter_df(top_n_orgs=11, minimum_rating=1000):
    """Restrict the module-level ``ratings_df`` to the top-rated organizations.

    Organizations are ranked by the highest rating among their models.

    Args:
        top_n_orgs: Number of organizations to keep; converted with ``int``.
        minimum_rating: Accepted but not used by this function.

    Returns:
        pandas.DataFrame: Rows of ``ratings_df`` whose organization is among
        the ``top_n_orgs`` best.
    """
    best_rating_per_org = ratings_df.groupby("Organization")["rating"].max()
    leading_orgs = best_rating_per_org.nlargest(int(top_n_orgs)).index.tolist()
    belongs_to_leader = ratings_df["Organization"].isin(leading_orgs)
    return ratings_df.loc[belongs_to_leader]
# Gradio UI: controls on top, plot / raw-data tabs below, then a short
# explanation.
# NOTE(review): the nesting of the layout containers was reconstructed -
# confirm it matches the intended layout.
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue=gr.themes.colors.sky,
        secondary_hue=gr.themes.colors.green,
        # spacing_size=gr.themes.sizes.spacing_sm,
        text_size=gr.themes.sizes.text_sm,
        font=[
            gr.themes.GoogleFont("Open Sans"),
            "ui-serif",
            "system-ui",
            "serif",
        ],
    ),
) as demo:
    # Holds the frame produced by filter_df between the two chained steps.
    filtered_df = gr.State()

    with gr.Row():
        top_n_orgs = gr.Slider(
            minimum=1, maximum=15, value=11, step=1, label="View top N companies"
        )
        # minimum_rating = gr.Slider(minimum=800, maximum=1300, value=1000, step=1, label="Restrict to ELO scores above N")
        start_time = gr.DateTime(value="2024-01-01 00:00:00", label="Start time")
        # \u00e7 is "c with cedilla"; escaped so the label survives re-encoding.
        speak_french = gr.Checkbox(value=False, label="Parler fran\u00e7ais")

    with gr.Group():
        with gr.Tab("Plot"):
            plot = gr.Plot(show_label=False)
        with gr.Tab("Raw Data"):
            display_df = gr.DataFrame()

    gr.Markdown(
        """
This app visualizes the progress of LLMs over time as scored by the [LMSYS Chatbot Arena](https://leaderboard.lmsys.org/).
The app is adapted from [this app](https://huggingface.co/spaces/andrewrreed/closed-vs-open-arena-elo) by Andrew Reed,
and is intended to stay up-to-date as new models are released and evaluated.
> ### Plot info
> The ELO score (y-axis) is a measure of the relative strength of a model based on its performance against other models in the arena.
> The Release Date (x-axis) corresponds to when the model was first publicly released or when its ELO results were first reported (for ease of automated updates).
> Trend lines are based on Ordinary Least Squares (OLS) regression and adjust based on the filter criteria.
"""
    )

    # Every redraw uses the same inputs and outputs.
    figure_inputs = [filtered_df, start_time, speak_french]
    figure_outputs = [plot, display_df]

    # On page load and when the number of companies changes: re-filter the
    # data first, then redraw.
    demo.load(
        fn=filter_df,
        inputs=[top_n_orgs],
        outputs=filtered_df,
    ).then(
        fn=make_figure,
        inputs=figure_inputs,
        outputs=figure_outputs,
    )
    top_n_orgs.change(
        fn=filter_df,
        inputs=[top_n_orgs],
        outputs=filtered_df,
    ).then(
        fn=make_figure,
        inputs=figure_inputs,
        outputs=figure_outputs,
    )

    # These controls only affect the drawing, so the filtered data is reused.
    for control in (start_time, speak_french):
        control.change(
            fn=make_figure,
            inputs=figure_inputs,
            outputs=figure_outputs,
        )

demo.launch()