Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 12,601 Bytes
459e74a 2876fcd 459e74a 31aa24a 459e74a 1f5d15f 459e74a 1f5d15f 459e74a 31aa24a 16268bf 459e74a 16268bf 459e74a 65af3cf 459e74a 65af3cf 459e74a 5897436 d789018 5897436 459e74a 10ea153 459e74a 79f07b1 5897436 31aa24a 5897436 60fda6c 31aa24a 65af3cf 459e74a 31aa24a 5897436 459e74a 31aa24a 459e74a 31aa24a 459e74a 31aa24a 459e74a 31aa24a dc3c946 5897436 65af3cf 459e74a 31aa24a 2876fcd 459e74a 5897436 31aa24a 65af3cf 31aa24a 65af3cf 5897436 31aa24a 65af3cf 31aa24a 65af3cf 2876fcd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 |
import os
import pickle
import pandas as pd
import numpy as np
import gradio as gr
from datetime import datetime
from huggingface_hub import HfApi
from apscheduler.schedulers.background import BackgroundScheduler
import plotly.graph_objects as go
from utils import (
KEY_TO_CATEGORY_NAME,
CAT_NAME_TO_EXPLANATION,
download_latest_data_from_space,
get_constants,
update_release_date_mapping,
format_data,
get_trendlines,
find_crossover_point,
sigmoid_transition,
apply_template,
)
###################
### Initialize scheduler
###################
# def restart_space():
# HfApi(token=os.getenv("HF_TOKEN", None)).restart_space(
# repo_id="m-ric/llm-race-to-the-top"
# )
# print(f"Space restarted on {datetime.now()}")
# # restart the space every day at 9am
# scheduler = BackgroundScheduler()
# scheduler.add_job(restart_space, "cron", day_of_week="mon-sun", hour=7, minute=0)
# scheduler.start()
###################
### Load Data
###################
# Download the most recent Elo results pickle published by the arena Space.
latest_elo_file_local = download_latest_data_from_space(
    repo_id="lmsys/chatbot-arena-leaderboard", file_type="pkl"
)

# NOTE(review): unpickling can execute code embedded in the file, so this is
# only acceptable for as long as the upstream Space is trusted.
with open(latest_elo_file_local, "rb") as fin:
    elo_results = pickle.load(fin)

# TO-DO: need to also include vision
elo_results = elo_results["text"]

# One leaderboard table per known category, keyed by its display name.
# Categories absent from the downloaded results are skipped.
arena_dfs = {
    KEY_TO_CATEGORY_NAME[key]: elo_results[key]["leaderboard_table_df"]
    for key in KEY_TO_CATEGORY_NAME
    if key in elo_results
}

# Download the latest leaderboard CSV from the same Space.
latest_leaderboard_file_local = download_latest_data_from_space(
    repo_id="lmsys/chatbot-arena-leaderboard", file_type="csv"
)
leaderboard_df = pd.read_csv(latest_leaderboard_file_local)

# load release date mapping data
release_date_mapping = pd.read_json("release_date_mapping.json", orient="records")
###################
### Prepare Data
###################
# update release date mapping with new models
# Models present in the ELO data but missing from the release-date mapping.
# The known keys are collected into a set once instead of rebuilding a list
# for every model.
known_model_keys = set(release_date_mapping["key"])
new_model_keys_to_add = [
    model
    for model in arena_dfs["Overall"].index.to_list()
    if model not in known_model_keys
]
if new_model_keys_to_add:
    release_date_mapping = update_release_date_mapping(
        new_model_keys_to_add, leaderboard_df, release_date_mapping
    )

# merge leaderboard data with ELO data (best rating first)
merged_dfs = {
    category: (
        pd.merge(arena_df, leaderboard_df, left_index=True, right_on="key")
        .sort_values("rating", ascending=False)
        .reset_index(drop=True)
    )
    for category, arena_df in arena_dfs.items()
}

# add release dates into the merged data
release_dates = release_date_mapping[["key", "Release Date"]]
merged_dfs = {
    category: pd.merge(merged, release_dates, on="key")
    for category, merged in merged_dfs.items()
}

# format dataframes
# NOTE: when this line runs, `format_data` is still the function imported from
# utils; the same-named function defined further down has not been bound yet.
merged_dfs = {category: format_data(merged) for category, merged in merged_dfs.items()}

# get constants
min_elo_score, max_elo_score, _ = get_constants(merged_dfs)
date_updated = elo_results["full"]["last_updated_datetime"].split(" ")[0]

# Keep only models with a known release date. The explicit copy makes the
# column assignment below operate on an independent frame instead of a slice
# of merged_dfs["Overall"] (avoids pandas' SettingWithCopyWarning).
ratings_df = merged_dfs["Overall"]
ratings_df = ratings_df.loc[~ratings_df["Release Date"].isna()].copy()

# Fold the two spellings of the same organization into one.
ratings_df["Organization"] = ratings_df["Organization"].replace(
    "DeepSeek AI", "DeepSeek"
)
###################
### Build and Plot Data
###################
def get_data_split(dfs, set_name):
    """Return an independent copy of one split with a fresh 0..n-1 index.

    Args:
        dfs (dict): Mapping from split name to pandas.DataFrame.
        set_name: Key of the split to extract.

    Returns:
        pandas.DataFrame: Deep copy of ``dfs[set_name]`` with its index reset.
    """
    split = dfs[set_name].copy(deep=True)
    split.reset_index(drop=True, inplace=True)
    return split
def clean_df_for_display(df):
    """Build the table shown to users from a merged ratings frame.

    Keeps a fixed set of columns, renames "rating" to "ELO Score" and
    "MT-bench (score)" to "MT-Bench", renders the release date as text and
    orders rows from highest to lowest ELO score.

    Args:
        df (pandas.DataFrame): Frame containing the columns selected below.

    Returns:
        pandas.DataFrame: New frame; the input is left untouched.
    """
    display_columns = [
        "Model",
        "rating",
        "MMLU",
        "MT-bench (score)",
        "Release Date",
        "Organization",
        "License",
        "Link",
    ]
    display_names = {"rating": "ELO Score", "MT-bench (score)": "MT-Bench"}

    table = df.loc[:, display_columns].rename(columns=display_names)
    table["Release Date"] = table["Release Date"].astype(str)
    table = table.sort_values("ELO Score", ascending=False)
    return table.reset_index(drop=True)
def format_data(df):
    """Normalize a merged leaderboard frame for plotting.

    The frame passed in is modified in place (columns are overwritten/added);
    the returned frame is that same data with a fresh 0..n-1 index.

    Steps:
        - 'License' becomes 'Proprietary LLM' for the known proprietary
          spellings and 'Open LLM' for everything else.
        - 'Release Date' is parsed to datetime.
        - 'Month-Year' is added as the monthly period of 'Release Date'.
        - 'rating' is rounded to the nearest whole number.

    Args:
        df (pandas.DataFrame): Frame with 'License', 'Release Date' and
            'rating' columns.

    Returns:
        pandas.DataFrame: The formatted frame with its index reset.
    """
    # Both spellings are matched on purpose ("Proprietory" is a known typo).
    proprietary_spellings = ("Proprietary", "Proprietory")

    def _license_bucket(raw_license):
        if raw_license in proprietary_spellings:
            return "Proprietary LLM"
        return "Open LLM"

    df["License"] = df["License"].apply(_license_bucket)

    parsed_dates = pd.to_datetime(df["Release Date"])
    df["Release Date"] = parsed_dates
    df["Month-Year"] = parsed_dates.dt.to_period("M")

    df["rating"] = df["rating"].round()
    return df.reset_index(drop=True)
# Country flags are pairs of Unicode regional-indicator symbols. They are
# written as escapes so the literals cannot be corrupted when this file is
# re-encoded (the previous literals had been mangled into Thai characters).
_FLAG_US = "\U0001F1FA\U0001F1F8"  # United States
_FLAG_CN = "\U0001F1E8\U0001F1F3"  # China
_FLAG_FR = "\U0001F1EB\U0001F1F7"  # France
_FLAG_IL = "\U0001F1EE\U0001F1F1"  # Israel

# Maps an organization name to (plot color, country flag emoji).
org_info = {
    "OpenAI": ("#00A67E", _FLAG_US),  # Teal
    "Google": ("#4285F4", _FLAG_US),  # Google Blue
    "xAI": ("black", _FLAG_US),  # Black
    "Anthropic": ("#cc785c", _FLAG_US),  # Brown (as requested)
    "Meta": ("#0064E0", _FLAG_US),  # Facebook Blue
    "Alibaba": ("#6958cf", _FLAG_CN),
    "DeepSeek": ("#9900CC", _FLAG_CN),  # Purple
    "01 AI": ("#11871e", _FLAG_CN),  # Bright Green
    "DeepSeek AI": ("#9900CC", _FLAG_CN),  # Same entry as "DeepSeek", alternate spelling
    "Mistral": ("#ff7000", _FLAG_FR),  # Mistral Orange (as requested)
    "AI21 Labs": ("#1E90FF", _FLAG_IL),  # Dodger Blue
    "Reka AI": ("#FFC300", _FLAG_US),
    "Zhipu AI": ("#FFC300", _FLAG_CN),
    "Nvidia": ("#76B900", _FLAG_US),
}
def _running_best_curve(org_data, now):
    """Build the "best rating so far" curve of a single organization.

    Args:
        org_data (pandas.DataFrame): Rows of one organization, with a datetime
            "Release Date" column and a "rating" column.
        now (pandas.Timestamp): Date up to which the final plateau is extended.

    Returns:
        tuple: ``(x_values, y_values, best_models)``. The first two are the
        line coordinates; ``best_models`` holds the rows that set a new record
        for the organization. All three are empty when no row has a usable
        rating.
    """
    x_values = []
    y_values = []
    best_models = []
    current_best = -np.inf

    # One row per release date. Sorting by rating first makes ``first()`` pick
    # the strongest model of each day regardless of the caller's row order
    # (mergesort is stable, so ties keep their original order).
    daily_best = (
        org_data.sort_values("rating", ascending=False, kind="mergesort")
        .groupby("Release Date")
        .first()
        .reset_index()
    )

    for _, row in daily_best.iterrows():
        # Only releases that beat the running record move the curve
        # (a NaN rating never compares greater, so it is skipped too).
        if not row["rating"] > current_best:
            continue

        if x_values:
            # Blend from the previous record to the new one using the values
            # returned by sigmoid_transition over a fixed [-6, 6] ramp.
            previous_date = x_values[-1]
            transition_days = (row["Release Date"] - previous_date).days
            transition_points = pd.date_range(
                previous_date,
                row["Release Date"],
                periods=max(100, transition_days),
            )
            x_values.extend(transition_points)
            transition_y = current_best + (
                row["rating"] - current_best
            ) * sigmoid_transition(
                np.linspace(-6, 6, len(transition_points)), 0, k=1
            )
            y_values.extend(transition_y)

        x_values.append(row["Release Date"])
        y_values.append(row["rating"])
        current_best = row["rating"]
        best_models.append(row)

    # Extend the line to the current date
    if x_values and x_values[-1] < now:
        x_values.append(now)
        y_values.append(current_best)

    return x_values, y_values, best_models


def make_figure(original_df, start_time_gradio, speak_french):
    """Plot, for each organization, the best ELO rating it has reached over time.

    Organizations are ranked by their maximum rating; each gets one line
    (running best) plus one marker per record-setting model.

    Args:
        original_df (pandas.DataFrame): Ratings with "Organization", "rating",
            "Release Date" and "Model" columns. It is not modified.
        start_time_gradio: Left edge of the x-axis, interpreted as seconds
            since the epoch (it is parsed with ``unit="s"``).
        speak_french (bool): When truthy, titles are written in French.

    Returns:
        tuple: ``(fig, df)`` where ``fig`` is the plotly figure and ``df`` is
        the copy of ``original_df`` with "Release Date" parsed to datetime.
    """
    fig = go.Figure()
    start_date = pd.to_datetime(start_time_gradio, unit="s")
    # Taken once, before the loop, so it is defined even when there is no
    # organization to plot (it is also the right edge of the x-axis).
    current_date = pd.Timestamp.now()

    df = original_df.copy(deep=True)
    df["Release Date"] = pd.to_datetime(df["Release Date"])

    ranked_orgs = (
        df.groupby("Organization")["rating"]
        .max()
        .sort_values(ascending=False)
        .index.tolist()
    )
    for i, org in enumerate(ranked_orgs):
        org_data = df[df["Organization"] == org]
        x_values, y_values, best_models = _running_best_curve(
            org_data, current_date
        )
        if not best_models:
            # Nothing to draw (e.g. every rating is missing for this org).
            continue

        # Get org color and flag; unknown organizations fall back to grey.
        color, flag = org_info.get(org, ("#808080", ""))

        # Add line plot
        fig.add_trace(
            go.Scatter(
                x=x_values,
                y=y_values,
                mode="lines",
                name=f"{i+1}. {org} {flag}",
                line=dict(color=color, width=2),
                hoverinfo="skip",
            )
        )

        # Add scatter plot for best model points
        best_models_df = pd.DataFrame(best_models)
        fig.add_trace(
            go.Scatter(
                x=best_models_df["Release Date"],
                y=best_models_df["rating"],
                mode="markers",
                name=org,
                showlegend=False,
                marker=dict(color=color, size=8, symbol="circle"),
                text=best_models_df["Model"],
                hovertemplate="<b>%{text}</b><br>Date: %{x}<br>ELO Score: %{y:.2f}<extra></extra>",
            )
        )

    # Update layout
    if speak_french:
        fig.update_layout(
            title="La course au classement",
            yaxis_title="Score ELO",
            legend_title="Classement en Novembre 2024",
        )
    else:
        fig.update_layout(
            yaxis_title="ELO score on Chatbot Arena",
            legend_title="Ranking as of November 2024",
            title="The race for the best LLM",
        )

    # Log the parsed value of the parameter (the previous code printed the
    # module-level `start_time` UI component instead).
    print("START TIME:", start_date)

    margin = 30
    # NOTE(review): the lower bound is min + margin (not min - margin), which
    # crops the lowest-rated models out of view; kept as-is, confirm intent.
    visible_min = df.loc[df["Release Date"] >= start_date]["rating"].min()
    fig.update_layout(
        xaxis_title="Date",
        hovermode="closest",
        xaxis_range=[start_date, current_date],  # Extend x-axis for labels
        yaxis_range=[visible_min + margin, df["rating"].max() + margin],
    )
    apply_template(fig, annotation_text="Aymeric Roucher", height=600)
    fig.update_xaxes(
        tickformat="%m-%Y",
    )
    return fig, df
def filter_df(top_n_orgs=11, minimum_rating=1000):
    """Keep only the rows of the organizations with the highest best rating.

    Reads the module-level ``ratings_df``.

    Args:
        top_n_orgs: Number of organizations to keep (converted with ``int``).
        minimum_rating: Accepted for interface compatibility; not used.

    Returns:
        pandas.DataFrame: Rows of ``ratings_df`` whose "Organization" is among
        the ``top_n_orgs`` organizations ranked by their maximum "rating".
    """
    best_rating_per_org = ratings_df.groupby("Organization")["rating"].max()
    kept_orgs = best_rating_per_org.nlargest(int(top_n_orgs)).index.tolist()
    keep_mask = ratings_df["Organization"].isin(kept_orgs)
    return ratings_df.loc[keep_mask]
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue=gr.themes.colors.sky,
        secondary_hue=gr.themes.colors.green,
        # spacing_size=gr.themes.sizes.spacing_sm,
        text_size=gr.themes.sizes.text_sm,
        font=[
            gr.themes.GoogleFont("Open Sans"),
            "ui-serif",
            "system-ui",
            "serif",
        ],
    ),
) as demo:
    # Holds the organization-filtered ratings between the two callback steps.
    filtered_df = gr.State()

    # Controls
    with gr.Row():
        top_n_orgs = gr.Slider(minimum=1, maximum=15, value=11, step=1, label="View top N companies")
        # minimum_rating = gr.Slider(minimum=800, maximum=1300, value=1000, step=1, label="Restrict to ELO scores above N")
        start_time = gr.DateTime(value="2024-01-01 00:00:00", label="Start time")
        speak_french = gr.Checkbox(value=False, label="Parler français")

    # Outputs
    with gr.Group():
        with gr.Tab("Plot"):
            plot = gr.Plot(show_label=False)
        with gr.Tab("Raw Data"):
            display_df = gr.DataFrame()

    gr.Markdown(
        """
This app visualizes the progress of LLMs over time as scored by the [LMSYS Chatbot Arena](https://leaderboard.lmsys.org/).
The app is adapted from [this app](https://huggingface.co/spaces/andrewrreed/closed-vs-open-arena-elo) by Andrew Reed,
and is intended to stay up-to-date as new models are released and evaluated.
> ### Plot info
> The ELO score (y-axis) is a measure of the relative strength of a model based on its performance against other models in the arena.
> The Release Date (x-axis) corresponds to when the model was first publicly released or when its ELO results were first reported (for ease of automated updates).
> Trend lines are based on Ordinary Least Squares (OLS) regression and adjust based on the filter criteria.
"""
    )

    # Shared wiring for the two callback steps, so every trigger stays in sync.
    filter_step = dict(fn=filter_df, inputs=[top_n_orgs], outputs=filtered_df)
    figure_step = dict(
        fn=make_figure,
        inputs=[filtered_df, start_time, speak_french],
        outputs=[plot, display_df],
    )

    # Page load and a change of the organization count must re-filter the
    # data first, then redraw.
    demo.load(**filter_step).then(**figure_step)
    top_n_orgs.change(**filter_step).then(**figure_step)

    # The other controls only affect rendering, so they redraw from the
    # already-filtered data.
    start_time.change(**figure_step)
    speak_french.change(**figure_step)

demo.launch()
|