import argparse
import ast
import glob
import pickle
import traceback
import numpy as np
from datetime import datetime
import pandas as pd
import gradio as gr
basic_component_values = [None] * 6
leader_component_values = [None] * 5
promo_banner = """
USE THE LATEST VERSIONS OF THE BEST CHATBOTS IN RUSSIAN FOR FREE
"""
deprecated_model_name = [
"GigaChat 3.1.25.3",
"GigaChat-Pro 2.2.25.3",
"saiga_llama3_8b_v6",
"saiga_phi3_medium",
"GigaChat-Plus 3.1.25.3",
"GigaChat-Pro 4.0.26.8",
"GigaChat 4.0.26.8",
"xAI: Grok 2",
"GigaChat-Pro 4.0.26.15",
"GigaChat 4.0.26.15",
"YandexGPT Experimental", "yandex-gpt-arena"
]
def make_default_md_1():
leaderboard_md = f"""
# 🏆 LLM Arena in Russian: Leaderboard
{promo_banner}
"""
return leaderboard_md
def make_default_md_2():
leaderboard_md = f"""
LLM Arena is an open crowdsourcing platform for evaluating large language models (LLMs) in Russian. We collect pairwise comparisons from people to rank LLMs using the Bradley-Terry model and display model ratings on the Elo scale.
Chatbot Arena in Russian depends on community participation, so please contribute by casting your vote!
- To **add your model** to the comparison, contact us on TG: [Group](https://t.me/+bFEOl-Bdmok4NGUy)
- If you **found a bug** or **have a suggestion**, contact us: [Roman](https://t.me/roman_kucev)
- You can **contribute your vote** at [llmarena.ru](https://llmarena.ru/)!
"""
return leaderboard_md
def make_arena_leaderboard_md(arena_df, last_updated_time):
total_votes = sum(arena_df["num_battles"])
total_models = len(arena_df)
space = " "
leaderboard_md = f"""
Total # of models: **{total_models}**.{space} Total # of votes: **{"{:,}".format(total_votes)}**.{space} Last updated: {last_updated_time}.
***Rank (UB)**: the model's rank (upper bound), computed as one plus the number of models that are statistically better than the target model.
Model A is statistically better than Model B when the lower bound of Model A's rating is higher than the upper bound of Model B's rating (with a 95% confidence interval).
See Figure 1 below for a visualization of the confidence intervals of model ratings.
"""
return leaderboard_md
def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="site_visitors/medium_prompts:style control"):
total_votes = sum(arena_df["num_battles"])
total_models = len(arena_df)
space = " "
total_subset_votes = sum(arena_subset_df["num_battles"])
total_subset_models = len(arena_subset_df)
leaderboard_md = f"""### {cat_name_to_explanation[name]}
#### {space} #models: **{total_subset_models} ({round(total_subset_models/total_models *100)}%)** {space} #votes: **{"{:,}".format(total_subset_votes)} ({round(total_subset_votes/total_votes * 100)}%)**{space}
"""
return leaderboard_md
def model_hyperlink(model_name, link):
    # Render the model name as a clickable HTML link for markdown table cells.
    return f'<a target="_blank" href="{link}">{model_name}</a>'
def filter_deprecated_models_plots(fig, hidden_models=None):
"""
Removes deprecated models from a Plotly figure.
Args:
fig: The Plotly figure object.
hidden_models: A list of model names to remove.
"""
if fig is None:
return
if hidden_models is None:
return fig
if fig.data[0].type == 'heatmap':
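        # Heatmaps: drop hidden models from both axes and slice the z matrix to match.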
data = fig.data[0]
mask_x = ~np.isin(data.x, hidden_models)
mask_y = ~np.isin(data.y, hidden_models)
data.update({
'x': np.array(data.x)[mask_x],
'y': np.array(data.y)[mask_y],
'z': np.array(data.z)[np.ix_(mask_y, mask_x)]
})
elif fig.data[0].type == 'scatter':
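        # Scatter traces: mask out points (and matching error bars) belonging to hidden models.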
trace = fig.data[0]
mask = ~np.isin(trace.x, hidden_models)
trace.x, trace.y, trace.text = np.array(trace.x)[mask], np.array(trace.y)[mask], np.array(trace.text)[mask]
for key in ['array', 'arrayminus']:
if key in trace.error_y:
trace.error_y[key] = trace.error_y[key][mask]
elif fig.data[0].type == 'bar':
mask = ~np.isin(fig.data[0].x, hidden_models)
fig.data[0].x = fig.data[0].x[mask]
fig.data[0].y = fig.data[0].y[mask]
return fig
def load_leaderboard_table_csv(filename, add_hyperlink=True):
    with open(filename) as fin:
        lines = fin.readlines()
    heads = [v.strip() for v in lines[0].split(",")]
    rows = []
    # Parse one CSV row at a time, converting numeric columns and skipping "-" placeholders.
    for i in range(1, len(lines)):
        row = [v.strip() for v in lines[i].split(",")]
        item = {}
        for h, v in zip(heads, row):
            if h == "Arena Elo rating":
                if v != "-":
                    v = int(ast.literal_eval(v))
                else:
                    v = np.nan
            elif h == "MMLU":
                if v != "-":
                    v = round(ast.literal_eval(v) * 100, 1)
                else:
                    v = np.nan
            elif h == "MT-bench (win rate %)":
                if v != "-":
                    v = round(ast.literal_eval(v[:-1]), 1)
                else:
                    v = np.nan
            elif h == "MT-bench (score)":
                if v != "-":
                    v = round(ast.literal_eval(v), 2)
                else:
                    v = np.nan
            item[h] = v
        if add_hyperlink:
            item["Model"] = model_hyperlink(item["Model"], item["Link"])
        rows.append(item)
    return rows
def create_ranking_str(ranking, ranking_difference):
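    # Append an up/down arrow when the category rank differs from the global rank.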
if ranking_difference > 0:
return f"{int(ranking)} \u2191"
elif ranking_difference < 0:
return f"{int(ranking)} \u2193"
else:
return f"{int(ranking)}"
def recompute_final_ranking(arena_df):
# compute ranking based on CI
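    # Rank (UB) = 1 + number of models whose rating lower bound (q025) exceeds this model's upper bound (q975).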
ranking = {}
for i, model_a in enumerate(arena_df.index):
ranking[model_a] = 1
for j, model_b in enumerate(arena_df.index):
if i == j:
continue
if (
arena_df.loc[model_b]["rating_q025"]
> arena_df.loc[model_a]["rating_q975"]
):
ranking[model_a] += 1
return list(ranking.values())
def get_arena_table(arena_df, model_table_df, arena_subset_df=None, hidden_models=None):
# Apply hidden_models filter first
if hidden_models:
arena_df = arena_df[~arena_df.index.isin(hidden_models)].copy()
arena_df = arena_df.sort_values(
by=["final_ranking", "rating"], ascending=[True, False]
)
arena_df["final_ranking"] = recompute_final_ranking(arena_df)
arena_df = arena_df.sort_values(
by=["final_ranking", "rating"], ascending=[True, False]
)
# sort by rating
if arena_subset_df is not None:
# filter out models not in the arena_df
arena_subset_df = arena_subset_df[arena_subset_df.index.isin(arena_df.index)]
arena_subset_df = arena_subset_df.sort_values(by=["rating"], ascending=False)
arena_subset_df["final_ranking"] = recompute_final_ranking(arena_subset_df)
# keep only the models in the subset in arena_df and recompute final_ranking
arena_df = arena_df[arena_df.index.isin(arena_subset_df.index)]
# recompute final ranking
arena_df["final_ranking"] = recompute_final_ranking(arena_df)
# assign ranking by the order
arena_subset_df["final_ranking_no_tie"] = range(1, len(arena_subset_df) + 1)
arena_df["final_ranking_no_tie"] = range(1, len(arena_df) + 1)
# join arena_df and arena_subset_df on index
arena_df = arena_subset_df.join(
arena_df["final_ranking"], rsuffix="_global", how="inner"
)
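        # ranking_difference > 0 means the model ranks higher (smaller rank number) in this category than globally.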
arena_df["ranking_difference"] = (
arena_df["final_ranking_global"] - arena_df["final_ranking"]
)
arena_df = arena_df.sort_values(
by=["final_ranking", "rating"], ascending=[True, False]
)
arena_df["final_ranking"] = arena_df.apply(
lambda x: create_ranking_str(x["final_ranking"], x["ranking_difference"]),
axis=1,
)
arena_df["final_ranking"] = arena_df["final_ranking"].astype(str)
values = []
for i in range(len(arena_df)):
row = []
model_key = arena_df.index[i]
try:
model_name = model_table_df[model_table_df["key"] == model_key][
"Model"
].values[0]
ranking = arena_df.iloc[i].get("final_ranking") or i + 1
row.append(ranking)
if arena_subset_df is not None:
row.append(arena_df.iloc[i].get("ranking_difference") or 0)
row.append(model_name)
row.append(round(arena_df.iloc[i]["rating"]))
upper_diff = round(
arena_df.iloc[i]["rating_q975"] - arena_df.iloc[i]["rating"]
)
lower_diff = round(
arena_df.iloc[i]["rating"] - arena_df.iloc[i]["rating_q025"]
)
row.append(f"+{upper_diff}/-{lower_diff}")
row.append(round(arena_df.iloc[i]["num_battles"]))
row.append(
model_table_df[model_table_df["key"] == model_key][
"Organization"
].values[0]
)
row.append(
model_table_df[model_table_df["key"] == model_key]["License"].values[0]
)
cutoff_date = model_table_df[model_table_df["key"] == model_key][
"Knowledge cutoff date"
].values[0]
if cutoff_date == "-":
row.append("Unknown")
else:
row.append(cutoff_date)
values.append(row)
except Exception as e:
traceback.print_exc()
print(f"{model_key} - {e}")
return values
key_to_category_name = {
"full": "Overall",
"crowdsourcing/simple_prompts": "crowdsourcing/simple_prompts",
"site_visitors/medium_prompts": "site_visitors/medium_prompts",
"site_visitors/medium_prompts:style control": "site_visitors/medium_prompts:style control"
}
cat_name_to_explanation = {
"Overall": "All queries",
"crowdsourcing/simple_prompts": "Queries collected through crowdsourcing. Mostly simple ones.",
"site_visitors/medium_prompts": "Queries from website visitors. Contain more complex prompts.",
"site_visitors/medium_prompts:style control": "Queries from website visitors. Contain more complex prompts. [Reduced stylistic influence](https://lmsys.org/blog/2024-08-28-style-control/) of the response on the rating."
}
cat_name_to_baseline = {
"Hard Prompts (English)": "English",
}
actual_categories = [
# "Overall",
# "crowdsourcing/simple_prompts",
"site_visitors/medium_prompts",
"site_visitors/medium_prompts:style control"
]
req_cat = "site_visitors/medium_prompts:style control"
# selected_category = req_cat if req_cat in actual_categories else "Overall"
selected_category = req_cat if req_cat in actual_categories else "site_visitors/medium_prompts:style control"
def read_elo_file(elo_results_file, leaderboard_table_file):
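    # The elo results pickle maps category keys to dicts holding the leaderboard DataFrame, the plot figures, and the last-updated timestamp.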
arena_dfs = {}
category_elo_results = {}
with open(elo_results_file, "rb") as fin:
elo_results = pickle.load(fin)
last_updated_time = None
if selected_category in elo_results:
last_updated_time = elo_results[selected_category]["last_updated_datetime"].split(
" "
)[0]
for k in key_to_category_name.keys():
if k not in elo_results:
continue
arena_dfs[key_to_category_name[k]] = elo_results[k][
"leaderboard_table_df"
]
category_elo_results[key_to_category_name[k]] = elo_results[k]
data = load_leaderboard_table_csv(leaderboard_table_file)
model_table_df = pd.DataFrame(data)
return last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df
def build_leaderboard_tab(
elo_results_file, leaderboard_table_file, show_plot=False, mirror=False
):
arena_dfs = {}
arena_df = pd.DataFrame()
category_elo_results = {}
last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df = read_elo_file(elo_results_file, leaderboard_table_file)
arena_df = arena_dfs[selected_category]
p1 = category_elo_results[selected_category]["win_fraction_heatmap"]
p2 = category_elo_results[selected_category]["battle_count_heatmap"]
p3 = category_elo_results[selected_category]["bootstrap_elo_rating"]
p4 = category_elo_results[selected_category]["average_win_rate_bar"]
# arena_df = arena_dfs["Overall"]
default_md = make_default_md_1()
default_md_2 = make_default_md_2()
with gr.Row():
with gr.Column(scale=4):
md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
with gr.Column(scale=1):
vote_button = gr.Button("Vote!", link="https://llmarena.ru")
md_2 = gr.Markdown(default_md_2, elem_id="leaderboard_markdown")
if leaderboard_table_file:
data = load_leaderboard_table_csv(leaderboard_table_file)
model_table_df = pd.DataFrame(data)
with gr.Tabs() as tabs:
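            # Deprecated models are hidden in the default table view.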
arena_table_vals = get_arena_table(arena_df, model_table_df, hidden_models=deprecated_model_name)
with gr.Tab("Arena", id=0):
md = make_arena_leaderboard_md(arena_dfs[selected_category], last_updated_time)
lb_description = gr.Markdown(md, elem_id="leaderboard_markdown")
with gr.Row():
with gr.Column(scale=2):
category_dropdown = gr.Dropdown(
choices=actual_categories,
value=selected_category,
label="Category",
)
with gr.Column(scale=2):
category_checkbox = gr.CheckboxGroup(
["Deprecated"],
label="Filter",
value=[],
info="",
)
default_category_details = make_category_arena_leaderboard_md(
arena_df, arena_df, name=selected_category
)
with gr.Column(scale=4, variant="panel"):
category_deets = gr.Markdown(
default_category_details, elem_id="category_deets"
)
arena_vals = pd.DataFrame(
arena_table_vals,
columns=[
"Rank* (UB)",
"Model",
"Arena Elo",
"95% CI",
"Votes",
"Organization",
"License",
"Knowledge Cutoff",
],
)
elo_display_df = gr.Dataframe(
headers=[
"Rank* (UB)",
"Model",
"Arena Elo",
"95% CI",
"Votes",
"Organization",
"License",
"Knowledge Cutoff",
],
datatype=[
"str",
"markdown",
"number",
"str",
"number",
"str",
"str",
"str",
],
value=arena_vals.style,
elem_id="arena_leaderboard_dataframe",
height=700,
column_widths=[70, 190, 100, 100, 90, 130, 150, 100],
wrap=True,
)
gr.Markdown(
elem_id="leaderboard_markdown",
)
leader_component_values[:] = [default_md, p1, p2, p3, p4]
if show_plot:
more_stats_md = gr.Markdown(
f"""## More statistics on Chatbot Arena""",
elem_id="leaderboard_header_markdown",
)
with gr.Row():
with gr.Column():
gr.Markdown(
"#### Figure 1: Confidence Intervals on Model Strength (via Bootstrapping)",
elem_id="plot-title",
)
plot_3 = gr.Plot(p3, show_label=False)
with gr.Column():
gr.Markdown(
"#### Figure 2: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)",
elem_id="plot-title",
)
plot_4 = gr.Plot(p4, show_label=False)
with gr.Row():
with gr.Column():
gr.Markdown(
"#### Figure 3: Fraction of Model A Wins for All Non-tied A vs. B Battles",
elem_id="plot-title",
)
plot_1 = gr.Plot(
p1, show_label=False, elem_id="plot-container"
)
with gr.Column():
gr.Markdown(
"#### Figure 4: Battle Count for Each Combination of Models (without Ties)",
elem_id="plot-title",
)
plot_2 = gr.Plot(p2, show_label=False)
if not show_plot:
gr.Markdown(
"""
""",
elem_id="leaderboard_markdown",
)
else:
pass
def update_leaderboard_df(arena_table_vals):
        elo_dataframe = pd.DataFrame(
arena_table_vals,
columns=[
"Rank* (UB)",
"Delta",
"Model",
"Arena Elo",
"95% CI",
"Votes",
"Organization",
"License",
"Knowledge Cutoff",
],
)
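        # Highlight rank movement: green for models that moved up, red for models that moved down.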
def highlight_max(s):
return [
"color: green; font-weight: bold"
if "\u2191" in v
else "color: red; font-weight: bold"
if "\u2193" in v
else ""
for v in s
]
def highlight_rank_max(s):
return [
"color: green; font-weight: bold"
if v > 0
else "color: red; font-weight: bold"
if v < 0
else ""
for v in s
]
        return elo_dataframe.style.apply(highlight_max, subset=["Rank* (UB)"]).apply(
            highlight_rank_max, subset=["Delta"]
        )
def update_leaderboard_and_plots(category, filters):
_, arena_dfs, category_elo_results, _, model_table_df = read_elo_file(elo_results_file, leaderboard_table_file)
arena_subset_df = arena_dfs[category]
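        # Keep only models with more than 200 battles in this category.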
arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 200]
elo_subset_results = category_elo_results[category]
baseline_category = cat_name_to_baseline.get(category, selected_category)
arena_df = arena_dfs[baseline_category]
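        # Show deprecated models only when the "Deprecated" filter is checked.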
        hidden = None if "Deprecated" in filters else deprecated_model_name
        arena_values = get_arena_table(
            arena_df,
            model_table_df,
            arena_subset_df=arena_subset_df if category != "Overall" else None,
            hidden_models=hidden,
        )
        # Filter plots based on deprecated models
        p1 = filter_deprecated_models_plots(
            elo_subset_results["win_fraction_heatmap"], hidden_models=hidden
        )
        p2 = filter_deprecated_models_plots(
            elo_subset_results["battle_count_heatmap"], hidden_models=hidden
        )
        p3 = filter_deprecated_models_plots(
            elo_subset_results["bootstrap_elo_rating"], hidden_models=hidden
        )
        p4 = filter_deprecated_models_plots(
            elo_subset_results["average_win_rate_bar"], hidden_models=hidden
        )
if category != "Overall":
arena_values = update_leaderboard_df(arena_values)
arena_values = gr.Dataframe(
headers=[
"Rank* (UB)",
"Delta",
"Model",
"Arena Elo",
"95% CI",
"Votes",
"Organization",
"License",
"Knowledge Cutoff",
],
datatype=[
"str",
"number",
"markdown",
"number",
"str",
"number",
"str",
"str",
"str",
],
value=arena_values,
elem_id="arena_leaderboard_dataframe",
height=700,
column_widths=[70, 70, 200, 90, 100, 90, 120, 150, 100],
wrap=True,
)
else:
arena_values = gr.Dataframe(
headers=[
"Rank* (UB)",
"Model",
"Arena Elo",
"95% CI",
"Votes",
"Organization",
"License",
"Knowledge Cutoff",
],
datatype=[
"str",
"markdown",
"number",
"str",
"number",
"str",
"str",
"str",
],
value=arena_values,
elem_id="arena_leaderboard_dataframe",
height=700,
column_widths=[70, 190, 100, 100, 90, 140, 150, 100],
wrap=True,
)
p1 = elo_subset_results["win_fraction_heatmap"]
p2 = elo_subset_results["battle_count_heatmap"]
p3 = elo_subset_results["bootstrap_elo_rating"]
p4 = elo_subset_results["average_win_rate_bar"]
more_stats_md = f"""## More Statistics for Chatbot Arena - {category}
"""
leaderboard_md = make_category_arena_leaderboard_md(
arena_df, arena_subset_df, name=category
)
return arena_values, p1, p2, p3, p4, more_stats_md, leaderboard_md
if leaderboard_table_file:
category_dropdown.change(
fn=update_leaderboard_and_plots,
inputs=[category_dropdown, category_checkbox],
outputs=[
elo_display_df,
plot_1,
plot_2,
plot_3,
plot_4,
more_stats_md,
category_deets,
],
)
category_checkbox.change(
update_leaderboard_and_plots,
inputs=[category_dropdown, category_checkbox],
outputs=[
elo_display_df,
plot_1,
plot_2,
plot_3,
plot_4,
more_stats_md,
category_deets,
],
)
if show_plot and leaderboard_table_file:
return [md_1, md_2, lb_description, category_deets, elo_display_df, plot_1, plot_2, plot_3, plot_4]
return [md_1]
def build_demo(elo_results_file, leaderboard_table_file):
text_size = gr.themes.sizes.text_lg
theme = gr.themes.Default.load("theme.json")
theme.text_size = text_size
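    # Enlarge button text and adjust shadows/colors on top of the loaded theme.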
theme.set(
button_large_text_size="40px",
button_small_text_size="40px",
button_large_text_weight="1000",
button_small_text_weight="1000",
button_shadow="*shadow_drop_lg",
button_shadow_hover="*shadow_drop_lg",
checkbox_label_shadow="*shadow_drop_lg",
button_shadow_active="*shadow_inset",
button_secondary_background_fill="*primary_300",
button_secondary_background_fill_dark="*primary_700",
button_secondary_background_fill_hover="*primary_200",
button_secondary_background_fill_hover_dark="*primary_500",
button_secondary_text_color="*primary_800",
button_secondary_text_color_dark="white",
)
with gr.Blocks(
title="LLM arena: leaderboard",
theme=theme,
css=block_css,
) as demo:
build_leaderboard_tab(
elo_results_file, leaderboard_table_file, show_plot=True, mirror=True
)
return demo
block_css = """
#notice_markdown .prose {
font-size: 110% !important;
}
#notice_markdown th {
display: none;
}
#notice_markdown td {
padding-top: 6px;
padding-bottom: 6px;
}
#arena_leaderboard_dataframe table {
font-size: 110%;
}
#full_leaderboard_dataframe table {
font-size: 110%;
}
#model_description_markdown {
font-size: 110% !important;
}
#leaderboard_markdown .prose {
font-size: 110% !important;
}
#leaderboard_markdown td {
padding-top: 6px;
padding-bottom: 6px;
}
#leaderboard_dataframe td {
line-height: 0.1em;
}
#about_markdown .prose {
font-size: 110% !important;
}
#ack_markdown .prose {
font-size: 110% !important;
}
#chatbot .prose {
font-size: 105% !important;
}
.sponsor-image-about img {
margin: 0 20px;
margin-top: 20px;
height: 40px;
max-height: 100%;
width: auto;
float: left;
}
.chatbot h1, h2, h3 {
margin-top: 8px; /* Adjust the value as needed */
margin-bottom: 0px; /* Adjust the value as needed */
padding-bottom: 0px;
}
.chatbot h1 {
font-size: 130%;
}
.chatbot h2 {
font-size: 120%;
}
.chatbot h3 {
font-size: 110%;
}
.chatbot p:not(:first-child) {
margin-top: 8px;
}
.typing {
display: inline-block;
}
.cursor {
display: inline-block;
width: 7px;
height: 1em;
background-color: black;
vertical-align: middle;
animation: blink 1s infinite;
}
.dark .cursor {
display: inline-block;
width: 7px;
height: 1em;
background-color: white;
vertical-align: middle;
animation: blink 1s infinite;
}
@keyframes blink {
0%, 50% { opacity: 1; }
50.1%, 100% { opacity: 0; }
}
.app {
max-width: 100% !important;
padding: 20px !important;
}
a {
color: #1976D2; /* Your current link color, a shade of blue */
text-decoration: none; /* Removes underline from links */
}
a:hover {
color: #63A4FF; /* This can be any color you choose for hover */
text-decoration: underline; /* Adds underline on hover */
}
"""
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--share", action="store_true")
parser.add_argument("--host", default="0.0.0.0")
parser.add_argument("--port", type=int, default=7860)
args = parser.parse_args()
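    # Use the most recent elo results pickle and leaderboard CSV, sorted by the timestamp embedded in the filename.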
elo_result_files = glob.glob("elo_results_*.pkl")
elo_result_files.sort(key=lambda x: int(x[12:-4]))
elo_result_file = elo_result_files[-1]
leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
leaderboard_table_files.sort(key=lambda x: int(x[18:-4]))
leaderboard_table_file = leaderboard_table_files[-1]
demo = build_demo(elo_result_file, leaderboard_table_file)
demo.launch(show_api=False)