"""Gradio leaderboard page for the Russian-language LLM Arena.

The module loads a pickled Elo computation result plus a CSV describing the
models, and renders them as a Gradio UI: a ranking table, a category selector,
a "Deprecated" filter and (optionally) four Plotly figures.
"""

import argparse
import ast
import glob
import pickle
import traceback
from datetime import datetime

import gradio as gr
import numpy as np
import pandas as pd

basic_component_values = [None] * 6
leader_component_values = [None] * 5

promo_banner = """
USE THE LATEST VERSIONS OF THE BEST CHATBOTS IN RUSSIAN FOR FREE
"""

# Models hidden from the table and the plots unless the "Deprecated" filter
# checkbox is ticked.
deprecated_model_name = [
    "GigaChat 3.1.25.3",
    "GigaChat-Pro 2.2.25.3",
    "saiga_llama3_8b_v6",
    "saiga_phi3_medium",
    "GigaChat-Plus 3.1.25.3",
    "GigaChat-Pro 4.0.26.8",
    "GigaChat 4.0.26.8",
    "xAI: Grok 2",
    "GigaChat-Pro 4.0.26.15",
    "GigaChat 4.0.26.15",
    "YandexGPT Experimental",
    "yandex-gpt-arena",
]

# A category table only lists models with more battles than this.
_MIN_CATEGORY_BATTLES = 200

# Column layout of the leaderboard table without / with the "Delta" column.
_ARENA_HEADERS = [
    "Rank* (UB)",
    "Model",
    "Arena Elo",
    "95% CI",
    "Votes",
    "Organization",
    "License",
    "Knowledge Cutoff",
]
_ARENA_DATATYPES = [
    "str",
    "markdown",
    "number",
    "str",
    "number",
    "str",
    "str",
    "str",
]
_ARENA_DELTA_HEADERS = [
    "Rank* (UB)",
    "Delta",
    "Model",
    "Arena Elo",
    "95% CI",
    "Votes",
    "Organization",
    "License",
    "Knowledge Cutoff",
]
_ARENA_DELTA_DATATYPES = [
    "str",
    "number",
    "markdown",
    "number",
    "str",
    "number",
    "str",
    "str",
    "str",
]

_GAIN_STYLE = "color: green; font-weight: bold"
_LOSS_STYLE = "color: red; font-weight: bold"


def make_default_md_1():
    """Return the page title Markdown, including the promo banner."""
    leaderboard_md = f"""
# 🏆 LLM Arena in Russian: Leaderboard
{promo_banner}
"""
    return leaderboard_md


def make_default_md_2():
    """Return the introductory Markdown shown under the title."""
    leaderboard_md = """
The LLM Arena platform is an open crowdsourcing platform for evaluating large language models (LLM) in Russian. We collect pairwise comparisons from people to rank LLMs using the Bradley-Terry model and display model ratings on the Elo scale.
Chatbot Arena in Russian depends on community participation, so please contribute by casting your vote!

- To **add your model** to the comparison, contact us on TG: [Group](https://t.me/+bFEOl-Bdmok4NGUy)
- If you **found a bug** or **have a suggestion**, contact us: [Roman](https://t.me/roman_kucev)
- You can **contribute your vote** at [llmarena.ru](https://llmarena.ru/)!
"""
    return leaderboard_md


def make_arena_leaderboard_md(arena_df, last_updated_time):
    """Return the summary Markdown (model count, vote count, update date).

    Args:
        arena_df: Leaderboard table; its ``num_battles`` column is summed.
        last_updated_time: Value interpolated after "Last updated:".
    """
    total_votes = sum(arena_df["num_battles"])
    total_models = len(arena_df)
    space = " "

    leaderboard_md = f"""
Total # of models: **{total_models}**.{space} Total # of votes: **{total_votes:,}**.{space} Last updated: {last_updated_time}.

***Rank (UB)**: model rating (upper bound), determined as one plus the number of models that are statistically better than the target model.
Model A is statistically better than Model B when the lower bound of Model A's rating is higher than the upper bound of Model B's rating (with a 95% confidence interval).
See Figure 1 below for a visualization of the confidence intervals of model ratings.
"""
    return leaderboard_md


def _percentage(part, whole):
    """Return ``part`` as a rounded percentage of ``whole`` (0 if empty)."""
    return round(part / whole * 100) if whole else 0


def make_category_arena_leaderboard_md(
    arena_df,
    arena_subset_df,
    name="site_visitors/medium_prompts:style control",
):
    """Return the Markdown describing one category's share of the arena.

    Args:
        arena_df: Baseline leaderboard table used as the 100% reference.
        arena_subset_df: Leaderboard table of the category being shown.
        name: Category name; its explanation is looked up in
            ``cat_name_to_explanation`` and falls back to the name itself.
    """
    total_votes = sum(arena_df["num_battles"])
    total_models = len(arena_df)
    space = " "
    total_subset_votes = sum(arena_subset_df["num_battles"])
    total_subset_models = len(arena_subset_df)

    # An empty baseline table used to raise ZeroDivisionError here.
    model_share = _percentage(total_subset_models, total_models)
    vote_share = _percentage(total_subset_votes, total_votes)
    explanation = cat_name_to_explanation.get(name, name)

    leaderboard_md = f"""### {explanation}
#### {space} #models: **{total_subset_models} ({model_share}%)** {space} #votes: **{total_subset_votes:,} ({vote_share}%)**{space}
"""
    return leaderboard_md


def model_hyperlink(model_name, link):
    """Return ``model_name`` as a Markdown link to ``link``.

    The previous implementation ignored ``link`` altogether, so the "Model"
    column (rendered with the ``markdown`` datatype) never contained a link.
    When there is no usable link (empty, ``None`` or ``"-"``) the plain name
    is returned, exactly as before.
    """
    if not link or link == "-":
        return f"{model_name}"
    return f"[{model_name}]({link})"


def filter_deprecated_models_plots(fig, hidden_models=None):
    """Remove deprecated models from the first trace of a Plotly figure.

    The figure is modified in place and also returned. Only ``heatmap``,
    ``scatter`` and ``bar`` traces are handled; any other trace type is left
    untouched.

    Args:
        fig: The Plotly figure object, or ``None``.
        hidden_models: A list of model names to remove. ``None`` disables
            filtering.

    Returns:
        ``None`` when ``fig`` is ``None``, otherwise ``fig``.
    """
    if fig is None:
        return None
    if hidden_models is None or not fig.data:
        return fig

    trace = fig.data[0]
    if trace.type == "heatmap":
        keep_x = ~np.isin(trace.x, hidden_models)
        keep_y = ~np.isin(trace.y, hidden_models)
        trace.update(
            {
                "x": np.asarray(trace.x)[keep_x],
                "y": np.asarray(trace.y)[keep_y],
                "z": np.asarray(trace.z)[np.ix_(keep_y, keep_x)],
            }
        )
    elif trace.type == "scatter":
        keep = ~np.isin(trace.x, hidden_models)
        new_x = np.asarray(trace.x)[keep]
        new_y = np.asarray(trace.y)[keep]
        labels = trace.text
        # ``text`` may be unset or a single string; only per-point labels
        # can be masked.
        if labels is not None and not isinstance(labels, str):
            trace.text = np.asarray(labels)[keep]
        trace.x, trace.y = new_x, new_y
        for key in ("array", "arrayminus"):
            # ``key in trace.error_y`` can be true for an unset property,
            # in which case the value is None and cannot be masked.
            if key in trace.error_y and trace.error_y[key] is not None:
                trace.error_y[key] = np.asarray(trace.error_y[key])[keep]
    elif trace.type == "bar":
        keep = ~np.isin(trace.x, hidden_models)
        # np.asarray: the coordinates are not guaranteed to be ndarrays, and
        # a tuple cannot be indexed with a boolean mask.
        new_x = np.asarray(trace.x)[keep]
        new_y = np.asarray(trace.y)[keep]
        trace.x, trace.y = new_x, new_y

    return fig


def _parse_leaderboard_cell(header, value):
    """Convert one CSV cell according to the column it belongs to.

    Numeric benchmark columns are parsed; ``"-"`` (or an empty cell) in such
    a column becomes NaN. Every other column is returned unchanged.
    """
    numeric_headers = (
        "Arena Elo rating",
        "MMLU",
        "MT-bench (win rate %)",
        "MT-bench (score)",
    )
    if header not in numeric_headers:
        return value
    if value in ("-", ""):
        return np.nan
    if header == "Arena Elo rating":
        return int(ast.literal_eval(value))
    if header == "MMLU":
        return round(ast.literal_eval(value) * 100, 1)
    if header == "MT-bench (win rate %)":
        # Drop the trailing character (the percent sign) before parsing.
        return round(ast.literal_eval(value[:-1]), 1)
    return round(ast.literal_eval(value), 2)


def load_leaderboard_table_csv(filename, add_hyperlink=True):
    """Load the model description table from a comma-separated file.

    The file is split on plain commas (no quoting support), the first line
    being the header. Blank lines are skipped.

    Args:
        filename: Path of the CSV file.
        add_hyperlink: When true, the "Model" value of each row is passed
            through ``model_hyperlink`` together with the row's "Link".

    Returns:
        A list with one ``dict`` (header -> value) per data line; an empty
        list when the file has no content.
    """
    with open(filename, encoding="utf-8") as fin:
        lines = [line for line in fin if line.strip()]
    if not lines:
        return []

    heads = [v.strip() for v in lines[0].split(",")]
    rows = []
    for line in lines[1:]:
        cells = [v.strip() for v in line.split(",")]
        item = {
            head: _parse_leaderboard_cell(head, cell)
            for head, cell in zip(heads, cells)
        }
        if add_hyperlink and "Model" in item:
            item["Model"] = model_hyperlink(item["Model"], item.get("Link"))
        rows.append(item)
    return rows


def create_ranking_str(ranking, ranking_difference):
    """Format a rank, appending an up/down arrow for a non-zero difference."""
    label = f"{int(ranking)}"
    if ranking_difference > 0:
        return f"{label} \u2191"
    if ranking_difference < 0:
        return f"{label} \u2193"
    return label


def recompute_final_ranking(arena_df):
    """Compute the confidence-interval based rank of every model.

    A model's rank is one plus the number of *other* models whose
    ``rating_q025`` is strictly greater than this model's ``rating_q975``.

    Args:
        arena_df: Table with ``rating_q025`` and ``rating_q975`` columns.

    Returns:
        A list of ranks, one per row, in row order.
    """
    lower = arena_df["rating_q025"].to_numpy()
    upper = arena_df["rating_q975"].to_numpy()
    # better[a, b] is True when model b is statistically better than model a.
    better = lower[np.newaxis, :] > upper[:, np.newaxis]
    # A model is never compared with itself.
    better &= ~np.eye(len(lower), dtype=bool)
    return (1 + better.sum(axis=1)).tolist()


def get_arena_table(
    arena_df, model_table_df, arena_subset_df=None, hidden_models=None
):
    """Build the rows of the leaderboard table.

    Args:
        arena_df: Baseline leaderboard table indexed by model key.
        model_table_df: Model description table with at least the columns
            ``key``, ``Model``, ``Organization``, ``License`` and
            ``Knowledge cutoff date``.
        arena_subset_df: Optional category table. When given, ranks come from
            it and every row carries an extra "ranking difference" value
            (baseline rank minus category rank).
        hidden_models: Optional list of model keys to drop before ranking.

    Returns:
        A list of rows (lists). Models missing from ``model_table_df`` and
        rows with malformed values are reported on stdout and skipped.
    """
    # Apply hidden_models filter first
    if hidden_models:
        arena_df = arena_df[~arena_df.index.isin(hidden_models)].copy()

    sort_spec = {"by": ["final_ranking", "rating"], "ascending": [True, False]}
    arena_df = arena_df.sort_values(**sort_spec)
    arena_df["final_ranking"] = recompute_final_ranking(arena_df)
    arena_df = arena_df.sort_values(**sort_spec)

    has_subset = arena_subset_df is not None
    if has_subset:
        # filter out models not in the arena_df
        arena_subset_df = arena_subset_df[
            arena_subset_df.index.isin(arena_df.index)
        ]
        arena_subset_df = arena_subset_df.sort_values(
            by=["rating"], ascending=False
        )
        arena_subset_df["final_ranking"] = recompute_final_ranking(
            arena_subset_df
        )
        # keep only the models in the subset in arena_df and recompute the
        # ranking; .copy() so the assignment does not hit a filtered view
        arena_df = arena_df[arena_df.index.isin(arena_subset_df.index)].copy()
        arena_df["final_ranking"] = recompute_final_ranking(arena_df)
        # join arena_df and arena_subset_df on index
        arena_df = arena_subset_df.join(
            arena_df["final_ranking"], rsuffix="_global", how="inner"
        )
        arena_df["ranking_difference"] = (
            arena_df["final_ranking_global"] - arena_df["final_ranking"]
        )
        arena_df = arena_df.sort_values(**sort_spec)
        # A list comprehension (rather than DataFrame.apply) also works when
        # the join produced no rows.
        arena_df["final_ranking"] = [
            create_ranking_str(rank, diff)
            for rank, diff in zip(
                arena_df["final_ranking"], arena_df["ranking_difference"]
            )
        ]

    arena_df["final_ranking"] = arena_df["final_ranking"].astype(str)

    values = []
    for position, (model_key, stats) in enumerate(arena_df.iterrows()):
        try:
            matches = model_table_df[model_table_df["key"] == model_key]
            if matches.empty:
                # The model table and the arena table do not necessarily
                # contain the same models.
                print(f"{model_key} - not found in the model table, skipped")
                continue
            meta = matches.iloc[0]

            row = [stats.get("final_ranking") or position + 1]
            if has_subset:
                row.append(stats.get("ranking_difference") or 0)
            row.append(meta["Model"])
            row.append(round(stats["rating"]))
            upper_diff = round(stats["rating_q975"] - stats["rating"])
            lower_diff = round(stats["rating"] - stats["rating_q025"])
            row.append(f"+{upper_diff}/-{lower_diff}")
            row.append(round(stats["num_battles"]))
            row.append(meta["Organization"])
            row.append(meta["License"])
            cutoff_date = meta["Knowledge cutoff date"]
            row.append("Unknown" if cutoff_date == "-" else cutoff_date)
            values.append(row)
        except (KeyError, IndexError, TypeError, ValueError) as err:
            # Best effort: one malformed row must not take the page down.
            traceback.print_exc()
            print(f"{model_key} - {err}")
    return values


# Key used inside the pickled Elo results -> category name shown in the UI.
key_to_category_name = {
    "full": "Overall",
    "crowdsourcing/simple_prompts": "crowdsourcing/simple_prompts",
    "site_visitors/medium_prompts": "site_visitors/medium_prompts",
    "site_visitors/medium_prompts:style control": "site_visitors/medium_prompts:style control",
}

cat_name_to_explanation = {
    "Overall": "All queries",
    "crowdsourcing/simple_prompts": "Queries collected through crowdsourcing. Mostly simple ones.",
    "site_visitors/medium_prompts": "Queries from website visitors. Contain more complex prompts.",
    "site_visitors/medium_prompts:style control": "Queries from website visitors. Contain more complex prompts. [Reduced stylistic influence](https://lmsys.org/blog/2024-08-28-style-control/) of the response on the rating.",
}

cat_name_to_baseline = {
    "Hard Prompts (English)": "English",
}

# Categories offered in the dropdown.
actual_categories = [
    # "Overall",
    # "crowdsourcing/simple_prompts",
    "site_visitors/medium_prompts",
    "site_visitors/medium_prompts:style control",
]

req_cat = "site_visitors/medium_prompts:style control"
# selected_category = req_cat if req_cat in actual_categories else "Overall"
selected_category = (
    req_cat
    if req_cat in actual_categories
    else "site_visitors/medium_prompts:style control"
)


def read_elo_file(elo_results_file, leaderboard_table_file):
    """Load the pickled Elo results and the model description table.

    Args:
        elo_results_file: Path of the pickle holding one entry per category
            key (see ``key_to_category_name``).
        leaderboard_table_file: Path of the model CSV. A falsy value yields
            an empty model table instead of an error.

    Returns:
        A tuple ``(last_updated_time, arena_dfs, category_elo_results,
        elo_results, model_table_df)`` where ``arena_dfs`` and
        ``category_elo_results`` are keyed by category *name* and
        ``last_updated_time`` is the date part of the selected category's
        ``last_updated_datetime`` (``None`` if that category is absent).
    """
    # NOTE(security): pickle.load can execute arbitrary code. The results
    # file must come from a trusted source.
    with open(elo_results_file, "rb") as fin:
        elo_results = pickle.load(fin)

    arena_dfs = {}
    category_elo_results = {}
    for key, category_name in key_to_category_name.items():
        if key not in elo_results:
            continue
        arena_dfs[category_name] = elo_results[key]["leaderboard_table_df"]
        category_elo_results[category_name] = elo_results[key]

    # ``selected_category`` is a category name, so resolve it through the
    # name-keyed mapping first; the raw-key lookup is kept as a fallback.
    selected_results = None
    if selected_category in category_elo_results:
        selected_results = category_elo_results[selected_category]
    elif selected_category in elo_results:
        selected_results = elo_results[selected_category]

    last_updated_time = None
    if selected_results is not None:
        last_updated_time = selected_results["last_updated_datetime"].split(
            " "
        )[0]

    if leaderboard_table_file:
        model_table_df = pd.DataFrame(
            load_leaderboard_table_csv(leaderboard_table_file)
        )
    else:
        model_table_df = pd.DataFrame()

    return (
        last_updated_time,
        arena_dfs,
        category_elo_results,
        elo_results,
        model_table_df,
    )


def build_leaderboard_tab(
    elo_results_file, leaderboard_table_file, show_plot=False, mirror=False
):
    """Create the leaderboard components inside the current Gradio context.

    Args:
        elo_results_file: Path of the pickled Elo results.
        leaderboard_table_file: Path of the model CSV. When falsy only the
            header Markdown is created.
        show_plot: Whether to add the four statistics figures.
        mirror: Accepted for interface compatibility; not used.

    Returns:
        ``[md_1, md_2, lb_description, category_deets, elo_display_df,
        plot_1, plot_2, plot_3, plot_4]`` when both ``show_plot`` and
        ``leaderboard_table_file`` are truthy, otherwise ``[md_1]``.
    """
    (
        last_updated_time,
        arena_dfs,
        category_elo_results,
        _,
        model_table_df,
    ) = read_elo_file(elo_results_file, leaderboard_table_file)

    arena_df = arena_dfs[selected_category]
    selected_results = category_elo_results[selected_category]
    # The "Deprecated" filter starts unticked, so the initial figures hide
    # the deprecated models just like the initial table does.
    p1, p2, p3, p4 = (
        filter_deprecated_models_plots(
            selected_results[figure_key], hidden_models=deprecated_model_name
        )
        for figure_key in (
            "win_fraction_heatmap",
            "battle_count_heatmap",
            "bootstrap_elo_rating",
            "average_win_rate_bar",
        )
    )

    default_md = make_default_md_1()
    default_md_2 = make_default_md_2()

    with gr.Row():
        with gr.Column(scale=4):
            md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
        with gr.Column(scale=1):
            gr.Button("Vote!", link="https://llmarena.ru")
    md_2 = gr.Markdown(default_md_2, elem_id="leaderboard_markdown")

    if leaderboard_table_file:
        with gr.Tabs():
            arena_table_vals = get_arena_table(
                arena_df, model_table_df, hidden_models=deprecated_model_name
            )
            with gr.Tab("Arena", id=0):
                md = make_arena_leaderboard_md(arena_df, last_updated_time)
                lb_description = gr.Markdown(
                    md, elem_id="leaderboard_markdown"
                )
                with gr.Row():
                    with gr.Column(scale=2):
                        category_dropdown = gr.Dropdown(
                            choices=actual_categories,
                            value=selected_category,
                            label="Category",
                        )
                    with gr.Column(scale=2):
                        category_checkbox = gr.CheckboxGroup(
                            ["Deprecated"],
                            label="Filter",
                            value=[],
                            info="",
                        )
                    default_category_details = (
                        make_category_arena_leaderboard_md(
                            arena_df, arena_df, name=selected_category
                        )
                    )
                    with gr.Column(scale=4, variant="panel"):
                        category_deets = gr.Markdown(
                            default_category_details, elem_id="category_deets"
                        )

                arena_vals = pd.DataFrame(
                    arena_table_vals, columns=list(_ARENA_HEADERS)
                )
                elo_display_df = gr.Dataframe(
                    headers=list(_ARENA_HEADERS),
                    datatype=list(_ARENA_DATATYPES),
                    value=arena_vals.style,
                    elem_id="arena_leaderboard_dataframe",
                    height=700,
                    column_widths=[70, 190, 100, 100, 90, 130, 150, 100],
                    wrap=True,
                )

                gr.Markdown(
                    elem_id="leaderboard_markdown",
                )

                leader_component_values[:] = [default_md, p1, p2, p3, p4]

                if show_plot:
                    more_stats_md = gr.Markdown(
                        "## More statistics on Chatbot Arena",
                        elem_id="leaderboard_header_markdown",
                    )
                    with gr.Row():
                        with gr.Column():
                            gr.Markdown(
                                "#### Figure 1: Confidence Intervals on Model Strength (via Bootstrapping)",
                                elem_id="plot-title",
                            )
                            plot_3 = gr.Plot(p3, show_label=False)
                        with gr.Column():
                            gr.Markdown(
                                "#### Figure 2: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)",
                                elem_id="plot-title",
                            )
                            plot_4 = gr.Plot(p4, show_label=False)
                    with gr.Row():
                        with gr.Column():
                            gr.Markdown(
                                "#### Figure 3: Fraction of Model A Wins for All Non-tied A vs. B Battles",
                                elem_id="plot-title",
                            )
                            plot_1 = gr.Plot(
                                p1, show_label=False, elem_id="plot-container"
                            )
                        with gr.Column():
                            gr.Markdown(
                                "#### Figure 4: Battle Count for Each Combination of Models (without Ties)",
                                elem_id="plot-title",
                            )
                            plot_2 = gr.Plot(p2, show_label=False)
        if not show_plot:
            gr.Markdown(
                """ """,
                elem_id="leaderboard_markdown",
            )

    def update_leaderboard_df(arena_table_vals):
        """Wrap category rows in a Styler that colours rank changes."""
        elo_dataframe = pd.DataFrame(
            arena_table_vals, columns=list(_ARENA_DELTA_HEADERS)
        )

        def highlight_max(s):
            # Rank strings carry an arrow when the rank moved.
            styles = []
            for v in s:
                if "\u2191" in v:
                    styles.append(_GAIN_STYLE)
                elif "\u2193" in v:
                    styles.append(_LOSS_STYLE)
                else:
                    styles.append("")
            return styles

        def highlight_rank_max(s):
            styles = []
            for v in s:
                if v > 0:
                    styles.append(_GAIN_STYLE)
                elif v < 0:
                    styles.append(_LOSS_STYLE)
                else:
                    styles.append("")
            return styles

        return elo_dataframe.style.apply(
            highlight_max, subset=["Rank* (UB)"]
        ).apply(highlight_rank_max, subset=["Delta"])

    def update_leaderboard_and_plots(category, filters):
        """Recompute the table, figures and texts for the chosen options."""
        # The files are re-read on every event on purpose:
        # filter_deprecated_models_plots edits the figures in place, so a
        # cached copy could not be "unfiltered" again.
        _, arena_dfs, category_elo_results, _, model_table_df = read_elo_file(
            elo_results_file, leaderboard_table_file
        )
        arena_subset_df = arena_dfs[category]
        arena_subset_df = arena_subset_df[
            arena_subset_df["num_battles"] > _MIN_CATEGORY_BATTLES
        ]
        elo_subset_results = category_elo_results[category]

        baseline_category = cat_name_to_baseline.get(
            category, selected_category
        )
        arena_df = arena_dfs[baseline_category]

        show_deprecated = "Deprecated" in (filters or [])
        hidden_models = None if show_deprecated else deprecated_model_name
        with_delta = category != "Overall"

        arena_values = get_arena_table(
            arena_df,
            model_table_df,
            arena_subset_df=arena_subset_df if with_delta else None,
            hidden_models=hidden_models,
        )

        # Filter plots based on deprecated models
        p1 = filter_deprecated_models_plots(
            elo_subset_results["win_fraction_heatmap"],
            hidden_models=hidden_models,
        )
        p2 = filter_deprecated_models_plots(
            elo_subset_results["battle_count_heatmap"],
            hidden_models=hidden_models,
        )
        p3 = filter_deprecated_models_plots(
            elo_subset_results["bootstrap_elo_rating"],
            hidden_models=hidden_models,
        )
        p4 = filter_deprecated_models_plots(
            elo_subset_results["average_win_rate_bar"],
            hidden_models=hidden_models,
        )

        if with_delta:
            table = gr.Dataframe(
                headers=list(_ARENA_DELTA_HEADERS),
                datatype=list(_ARENA_DELTA_DATATYPES),
                value=update_leaderboard_df(arena_values),
                elem_id="arena_leaderboard_dataframe",
                height=700,
                column_widths=[70, 70, 200, 90, 100, 90, 120, 150, 100],
                wrap=True,
            )
        else:
            table = gr.Dataframe(
                headers=list(_ARENA_HEADERS),
                datatype=list(_ARENA_DATATYPES),
                value=arena_values,
                elem_id="arena_leaderboard_dataframe",
                height=700,
                column_widths=[70, 190, 100, 100, 90, 140, 150, 100],
                wrap=True,
            )

        stats_heading = f"""## More Statistics for Chatbot Arena - {category}
"""
        leaderboard_md = make_category_arena_leaderboard_md(
            arena_df, arena_subset_df, name=category
        )
        if show_plot:
            return table, p1, p2, p3, p4, stats_heading, leaderboard_md
        # Without plots there are no figure components to update.
        return table, leaderboard_md

    if leaderboard_table_file:
        # The plot components only exist when show_plot is set; referencing
        # them unconditionally used to raise NameError for show_plot=False.
        outputs = [elo_display_df]
        if show_plot:
            outputs += [plot_1, plot_2, plot_3, plot_4, more_stats_md]
        outputs.append(category_deets)

        for trigger in (category_dropdown, category_checkbox):
            trigger.change(
                fn=update_leaderboard_and_plots,
                inputs=[category_dropdown, category_checkbox],
                outputs=outputs,
            )

    if show_plot and leaderboard_table_file:
        return [
            md_1,
            md_2,
            lb_description,
            category_deets,
            elo_display_df,
            plot_1,
            plot_2,
            plot_3,
            plot_4,
        ]
    return [md_1]


def build_demo(elo_results_file, leaderboard_table_file):
    """Build the themed Gradio ``Blocks`` app containing the leaderboard.

    The theme is loaded from ``theme.json`` in the working directory and the
    page CSS comes from the module-level ``block_css``.
    """
    text_size = gr.themes.sizes.text_lg
    theme = gr.themes.Default.load("theme.json")
    theme.text_size = text_size
    theme.set(
        button_large_text_size="40px",
        button_small_text_size="40px",
        button_large_text_weight="1000",
        button_small_text_weight="1000",
        button_shadow="*shadow_drop_lg",
        button_shadow_hover="*shadow_drop_lg",
        checkbox_label_shadow="*shadow_drop_lg",
        button_shadow_active="*shadow_inset",
        button_secondary_background_fill="*primary_300",
        button_secondary_background_fill_dark="*primary_700",
        button_secondary_background_fill_hover="*primary_200",
        button_secondary_background_fill_hover_dark="*primary_500",
        button_secondary_text_color="*primary_800",
        button_secondary_text_color_dark="white",
    )

    with gr.Blocks(
        title="LLM arena: leaderboard",
        theme=theme,
        css=block_css,
    ) as demo:
        build_leaderboard_tab(
            elo_results_file,
            leaderboard_table_file,
            show_plot=True,
            mirror=True,
        )
    return demo


block_css = """
#notice_markdown .prose {
    font-size: 110% !important;
}
#notice_markdown th {
    display: none;
}
#notice_markdown td {
    padding-top: 6px;
    padding-bottom: 6px;
}
#arena_leaderboard_dataframe table {
    font-size: 110%;
}
#full_leaderboard_dataframe table {
    font-size: 110%;
}
#model_description_markdown {
    font-size: 110% !important;
}
#leaderboard_markdown .prose {
    font-size: 110% !important;
}
#leaderboard_markdown td {
    padding-top: 6px;
    padding-bottom: 6px;
}
#leaderboard_dataframe td {
    line-height: 0.1em;
}
#about_markdown .prose {
    font-size: 110% !important;
}
#ack_markdown .prose {
    font-size: 110% !important;
}
#chatbot .prose {
    font-size: 105% !important;
}
.sponsor-image-about img {
    margin: 0 20px;
    margin-top: 20px;
    height: 40px;
    max-height: 100%;
    width: auto;
    float: left;
}

.chatbot h1, h2, h3 {
    margin-top: 8px; /* Adjust the value as needed */
    margin-bottom: 0px; /* Adjust the value as needed */
    padding-bottom: 0px;
}

.chatbot h1 {
    font-size: 130%;
}
.chatbot h2 {
    font-size: 120%;
}
.chatbot h3 {
    font-size: 110%;
}
.chatbot p:not(:first-child) {
    margin-top: 8px;
}

.typing {
    display: inline-block;
}

.cursor {
    display: inline-block;
    width: 7px;
    height: 1em;
    background-color: black;
    vertical-align: middle;
    animation: blink 1s infinite;
}

.dark .cursor {
    display: inline-block;
    width: 7px;
    height: 1em;
    background-color: white;
    vertical-align: middle;
    animation: blink 1s infinite;
}

@keyframes blink {
    0%, 50% { opacity: 1; }
    50.1%, 100% { opacity: 0; }
}

.app {
    max-width: 100% !important;
    padding: 20px !important;
}

a {
    color: #1976D2; /* Your current link color, a shade of blue */
    text-decoration: none; /* Removes underline from links */
}
a:hover {
    color: #63A4FF; /* This can be any color you choose for hover */
    text-decoration: underline; /* Adds underline on hover */
}
"""


def _latest_numbered_file(prefix, suffix):
    """Return the ``<prefix><number><suffix>`` file with the largest number.

    Only the current working directory is searched.

    Raises:
        FileNotFoundError: If no file matches the pattern.
    """
    pattern = f"{prefix}*{suffix}"
    candidates = glob.glob(pattern)
    if not candidates:
        raise FileNotFoundError(
            f"No file matching '{pattern}' in the working directory"
        )
    candidates.sort(key=lambda path: int(path[len(prefix):-len(suffix)]))
    return candidates[-1]


def main():
    """Parse the command line and serve the most recent leaderboard data."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--share", action="store_true")
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--port", type=int, default=7860)
    args = parser.parse_args()

    elo_result_file = _latest_numbered_file("elo_results_", ".pkl")
    leaderboard_table_file = _latest_numbered_file(
        "leaderboard_table_", ".csv"
    )

    demo = build_demo(elo_result_file, leaderboard_table_file)
    # The parsed options used to be ignored; hand them to Gradio.
    demo.launch(
        server_name=args.host,
        server_port=args.port,
        share=args.share,
        show_api=False,
    )


if __name__ == "__main__":
    main()