This view is limited to 50 files because it contains too many changes. See the raw diff here.
Files changed (50) hide show
  1. README.md +4 -2
  2. app.py +424 -38
  3. arena_hard_auto_leaderboard_v0.1.csv +0 -61
  4. elo_results_20240327.pkl +0 -3
  5. elo_results_20240410.pkl → elo_results_20240329.pkl +2 -2
  6. elo_results_20240403.pkl +0 -3
  7. elo_results_20240409.pkl +0 -3
  8. elo_results_20240411.pkl +0 -3
  9. elo_results_20240413.pkl +0 -3
  10. elo_results_20240418.pkl +0 -3
  11. elo_results_20240419.pkl +0 -3
  12. elo_results_20240422.pkl +0 -3
  13. elo_results_20240426.pkl +0 -3
  14. elo_results_20240501.pkl +0 -3
  15. elo_results_20240508.pkl +0 -3
  16. elo_results_20240515.pkl +0 -3
  17. elo_results_20240516.pkl +0 -3
  18. elo_results_20240519.pkl +0 -3
  19. elo_results_20240520.pkl +0 -3
  20. elo_results_20240527.pkl +0 -3
  21. elo_results_20240602.pkl +0 -3
  22. elo_results_20240606.pkl +0 -3
  23. elo_results_20240611.pkl +0 -3
  24. elo_results_20240617.pkl +0 -3
  25. elo_results_20240621.pkl +0 -3
  26. elo_results_20240623.pkl +0 -3
  27. elo_results_20240626.pkl +0 -3
  28. elo_results_20240629.pkl +0 -3
  29. elo_results_20240706.pkl +0 -3
  30. elo_results_20240708.pkl +0 -3
  31. elo_results_20240716.pkl +0 -3
  32. elo_results_20240722.pkl +0 -3
  33. elo_results_20240725.pkl +0 -3
  34. elo_results_20240730.pkl +0 -3
  35. elo_results_20240731.pkl +0 -3
  36. elo_results_20240801.pkl +0 -3
  37. elo_results_20240805.pkl +0 -3
  38. elo_results_20240806.pkl +0 -3
  39. elo_results_20240813.pkl +0 -3
  40. elo_results_20240822.pkl +0 -3
  41. elo_results_20240823.pkl +0 -3
  42. elo_results_20240827.pkl +0 -3
  43. elo_results_20240828.pkl +0 -3
  44. elo_results_20240904.pkl +0 -3
  45. elo_results_20240915.pkl +0 -3
  46. elo_results_20240917.pkl +0 -3
  47. elo_results_20240927.pkl +0 -3
  48. index.html +0 -12
  49. leaderboard_table_20240404.csv → leaderboard_table_20240329.csv +1 -4
  50. leaderboard_table_20240409.csv +0 -95
README.md CHANGED
@@ -1,13 +1,15 @@
1
  ---
2
- title: Chatbot Arena Leaderboard
3
  emoji: 🏆🤖
4
  colorFrom: indigo
5
  colorTo: green
6
  sdk: gradio
 
 
7
  pinned: false
8
  license: apache-2.0
9
  tags:
10
  - leaderboard
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: LMSys Chatbot Arena Leaderboard
3
  emoji: 🏆🤖
4
  colorFrom: indigo
5
  colorTo: green
6
  sdk: gradio
7
+ sdk_version: 3.50.2
8
+ app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
  tags:
12
  - leaderboard
13
  ---
14
 
15
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,61 +1,451 @@
1
- from fastchat.serve.monitor.monitor import build_leaderboard_tab, build_basic_stats_tab, basic_component_values, leader_component_values
2
- from fastchat.utils import build_logger, get_window_url_params_js
3
-
4
  import argparse
5
  import glob
6
- import re
 
7
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
 
10
  def load_demo(url_params, request: gr.Request):
11
  logger.info(f"load_demo. ip: {request.client.host}. params: {url_params}")
12
  return basic_component_values + leader_component_values
13
 
14
- def build_demo(elo_results_file, leaderboard_table_file):
15
- from fastchat.serve.gradio_web_server import block_css
16
 
17
- text_size = gr.themes.sizes.text_lg
18
- # load theme from theme.json
19
- theme = gr.themes.Default.load("theme.json")
20
- # set text size to large
21
- theme.text_size = text_size
22
- theme.set(
23
- button_large_text_size="40px",
24
- button_small_text_size="40px",
25
- button_large_text_weight="1000",
26
- button_small_text_weight="1000",
27
- button_shadow="*shadow_drop_lg",
28
- button_shadow_hover="*shadow_drop_lg",
29
- checkbox_label_shadow="*shadow_drop_lg",
30
- button_shadow_active="*shadow_inset",
31
- button_secondary_background_fill="*primary_300",
32
- button_secondary_background_fill_dark="*primary_700",
33
- button_secondary_background_fill_hover="*primary_200",
34
- button_secondary_background_fill_hover_dark="*primary_500",
35
- button_secondary_text_color="*primary_800",
36
- button_secondary_text_color_dark="white",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  )
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  with gr.Blocks(
40
  title="Chatbot Arena Leaderboard",
41
- theme=theme,
42
  css=block_css,
43
  ) as demo:
44
  leader_components = build_leaderboard_tab(
45
- elo_results_file, leaderboard_table_file, arena_hard_file, show_plot=True, mirror=True
46
  )
47
  return demo
48
 
 
49
  if __name__ == "__main__":
50
  parser = argparse.ArgumentParser()
51
  parser.add_argument("--share", action="store_true")
52
- parser.add_argument("--host", default="0.0.0.0")
53
- parser.add_argument("--port", type=int, default=7860)
54
  args = parser.parse_args()
55
 
56
- logger = build_logger("monitor", "monitor.log")
57
- logger.info(f"args: {args}")
58
-
59
  elo_result_files = glob.glob("elo_results_*.pkl")
60
  elo_result_files.sort(key=lambda x: int(x[12:-4]))
61
  elo_result_file = elo_result_files[-1]
@@ -63,10 +453,6 @@ if __name__ == "__main__":
63
  leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
64
  leaderboard_table_files.sort(key=lambda x: int(x[18:-4]))
65
  leaderboard_table_file = leaderboard_table_files[-1]
66
-
67
- arena_hard_files = glob.glob("arena_hard_auto_leaderboard_*.csv")
68
- arena_hard_files.sort(key=lambda x: float(x[29:32]))
69
- arena_hard_file = arena_hard_files[-1]
70
 
71
  demo = build_demo(elo_result_file, leaderboard_table_file)
72
- demo.launch(share=args.share, server_name=args.host, server_port=args.port)
 
1
+ """A gradio app that renders a static leaderboard. This is used for Hugging Face Space."""
2
+ import ast
 
3
  import argparse
4
  import glob
5
+ import pickle
6
+
7
  import gradio as gr
8
+ import numpy as np
9
+ import pandas as pd
10
+
11
+
12
+ # notebook_url = "https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing"
13
+ notebook_url = "https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH#scrollTo=o_CpbkGEbhrK"
14
+
15
+
16
+ basic_component_values = [None] * 6
17
+ leader_component_values = [None] * 5
18
+
19
+
20
+ def make_default_md(arena_df, elo_results):
21
+ total_votes = sum(arena_df["num_battles"]) // 2
22
+ total_models = len(arena_df)
23
+
24
+ leaderboard_md = f"""
25
+ # 🏆 LMSYS Chatbot Arena Leaderboard
26
+ | [Vote](https://chat.lmsys.org) | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
27
+
28
+ LMSYS [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) is a crowdsourced open platform for LLM evals.
29
+ We've collected over **500,000** human preference votes to rank LLMs with the Elo ranking system.
30
+ """
31
+ return leaderboard_md
32
+
33
+
34
+ def make_arena_leaderboard_md(arena_df):
35
+ total_votes = sum(arena_df["num_battles"]) // 2
36
+ total_models = len(arena_df)
37
+
38
+ leaderboard_md = f"""
39
+ Total #models: **{total_models}**. Total #votes: **{total_votes}**. Last updated: March 29, 2024.
40
+
41
+ Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! Find more analysis in the [notebook]({notebook_url}).
42
+ """
43
+ return leaderboard_md
44
+
45
+
46
+ def make_full_leaderboard_md(elo_results):
47
+ leaderboard_md = f"""
48
+ Three benchmarks are displayed: **Arena Elo**, **MT-Bench** and **MMLU**.
49
+ - [Chatbot Arena](https://chat.lmsys.org/?arena) - a crowdsourced, randomized battle platform. We use 500K+ user votes to compute Elo ratings.
50
+ - [MT-Bench](https://arxiv.org/abs/2306.05685): a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
51
+ - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot): a test to measure a model's multitask accuracy on 57 tasks.
52
+
53
+ 💻 Code: The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge).
54
+ The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval).
55
+ Higher values are better for all benchmarks. Empty cells mean not available.
56
+ """
57
+ return leaderboard_md
58
+
59
+
60
+ def make_leaderboard_md_live(elo_results):
61
+ leaderboard_md = f"""
62
+ # Leaderboard
63
+ Last updated: {elo_results["last_updated_datetime"]}
64
+ {elo_results["leaderboard_table"]}
65
+ """
66
+ return leaderboard_md
67
+
68
+
69
+ def update_elo_components(max_num_files, elo_results_file):
70
+ log_files = get_log_files(max_num_files)
71
+
72
+ # Leaderboard
73
+ if elo_results_file is None: # Do live update
74
+ battles = clean_battle_data(log_files)
75
+ elo_results = report_elo_analysis_results(battles)
76
+
77
+ leader_component_values[0] = make_leaderboard_md_live(elo_results)
78
+ leader_component_values[1] = elo_results["win_fraction_heatmap"]
79
+ leader_component_values[2] = elo_results["battle_count_heatmap"]
80
+ leader_component_values[3] = elo_results["bootstrap_elo_rating"]
81
+ leader_component_values[4] = elo_results["average_win_rate_bar"]
82
+
83
+ # Basic stats
84
+ basic_stats = report_basic_stats(log_files)
85
+ md0 = f"Last updated: {basic_stats['last_updated_datetime']}"
86
+
87
+ md1 = "### Action Histogram\n"
88
+ md1 += basic_stats["action_hist_md"] + "\n"
89
+
90
+ md2 = "### Anony. Vote Histogram\n"
91
+ md2 += basic_stats["anony_vote_hist_md"] + "\n"
92
+
93
+ md3 = "### Model Call Histogram\n"
94
+ md3 += basic_stats["model_hist_md"] + "\n"
95
+
96
+ md4 = "### Model Call (Last 24 Hours)\n"
97
+ md4 += basic_stats["num_chats_last_24_hours"] + "\n"
98
+
99
+ basic_component_values[0] = md0
100
+ basic_component_values[1] = basic_stats["chat_dates_bar"]
101
+ basic_component_values[2] = md1
102
+ basic_component_values[3] = md2
103
+ basic_component_values[4] = md3
104
+ basic_component_values[5] = md4
105
+
106
+
107
+ def update_worker(max_num_files, interval, elo_results_file):
108
+ while True:
109
+ tic = time.time()
110
+ update_elo_components(max_num_files, elo_results_file)
111
+ durtaion = time.time() - tic
112
+ print(f"update duration: {durtaion:.2f} s")
113
+ time.sleep(max(interval - durtaion, 0))
114
 
115
 
116
  def load_demo(url_params, request: gr.Request):
117
  logger.info(f"load_demo. ip: {request.client.host}. params: {url_params}")
118
  return basic_component_values + leader_component_values
119
 
 
 
120
 
121
+ def model_hyperlink(model_name, link):
122
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
123
+
124
+
125
+ def load_leaderboard_table_csv(filename, add_hyperlink=True):
126
+ lines = open(filename).readlines()
127
+ heads = [v.strip() for v in lines[0].split(",")]
128
+ rows = []
129
+ for i in range(1, len(lines)):
130
+ row = [v.strip() for v in lines[i].split(",")]
131
+ for j in range(len(heads)):
132
+ item = {}
133
+ for h, v in zip(heads, row):
134
+ if h == "Arena Elo rating":
135
+ if v != "-":
136
+ v = int(ast.literal_eval(v))
137
+ else:
138
+ v = np.nan
139
+ elif h == "MMLU":
140
+ if v != "-":
141
+ v = round(ast.literal_eval(v) * 100, 1)
142
+ else:
143
+ v = np.nan
144
+ elif h == "MT-bench (win rate %)":
145
+ if v != "-":
146
+ v = round(ast.literal_eval(v[:-1]), 1)
147
+ else:
148
+ v = np.nan
149
+ elif h == "MT-bench (score)":
150
+ if v != "-":
151
+ v = round(ast.literal_eval(v), 2)
152
+ else:
153
+ v = np.nan
154
+ item[h] = v
155
+ if add_hyperlink:
156
+ item["Model"] = model_hyperlink(item["Model"], item["Link"])
157
+ rows.append(item)
158
+
159
+ return rows
160
+
161
+
162
+ def build_basic_stats_tab():
163
+ empty = "Loading ..."
164
+ basic_component_values[:] = [empty, None, empty, empty, empty, empty]
165
+
166
+ md0 = gr.Markdown(empty)
167
+ gr.Markdown("#### Figure 1: Number of model calls and votes")
168
+ plot_1 = gr.Plot(show_label=False)
169
+ with gr.Row():
170
+ with gr.Column():
171
+ md1 = gr.Markdown(empty)
172
+ with gr.Column():
173
+ md2 = gr.Markdown(empty)
174
+ with gr.Row():
175
+ with gr.Column():
176
+ md3 = gr.Markdown(empty)
177
+ with gr.Column():
178
+ md4 = gr.Markdown(empty)
179
+ return [md0, plot_1, md1, md2, md3, md4]
180
+
181
+ def get_full_table(arena_df, model_table_df):
182
+ values = []
183
+ for i in range(len(model_table_df)):
184
+ row = []
185
+ model_key = model_table_df.iloc[i]["key"]
186
+ model_name = model_table_df.iloc[i]["Model"]
187
+ # model display name
188
+ row.append(model_name)
189
+ if model_key in arena_df.index:
190
+ idx = arena_df.index.get_loc(model_key)
191
+ row.append(round(arena_df.iloc[idx]["rating"]))
192
+ else:
193
+ row.append(np.nan)
194
+ row.append(model_table_df.iloc[i]["MT-bench (score)"])
195
+ row.append(model_table_df.iloc[i]["MMLU"])
196
+ # Organization
197
+ row.append(model_table_df.iloc[i]["Organization"])
198
+ # license
199
+ row.append(model_table_df.iloc[i]["License"])
200
+
201
+ values.append(row)
202
+ values.sort(key=lambda x: -x[1] if not np.isnan(x[1]) else 1e9)
203
+ return values
204
+
205
+
206
+ def get_arena_table(arena_df, model_table_df):
207
+ # sort by rating
208
+ arena_df = arena_df.sort_values(by=["rating"], ascending=False)
209
+ values = []
210
+ for i in range(len(arena_df)):
211
+ row = []
212
+ model_key = arena_df.index[i]
213
+ model_name = model_table_df[model_table_df["key"] == model_key]["Model"].values[
214
+ 0
215
+ ]
216
+
217
+ # rank
218
+ ranking = arena_df.iloc[i].get("final_ranking") or i+1
219
+ row.append(ranking)
220
+ # model display name
221
+ row.append(model_name)
222
+ # elo rating
223
+ row.append(round(arena_df.iloc[i]["rating"]))
224
+ upper_diff = round(
225
+ arena_df.iloc[i]["rating_q975"] - arena_df.iloc[i]["rating"]
226
+ )
227
+ lower_diff = round(
228
+ arena_df.iloc[i]["rating"] - arena_df.iloc[i]["rating_q025"]
229
+ )
230
+ row.append(f"+{upper_diff}/-{lower_diff}")
231
+ # num battles
232
+ row.append(round(arena_df.iloc[i]["num_battles"]))
233
+ # Organization
234
+ row.append(
235
+ model_table_df[model_table_df["key"] == model_key]["Organization"].values[0]
236
+ )
237
+ # license
238
+ row.append(
239
+ model_table_df[model_table_df["key"] == model_key]["License"].values[0]
240
+ )
241
+
242
+ cutoff_date = model_table_df[model_table_df["key"] == model_key]["Knowledge cutoff date"].values[0]
243
+ if cutoff_date == "-":
244
+ row.append("Unknown")
245
+ else:
246
+ row.append(cutoff_date)
247
+ values.append(row)
248
+ return values
249
+
250
+ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
251
+ if elo_results_file is None: # Do live update
252
+ default_md = "Loading ..."
253
+ p1 = p2 = p3 = p4 = None
254
+ else:
255
+ with open(elo_results_file, "rb") as fin:
256
+ elo_results = pickle.load(fin)
257
+ if "full" in elo_results:
258
+ elo_results = elo_results["full"]
259
+
260
+ p1 = elo_results["win_fraction_heatmap"]
261
+ p2 = elo_results["battle_count_heatmap"]
262
+ p3 = elo_results["bootstrap_elo_rating"]
263
+ p4 = elo_results["average_win_rate_bar"]
264
+ arena_df = elo_results["leaderboard_table_df"]
265
+ default_md = make_default_md(arena_df, elo_results)
266
+
267
+ md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
268
+ if leaderboard_table_file:
269
+ data = load_leaderboard_table_csv(leaderboard_table_file)
270
+ model_table_df = pd.DataFrame(data)
271
+
272
+ with gr.Tabs() as tabs:
273
+ # arena table
274
+ arena_table_vals = get_arena_table(arena_df, model_table_df)
275
+ with gr.Tab("Arena Elo", id=0):
276
+ md = make_arena_leaderboard_md(arena_df)
277
+ gr.Markdown(md, elem_id="leaderboard_markdown")
278
+ gr.Dataframe(
279
+ headers=[
280
+ "Rank",
281
+ "🤖 Model",
282
+ "⭐ Arena Elo",
283
+ "📊 95% CI",
284
+ "🗳️ Votes",
285
+ "Organization",
286
+ "License",
287
+ "Knowledge Cutoff",
288
+ ],
289
+ datatype=[
290
+ "str",
291
+ "markdown",
292
+ "number",
293
+ "str",
294
+ "number",
295
+ "str",
296
+ "str",
297
+ "str",
298
+ ],
299
+ value=arena_table_vals,
300
+ elem_id="arena_leaderboard_dataframe",
301
+ height=700,
302
+ column_widths=[50, 200, 120, 100, 100, 150, 150, 100],
303
+ wrap=True,
304
+ )
305
+ with gr.Tab("Full Leaderboard", id=1):
306
+ md = make_full_leaderboard_md(elo_results)
307
+ gr.Markdown(md, elem_id="leaderboard_markdown")
308
+ full_table_vals = get_full_table(arena_df, model_table_df)
309
+ gr.Dataframe(
310
+ headers=[
311
+ "🤖 Model",
312
+ "⭐ Arena Elo",
313
+ "📈 MT-bench",
314
+ "📚 MMLU",
315
+ "Organization",
316
+ "License",
317
+ ],
318
+ datatype=["markdown", "number", "number", "number", "str", "str"],
319
+ value=full_table_vals,
320
+ elem_id="full_leaderboard_dataframe",
321
+ column_widths=[200, 100, 100, 100, 150, 150],
322
+ height=700,
323
+ wrap=True,
324
+ )
325
+ if not show_plot:
326
+ gr.Markdown(
327
+ """ ## Visit our [HF space](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) for more analysis!
328
+ If you want to see more models, please help us [add them](https://github.com/lm-sys/FastChat/blob/main/docs/arena.md#how-to-add-a-new-model).
329
+ """,
330
+ elem_id="leaderboard_markdown",
331
+ )
332
+ else:
333
+ pass
334
+
335
+ gr.Markdown(
336
+ f"""Note: we take the 95% confidence interval into account when determining a model's ranking.
337
+ A model is ranked higher only if its lower bound of model score is higher than the upper bound of the other model's score.
338
+ See Figure 3 below for visualization of the confidence intervals.
339
+ """,
340
+ elem_id="leaderboard_markdown"
341
  )
342
 
343
+ leader_component_values[:] = [default_md, p1, p2, p3, p4]
344
+
345
+ if show_plot:
346
+ gr.Markdown(
347
+ f"""## More Statistics for Chatbot Arena\n
348
+ Below are figures for more statistics. The code for generating them is also included in this [notebook]({notebook_url}).
349
+ You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
350
+ """,
351
+ elem_id="leaderboard_markdown"
352
+ )
353
+ with gr.Row():
354
+ with gr.Column():
355
+ gr.Markdown(
356
+ "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles"
357
+ )
358
+ plot_1 = gr.Plot(p1, show_label=False)
359
+ with gr.Column():
360
+ gr.Markdown(
361
+ "#### Figure 2: Battle Count for Each Combination of Models (without Ties)"
362
+ )
363
+ plot_2 = gr.Plot(p2, show_label=False)
364
+ with gr.Row():
365
+ with gr.Column():
366
+ gr.Markdown(
367
+ "#### Figure 3: Confidence Intervals on Model Strength (via Bootstrapping)"
368
+ )
369
+ plot_3 = gr.Plot(p3, show_label=False)
370
+ with gr.Column():
371
+ gr.Markdown(
372
+ "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)"
373
+ )
374
+ plot_4 = gr.Plot(p4, show_label=False)
375
+
376
+ gr.Markdown(acknowledgment_md)
377
+
378
+ if show_plot:
379
+ return [md_1, plot_1, plot_2, plot_3, plot_4]
380
+ return [md_1]
381
+
382
+ block_css = """
383
+ #notice_markdown {
384
+ font-size: 104%
385
+ }
386
+ #notice_markdown th {
387
+ display: none;
388
+ }
389
+ #notice_markdown td {
390
+ padding-top: 6px;
391
+ padding-bottom: 6px;
392
+ }
393
+ #leaderboard_markdown {
394
+ font-size: 104%
395
+ }
396
+ #leaderboard_markdown td {
397
+ padding-top: 6px;
398
+ padding-bottom: 6px;
399
+ }
400
+ #leaderboard_dataframe td {
401
+ line-height: 0.1em;
402
+ }
403
+ footer {
404
+ display:none !important
405
+ }
406
+ .sponsor-image-about img {
407
+ margin: 0 20px;
408
+ margin-top: 20px;
409
+ height: 40px;
410
+ max-height: 100%;
411
+ width: auto;
412
+ float: left;
413
+ }
414
+ """
415
+
416
+ acknowledgment_md = """
417
+ ### Acknowledgment
418
+ We thank [Kaggle](https://www.kaggle.com/), [MBZUAI](https://mbzuai.ac.ae/), [a16z](https://www.a16z.com/), [Together AI](https://www.together.ai/), [Anyscale](https://www.anyscale.com/), [HuggingFace](https://huggingface.co/) for their generous [sponsorship](https://lmsys.org/donations/).
419
+
420
+ <div class="sponsor-image-about">
421
+ <img src="https://storage.googleapis.com/public-arena-asset/kaggle.png" alt="Kaggle">
422
+ <img src="https://storage.googleapis.com/public-arena-asset/mbzuai.jpeg" alt="MBZUAI">
423
+ <img src="https://storage.googleapis.com/public-arena-asset/a16z.jpeg" alt="a16z">
424
+ <img src="https://storage.googleapis.com/public-arena-asset/together.png" alt="Together AI">
425
+ <img src="https://storage.googleapis.com/public-arena-asset/anyscale.png" alt="AnyScale">
426
+ <img src="https://storage.googleapis.com/public-arena-asset/huggingface.png" alt="HuggingFace">
427
+ </div>
428
+ """
429
+
430
+ def build_demo(elo_results_file, leaderboard_table_file):
431
+ text_size = gr.themes.sizes.text_lg
432
+
433
  with gr.Blocks(
434
  title="Chatbot Arena Leaderboard",
435
+ theme=gr.themes.Base(text_size=text_size),
436
  css=block_css,
437
  ) as demo:
438
  leader_components = build_leaderboard_tab(
439
+ elo_results_file, leaderboard_table_file, show_plot=True
440
  )
441
  return demo
442
 
443
+
444
  if __name__ == "__main__":
445
  parser = argparse.ArgumentParser()
446
  parser.add_argument("--share", action="store_true")
 
 
447
  args = parser.parse_args()
448
 
 
 
 
449
  elo_result_files = glob.glob("elo_results_*.pkl")
450
  elo_result_files.sort(key=lambda x: int(x[12:-4]))
451
  elo_result_file = elo_result_files[-1]
 
453
  leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
454
  leaderboard_table_files.sort(key=lambda x: int(x[18:-4]))
455
  leaderboard_table_file = leaderboard_table_files[-1]
 
 
 
 
456
 
457
  demo = build_demo(elo_result_file, leaderboard_table_file)
458
+ demo.launch(share=args.share)
arena_hard_auto_leaderboard_v0.1.csv DELETED
@@ -1,61 +0,0 @@
1
- model,score,rating_q025,rating_q975,CI,avg_tokens,date
2
- gpt-4-turbo-2024-04-09,82.63,80.75,84.6,"(1.9, 2.0)",662.0,2024-07-31
3
- claude-3-5-sonnet-20240620,79.35,77.25,80.62,"(2.1, 1.3)",567.0,2024-07-31
4
- gpt-4o-2024-05-13,79.21,77.42,80.71,"(1.8, 1.5)",696.0,2024-07-31
5
- gpt-4-0125-preview,77.96,75.94,79.9,"(2.0, 1.9)",619.0,2024-07-31
6
- athene-70b-0725,76.83,74.84,78.74,"(2.0, 1.9)",683.0,2024-07-31
7
- gpt-4o-mini-2024-07-18,74.94,72.66,77.07,"(2.3, 2.1)",668.0,2024-07-31
8
- gemini-1.5-pro-api-0514,71.96,69.62,74.62,"(2.3, 2.7)",676.0,2024-07-31
9
- yi-large-preview,71.48,69.02,73.37,"(2.5, 1.9)",720.0,2024-07-31
10
- mistral-large-2407,70.42,68.11,72.43,"(2.3, 2.0)",623.0,2024-07-31
11
- llama-3.1-405b-instruct,64.09,61.43,66.55,"(2.7, 2.5)",633.0,2024-07-31
12
- glm-4-0520,63.84,61.28,66.19,"(2.6, 2.3)",636.0,2024-07-31
13
- yi-large,63.7,61.76,65.86,"(1.9, 2.2)",626.0,2024-07-31
14
- deepseek-coder-v2,62.3,59.82,64.72,"(2.5, 2.4)",578.0,2024-07-31
15
- claude-3-opus-20240229,60.36,57.56,62.34,"(2.8, 2.0)",541.0,2024-07-31
16
- gemma-2-27b-it,57.51,55.11,60.12,"(2.4, 2.6)",577.0,2024-07-31
17
- llama-3.1-70b-instruct,55.73,52.85,58.2,"(2.9, 2.5)",628.0,2024-07-31
18
- glm-4-0116,55.72,53.83,58.16,"(1.9, 2.4)",622.0,2024-07-31
19
- gemini-1.5-pro-api-0409-preview,53.37,51.13,56.66,"(2.2, 3.3)",478.0,2024-07-31
20
- glm-4-air,50.88,48.62,53.21,"(2.3, 2.3)",619.0,2024-07-31
21
- gpt-4-0314,50.0,50.0,50.0,"(0.0, 0.0)",423.0,2024-07-31
22
- gemini-1.5-flash-api-0514,49.61,47.46,52.17,"(2.1, 2.6)",642.0,2024-07-31
23
- qwen2-72b-instruct,46.86,44.57,49.29,"(2.3, 2.4)",515.0,2024-07-31
24
- claude-3-sonnet-20240229,46.8,44.12,49.04,"(2.7, 2.2)",552.0,2024-07-31
25
- llama-3-70b-instruct,46.57,43.84,49.18,"(2.7, 2.6)",591.0,2024-07-31
26
- claude-3-haiku-20240307,41.47,39.57,44.02,"(1.9, 2.6)",505.0,2024-07-31
27
- gpt-4-0613,37.9,35.6,40.36,"(2.3, 2.5)",354.0,2024-07-31
28
- mistral-large-2402,37.71,34.81,39.77,"(2.9, 2.1)",400.0,2024-07-31
29
- mixtral-8x22b-instruct-v0.1,36.36,34.21,38.55,"(2.1, 2.2)",430.0,2024-07-31
30
- qwen1.5-72b-chat,36.12,33.88,38.15,"(2.2, 2.0)",474.0,2024-07-31
31
- phi-3-medium-4k-instruct,33.37,31.26,35.14,"(2.1, 1.8)",517.0,2024-07-31
32
- command-r-plus,33.07,30.85,35.12,"(2.2, 2.0)",541.0,2024-07-31
33
- mistral-medium,31.9,29.66,34.31,"(2.2, 2.4)",485.0,2024-07-31
34
- phi-3-small-8k-instruct,29.77,27.94,31.97,"(1.8, 2.2)",568.0,2024-07-31
35
- mistral-next,27.37,25.4,29.09,"(2.0, 1.7)",297.0,2024-07-31
36
- gpt-3.5-turbo-0613,24.82,22.54,26.29,"(2.3, 1.5)",401.0,2024-07-31
37
- dbrx-instruct-preview,24.63,22.33,26.83,"(2.3, 2.2)",415.0,2024-07-31
38
- claude-2.0,23.99,21.71,25.65,"(2.3, 1.7)",295.0,2024-07-31
39
- mixtral-8x7b-instruct-v0.1,23.4,21.38,25.41,"(2.0, 2.0)",457.0,2024-07-31
40
- gpt-3.5-turbo-0125,23.34,21.67,25.27,"(1.7, 1.9)",329.0,2024-07-31
41
- yi-34b-chat,23.15,20.75,24.7,"(2.4, 1.6)",611.0,2024-07-31
42
- starling-lm-7b-beta,23.01,20.81,24.66,"(2.2, 1.6)",530.0,2024-07-31
43
- claude-2.1,22.77,20.65,25.43,"(2.1, 2.7)",290.0,2024-07-31
44
- llama-3.1-8b-instruct,21.34,19.71,23.09,"(1.6, 1.8)",861.0,2024-07-31
45
- snorkel-mistral-pairrm-dpo,20.73,19.04,22.05,"(1.7, 1.3)",564.0,2024-07-31
46
- llama-3-8b-instruct,20.56,18.82,22.61,"(1.7, 2.1)",585.0,2024-07-31
47
- gpt-3.5-turbo-1106,18.87,17.06,20.58,"(1.8, 1.7)",285.0,2024-07-31
48
- gpt-3.5-turbo-0314,18.05,16.57,20.06,"(1.5, 2.0)",334.0,2024-07-31
49
- gemini-pro,17.8,15.96,19.32,"(1.8, 1.5)",322.0,2024-07-31
50
- snowflake-arctic-instruct,17.61,16.12,19.27,"(1.5, 1.7)",365.0,2024-07-31
51
- command-r,17.02,15.73,18.51,"(1.3, 1.5)",432.0,2024-07-31
52
- phi-3-mini-128k-instruct,15.43,13.94,17.02,"(1.5, 1.6)",609.0,2024-07-31
53
- tulu-2-dpo-70b,14.99,13.05,16.82,"(1.9, 1.8)",550.0,2024-07-31
54
- starling-lm-7b-alpha,12.8,11.23,14.5,"(1.6, 1.7)",483.0,2024-07-31
55
- mistral-7b-instruct,12.57,11.05,14.11,"(1.5, 1.5)",541.0,2024-07-31
56
- gemma-1.1-7b-it,12.09,10.61,13.43,"(1.5, 1.3)",341.0,2024-07-31
57
- llama-2-70b-chat,11.55,10.02,13.01,"(1.5, 1.5)",595.0,2024-07-31
58
- vicuna-33b,8.63,7.59,9.84,"(1.0, 1.2)",451.0,2024-07-31
59
- gemma-7b-it,7.47,6.5,8.6,"(1.0, 1.1)",378.0,2024-07-31
60
- gemma-1.1-2b-it,3.37,2.74,4.14,"(0.6, 0.8)",316.0,2024-07-31
61
- gemma-2b-it,3.0,2.33,3.67,"(0.7, 0.7)",369.0,2024-07-31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
elo_results_20240327.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:bab4e9fa00e9d7c8244723993174af2c4f35ffc8487cc3059504b72658f06f43
3
- size 457743
 
 
 
 
elo_results_20240410.pkl → elo_results_20240329.pkl RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a5757ab9692c6121451f2c787700507fe6b866837329ab0a47a9003a274338f
3
- size 120963
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f4c037f68c9ddbf27b70b1cb333ca37bf70ff9a3cddad7a93cd62bca709cd77
3
+ size 115776
elo_results_20240403.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce8cebf41da8c06eee0f37156e01be83cc43182e0f00444311b4ad97a83154be
3
- size 690286
 
 
 
 
elo_results_20240409.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e6774f780b63f569666e9a85b12eddceef3af75e1d1799ff7c6e0529102950c3
3
- size 119947
 
 
 
 
elo_results_20240411.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:fada8d86ddb6dae319c5bda602d921859cc4280fdd53388eff446d80c3ab8192
3
- size 1183214
 
 
 
 
elo_results_20240413.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ad8ebb2a8602a7c72382fc65521fbe7b06bb36dcf6b6cc582c6b89b1d7b1a87
3
- size 1064654
 
 
 
 
elo_results_20240418.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b05163db100df9ef62c7efb3332891d3321c6094e787af3b4ef4a9afe2becdb
3
- size 1130887
 
 
 
 
elo_results_20240419.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:05239b0b4406f270fdc845632981024dec639b8351dcb1a2308def3bbcea2e68
3
- size 1130756
 
 
 
 
elo_results_20240422.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:436c6bc88b6c03b672a3a87ddb3b101ec1c5ff03d47d64196986b5d6ca7909cd
3
- size 1254718
 
 
 
 
elo_results_20240426.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a39b33094ac93d4a1e0bc57bfbb17368515ce5a7e4504d3d1e310a14cd056943
3
- size 1275849
 
 
 
 
elo_results_20240501.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:35f506d5d34555cbc055abc901623fae3aa7b429057cf3039cb1b460fdc8f41c
3
- size 1159628
 
 
 
 
elo_results_20240508.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:79acc98f859a5f597338eff0ae98025abfec80087d60336c0d735e7dd3595eb2
3
- size 1188396
 
 
 
 
elo_results_20240515.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:692e9b280f1587b85f28aaeceee52928a92f6a98ee81e3d63a2d789c82eb9abc
3
- size 1596977
 
 
 
 
elo_results_20240516.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e3ff3e278c66aa28aece3e52369d128ca6707673e0e811304294c5d8a85aaf81
3
- size 1476125
 
 
 
 
elo_results_20240519.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:70c1136517de9396d72b2e14faee88382e503d0b80e56a5131e220173f6b472b
3
- size 1604729
 
 
 
 
elo_results_20240520.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:82d5306e5b88813cea3490a0cf2d02952219c52257789d6077caeee986996567
3
- size 1628933
 
 
 
 
elo_results_20240527.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca33433b15176a4b7c4f13584b2129ebd60f059524648c4a4a986aad4a84bc1e
3
- size 1666346
 
 
 
 
elo_results_20240602.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1e884bc9a41a3aa9916e29ca34e5ac2e52e6f8d7e314e380facd06f9ae855145
3
- size 2278603
 
 
 
 
elo_results_20240606.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6073ca1b10e9ac34c2b67b73636dcd2303b4a9291c9a440ad3813c33ef5fa170
3
- size 2295194
 
 
 
 
elo_results_20240611.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8b0f14fede0abeee60825682ffab3b07b50af5d9924de3c8114ddac469b34779
3
- size 2310921
 
 
 
 
elo_results_20240617.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a6e53f4d339bbc3420b349e89a315a1c6d2fa3c9847b206aeb02e1e5170aea73
3
- size 2491948
 
 
 
 
elo_results_20240621.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b098f9f69009376d87ef4d317c81c648d0558f7c912f693e285de5d3115e309b
3
- size 2526260
 
 
 
 
elo_results_20240623.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1469aac4504cd49bdbf1566093c5be912fd6ffe27f62213fd4961eefc92b4e30
3
- size 2544361
 
 
 
 
elo_results_20240626.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6b7c6dc746e8dfb7fed966d9c027468dccbd31da5866af6fff0083478ef52ff
3
- size 2429037
 
 
 
 
elo_results_20240629.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:acdaa9f0e89e01a5d1ee914d750727bf9877cde8bf2e195439bc7625b80f197f
3
- size 2679801
 
 
 
 
elo_results_20240706.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:dbd774b3b25712428c96cf54b5cb8c4d912e8b8215edbcee3dee0974fd898c8e
3
- size 2702290
 
 
 
 
elo_results_20240708.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:5d72cd3bf16c9af40910a2faf9f1403df2788596390e0f13452835748968679d
3
- size 3154928
 
 
 
 
elo_results_20240716.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:dec0c95bbaf8a2ed382b1676eba0b22ad89fa8815d28bc22c94cc6ea00205e5d
3
- size 3029902
 
 
 
 
elo_results_20240722.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c01b5a786ffd8bf64b72d39ca3fc7dee5483852c6a515f515532c8096a6e16d4
3
- size 3054518
 
 
 
 
elo_results_20240725.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1bfde449e424f12c9316f0c64062fc1ef9926e4a924bbaeafa455fcde0decb6f
3
- size 3073542
 
 
 
 
elo_results_20240730.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8db59f5f0852f26600f85c3188547e5d97b4641906c979bcc178cd5be7a7554c
3
- size 3137995
 
 
 
 
elo_results_20240731.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9aaff49a611fef8ff7bec058827d7de3426973c15bc538ac6bb39e764cc14b34
3
- size 3157482
 
 
 
 
elo_results_20240801.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:cdaa16f762adc54cbe05ab03dff88587491193852846b749980294b7f1ea2bec
3
- size 3182414
 
 
 
 
elo_results_20240805.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e638bfe870e4409c4d768d57287cc8f6310caf85191d8020ba89f7f59ee9f6d8
3
- size 3202114
 
 
 
 
elo_results_20240806.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a4f31f36636a280589bb1039f9d1f405a989df6a8f74d1af30555d891b23a416
3
- size 3261205
 
 
 
 
elo_results_20240813.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b4c5dfd4247b704f07e61ae27dc0642a3d3cfa9a6872cc3dc03d1888a594de9f
3
- size 2943734
 
 
 
 
elo_results_20240822.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c733894de1252232a63ce3632abe52504fb6bcf43e17bb49fea2b5ad8d76116f
3
- size 3004697
 
 
 
 
elo_results_20240823.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec103aa6bf5d0f02f8bd2c69c8ccfc8f1be1b44c7dc004d967c8d5ce470975b5
3
- size 3039588
 
 
 
 
elo_results_20240827.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:fbcf63bc492b9e2018fdd2c82924f375f12db20dd577f2c139a8ff82a2d08159
3
- size 3093445
 
 
 
 
elo_results_20240828.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b7da13b5f061a7a5a112e5ca45ff707d6cf6259c8a01b40ea5b77bbd5bd3d5b0
3
- size 3819732
 
 
 
 
elo_results_20240904.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:cf6117b1e28bb982e965d20b927685dce98750a82c255b868588b5b2318aaee9
3
- size 3486555
 
 
 
 
elo_results_20240915.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:42512fe148c81eeab05961ed64fe446dc7f6ed3703f976fc4c8c2a6a3a3e6bef
3
- size 3726145
 
 
 
 
elo_results_20240917.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d15ed36a6e429a9213e9230fbd28de05e7788f758c4282660dd77a4689f98590
3
- size 3768775
 
 
 
 
elo_results_20240927.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c96f9a95fbbf44b8cb46129cc8ea09eef1bb6e43b8ec12d1c8837091d23ee69
3
- size 3860967
 
 
 
 
index.html DELETED
@@ -1,12 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>Chatbot Arena</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div>Please visit <a href="https://leaderboard.lmsys.org" target="_blank">leaderboard.lmsys.org</a>.</div>
11
- </body>
12
- </html>
 
 
 
 
 
 
 
 
 
 
 
 
 
leaderboard_table_20240404.csv β†’ leaderboard_table_20240329.csv RENAMED
@@ -88,7 +88,4 @@ codellama-70b-instruct,CodeLlama-70B-instruct,-,-,2024/1,Llama 2 Community,Meta,
88
  olmo-7b-instruct,OLMo-7B-instruct,-,-,2024/2,Apache-2.0,Allen AI,https://huggingface.co/allenai/OLMo-7B-Instruct
89
  claude-3-haiku-20240307,Claude 3 Haiku,-,0.752,2023/8,Proprietary,Anthropic,https://www.anthropic.com/news/claude-3-family
90
  starling-lm-7b-beta,Starling-LM-7B-beta,8.12,-,2024/3,Apache-2.0,Nexusflow,https://huggingface.co/Nexusflow/Starling-LM-7B-beta
91
- dbrx-instruct,DBRX-instruct,-,-,2024/3,Apache-2.0,Databricks,-
92
- command-r,Command R,-,-,2024/3,Apache-2.0,Cohere,-
93
- qwen1.5-14b-chat,Qwen1.5-14B-Chat,-,-,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5/
94
- qwen1.5-32b-chat,Qwen1.5-32B-Chat,-,-,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5/
 
88
  olmo-7b-instruct,OLMo-7B-instruct,-,-,2024/2,Apache-2.0,Allen AI,https://huggingface.co/allenai/OLMo-7B-Instruct
89
  claude-3-haiku-20240307,Claude 3 Haiku,-,0.752,2023/8,Proprietary,Anthropic,https://www.anthropic.com/news/claude-3-family
90
  starling-lm-7b-beta,Starling-LM-7B-beta,8.12,-,2024/3,Apache-2.0,Nexusflow,https://huggingface.co/Nexusflow/Starling-LM-7B-beta
91
+ command-r,Command R,-,-,2024/3,CC-BY-NC-4.0,Cohere,https://txt.cohere.com/command-r
 
 
 
leaderboard_table_20240409.csv DELETED
@@ -1,95 +0,0 @@
1
- key,Model,MT-bench (score),MMLU,Knowledge cutoff date,License,Organization,Link
2
- wizardlm-30b,WizardLM-30B,7.01,0.587,2023/6,Non-commercial,Microsoft,https://huggingface.co/WizardLM/WizardLM-30B-V1.0
3
- vicuna-13b-16k,Vicuna-13B-16k,6.92,0.545,2023/7,Llama 2 Community,LMSYS,https://huggingface.co/lmsys/vicuna-13b-v1.5-16k
4
- wizardlm-13b-v1.1,WizardLM-13B-v1.1,6.76,0.500,2023/7,Non-commercial,Microsoft,https://huggingface.co/WizardLM/WizardLM-13B-V1.1
5
- tulu-30b,Tulu-30B,6.43,0.581,2023/6,Non-commercial,AllenAI/UW,https://huggingface.co/allenai/tulu-30b
6
- guanaco-65b,Guanaco-65B,6.41,0.621,2023/5,Non-commercial,UW,https://huggingface.co/timdettmers/guanaco-65b-merged
7
- openassistant-llama-30b,OpenAssistant-LLaMA-30B,6.41,0.560,2023/4,Non-commercial,OpenAssistant,https://huggingface.co/OpenAssistant/oasst-sft-6-llama-30b-xor
8
- wizardlm-13b-v1.0,WizardLM-13B-v1.0,6.35,0.523,2023/5,Non-commercial,Microsoft,https://huggingface.co/WizardLM/WizardLM-13B-V1.0
9
- vicuna-7b-16k,Vicuna-7B-16k,6.22,0.485,2023/7,Llama 2 Community,LMSYS,https://huggingface.co/lmsys/vicuna-7b-v1.5-16k
10
- baize-v2-13b,Baize-v2-13B,5.75,0.489,2023/4,Non-commercial,UCSD,https://huggingface.co/project-baize/baize-v2-13b
11
- xgen-7b-8k-inst,XGen-7B-8K-Inst,5.55,0.421,2023/7,Non-commercial,Salesforce,https://huggingface.co/Salesforce/xgen-7b-8k-inst
12
- nous-hermes-13b,Nous-Hermes-13B,5.51,0.493,2023/6,Non-commercial,NousResearch,https://huggingface.co/NousResearch/Nous-Hermes-13b
13
- mpt-30b-instruct,MPT-30B-Instruct,5.22,0.478,2023/6,CC-BY-SA 3.0,MosaicML,https://huggingface.co/mosaicml/mpt-30b-instruct
14
- falcon-40b-instruct,Falcon-40B-Instruct,5.17,0.547,2023/5,Apache 2.0,TII,https://huggingface.co/tiiuae/falcon-40b-instruct
15
- h2o-oasst-openllama-13b,H2O-Oasst-OpenLLaMA-13B,4.63,0.428,2023/6,Apache 2.0,h2oai,https://huggingface.co/h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-13b
16
- gpt-4-1106-preview,GPT-4-1106-preview,9.32,-,2023/4,Proprietary,OpenAI,https://openai.com/blog/new-models-and-developer-products-announced-at-devday
17
- gpt-4-0314,GPT-4-0314,8.96,0.864,2021/9,Proprietary,OpenAI,https://openai.com/research/gpt-4
18
- claude-1,Claude-1,7.90,0.770,-,Proprietary,Anthropic,https://www.anthropic.com/index/introducing-claude
19
- gpt-4-0613,GPT-4-0613,9.18,-,2021/9,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo
20
- claude-2.0,Claude-2.0,8.06,0.785,-,Proprietary,Anthropic,https://www.anthropic.com/index/claude-2
21
- claude-2.1,Claude-2.1,8.18,-,-,Proprietary,Anthropic,https://www.anthropic.com/index/claude-2-1
22
- gpt-3.5-turbo-0613,GPT-3.5-Turbo-0613,8.39,-,2021/9,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-3-5
23
- mixtral-8x7b-instruct-v0.1,Mixtral-8x7b-Instruct-v0.1,8.30,0.706,2023/12,Apache 2.0,Mistral,https://mistral.ai/news/mixtral-of-experts/
24
- claude-instant-1,Claude-Instant-1,7.85,0.734,-,Proprietary,Anthropic,https://www.anthropic.com/index/introducing-claude
25
- gpt-3.5-turbo-0314,GPT-3.5-Turbo-0314,7.94,0.700,2021/9,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-3-5
26
- tulu-2-dpo-70b,Tulu-2-DPO-70B,7.89,-,2023/11,AI2 ImpACT Low-risk,AllenAI/UW,https://huggingface.co/allenai/tulu-2-dpo-70b
27
- yi-34b-chat,Yi-34B-Chat,-,0.735,2023/6,Yi License,01 AI,https://huggingface.co/01-ai/Yi-34B-Chat
28
- gemini-pro,Gemini Pro,-,0.718,2023/4,Proprietary,Google,https://blog.google/technology/ai/gemini-api-developers-cloud/
29
- gemini-pro-dev-api,Gemini Pro (Dev API),-,0.718,2023/4,Proprietary,Google,https://ai.google.dev/docs/gemini_api_overview
30
- bard-jan-24-gemini-pro,Bard (Gemini Pro),-,-,Online,Proprietary,Google,https://bard.google.com/
31
- wizardlm-70b,WizardLM-70B-v1.0,7.71,0.637,2023/8,Llama 2 Community,Microsoft,https://huggingface.co/WizardLM/WizardLM-70B-V1.0
32
- vicuna-33b,Vicuna-33B,7.12,0.592,2023/8,Non-commercial,LMSYS,https://huggingface.co/lmsys/vicuna-33b-v1.3
33
- starling-lm-7b-alpha,Starling-LM-7B-alpha,8.09,0.639,2023/11,CC-BY-NC-4.0,UC Berkeley,https://huggingface.co/berkeley-nest/Starling-LM-7B-alpha
34
- pplx-70b-online,pplx-70b-online,-,-,Online,Proprietary,Perplexity AI,https://blog.perplexity.ai/blog/introducing-pplx-online-llms
35
- openchat-3.5,OpenChat-3.5,7.81,0.643,2023/11,Apache-2.0,OpenChat,https://huggingface.co/openchat/openchat_3.5
36
- openhermes-2.5-mistral-7b,OpenHermes-2.5-Mistral-7b,-,-,2023/11,Apache-2.0,NousResearch,https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B
37
- gpt-3.5-turbo-1106,GPT-3.5-Turbo-1106,8.32,-,2021/9,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-3-5
38
- llama-2-70b-chat,Llama-2-70b-chat,6.86,0.630,2023/7,Llama 2 Community,Meta,https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
39
- solar-10.7b-instruct-v1.0,SOLAR-10.7B-Instruct-v1.0,7.58,0.662,2023/11,CC-BY-NC-4.0,Upstage AI,https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0
40
- dolphin-2.2.1-mistral-7b,Dolphin-2.2.1-Mistral-7B,-,-,2023/10,Apache-2.0,Cognitive Computations,https://huggingface.co/ehartford/dolphin-2.2.1-mistral-7b
41
- wizardlm-13b,WizardLM-13b-v1.2,7.20,0.527,2023/7,Llama 2 Community,Microsoft,https://huggingface.co/WizardLM/WizardLM-13B-V1.2
42
- zephyr-7b-beta,Zephyr-7b-beta,7.34,0.614,2023/10,MIT,HuggingFace,https://huggingface.co/HuggingFaceH4/zephyr-7b-beta
43
- mpt-30b-chat,MPT-30B-chat,6.39,0.504,2023/6,CC-BY-NC-SA-4.0,MosaicML,https://huggingface.co/mosaicml/mpt-30b-chat
44
- vicuna-13b,Vicuna-13B,6.57,0.558,2023/7,Llama 2 Community,LMSYS,https://huggingface.co/lmsys/vicuna-13b-v1.5
45
- qwen-14b-chat,Qwen-14B-Chat,6.96,0.665,2023/8,Qianwen LICENSE,Alibaba,https://huggingface.co/Qwen/Qwen-14B-Chat
46
- zephyr-7b-alpha,Zephyr-7b-alpha,6.88,-,2023/10,MIT,HuggingFace,https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha
47
- codellama-34b-instruct,CodeLlama-34B-instruct,-,0.537,2023/7,Llama 2 Community,Meta,https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf
48
- falcon-180b-chat,falcon-180b-chat,-,0.680,2023/9,Falcon-180B TII License,TII,https://huggingface.co/tiiuae/falcon-180B-chat
49
- guanaco-33b,Guanaco-33B,6.53,0.576,2023/5,Non-commercial,UW,https://huggingface.co/timdettmers/guanaco-33b-merged
50
- llama-2-13b-chat,Llama-2-13b-chat,6.65,0.536,2023/7,Llama 2 Community,Meta,https://huggingface.co/meta-llama/Llama-2-13b-chat-hf
51
- mistral-7b-instruct,Mistral-7B-Instruct-v0.1,6.84,0.554,2023/9,Apache 2.0,Mistral,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1
52
- pplx-7b-online,pplx-7b-online,-,-,Online,Proprietary,Perplexity AI,https://blog.perplexity.ai/blog/introducing-pplx-online-llms
53
- llama-2-7b-chat,Llama-2-7b-chat,6.27,0.458,2023/7,Llama 2 Community,Meta,https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
54
- vicuna-7b,Vicuna-7B,6.17,0.498,2023/7,Llama 2 Community,LMSYS,https://huggingface.co/lmsys/vicuna-7b-v1.5
55
- palm-2,PaLM-Chat-Bison-001,6.40,-,2021/6,Proprietary,Google,https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models#foundation_models
56
- koala-13b,Koala-13B,5.35,0.447,2023/4,Non-commercial,UC Berkeley,https://bair.berkeley.edu/blog/2023/04/03/koala/
57
- chatglm3-6b,ChatGLM3-6B,-,-,2023/10,Apache-2.0,Tsinghua,https://huggingface.co/THUDM/chatglm3-6b
58
- gpt4all-13b-snoozy,GPT4All-13B-Snoozy,5.41,0.430,2023/3,Non-commercial,Nomic AI,https://huggingface.co/nomic-ai/gpt4all-13b-snoozy
59
- mpt-7b-chat,MPT-7B-Chat,5.42,0.320,2023/5,CC-BY-NC-SA-4.0,MosaicML,https://huggingface.co/mosaicml/mpt-7b-chat
60
- chatglm2-6b,ChatGLM2-6B,4.96,0.455,2023/6,Apache-2.0,Tsinghua,https://huggingface.co/THUDM/chatglm2-6b
61
- RWKV-4-Raven-14B,RWKV-4-Raven-14B,3.98,0.256,2023/4,Apache 2.0,RWKV,https://huggingface.co/BlinkDL/rwkv-4-raven
62
- alpaca-13b,Alpaca-13B,4.53,0.481,2023/3,Non-commercial,Stanford,https://crfm.stanford.edu/2023/03/13/alpaca.html
63
- oasst-pythia-12b,OpenAssistant-Pythia-12B,4.32,0.270,2023/4,Apache 2.0,OpenAssistant,https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5
64
- chatglm-6b,ChatGLM-6B,4.50,0.361,2023/3,Non-commercial,Tsinghua,https://huggingface.co/THUDM/chatglm-6b
65
- fastchat-t5-3b,FastChat-T5-3B,3.04,0.477,2023/4,Apache 2.0,LMSYS,https://huggingface.co/lmsys/fastchat-t5-3b-v1.0
66
- stablelm-tuned-alpha-7b,StableLM-Tuned-Alpha-7B,2.75,0.244,2023/4,CC-BY-NC-SA-4.0,Stability AI,https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b
67
- dolly-v2-12b,Dolly-V2-12B,3.28,0.257,2023/4,MIT,Databricks,https://huggingface.co/databricks/dolly-v2-12b
68
- llama-13b,LLaMA-13B,2.61,0.470,2023/2,Non-commercial,Meta,https://arxiv.org/abs/2302.13971
69
- mistral-medium,Mistral Medium,8.61,0.753,-,Proprietary,Mistral,https://mistral.ai/news/la-plateforme/
70
- llama2-70b-steerlm-chat,NV-Llama2-70B-SteerLM-Chat,7.54,0.685,2023/11,Llama 2 Community,Nvidia,https://huggingface.co/nvidia/Llama2-70B-SteerLM-Chat
71
- stripedhyena-nous-7b,StripedHyena-Nous-7B,-,-,2023/12,Apache 2.0,Together AI,https://huggingface.co/togethercomputer/StripedHyena-Nous-7B
72
- deepseek-llm-67b-chat,DeepSeek-LLM-67B-Chat,-,0.713,2023/11,DeepSeek License,DeepSeek AI,https://huggingface.co/deepseek-ai/deepseek-llm-67b-chat
73
- gpt-4-0125-preview,GPT-4-0125-preview,-,-,2023/12,Proprietary,OpenAI,https://openai.com/blog/new-models-and-developer-products-announced-at-devday
74
- qwen1.5-72b-chat,Qwen1.5-72B-Chat,8.61,0.775,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5/
75
- qwen1.5-7b-chat,Qwen1.5-7B-Chat,7.6,0.610,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5/
76
- qwen1.5-4b-chat,Qwen1.5-4B-Chat,-,0.561,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5/
77
- openchat-3.5-0106,OpenChat-3.5-0106,7.8,0.658,2024/1,Apache-2.0,OpenChat,https://huggingface.co/openchat/openchat-3.5-0106
78
- nous-hermes-2-mixtral-8x7b-dpo,Nous-Hermes-2-Mixtral-8x7B-DPO,-,-,2024/1,Apache-2.0,NousResearch,https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO
79
- gpt-3.5-turbo-0125,GPT-3.5-Turbo-0125,-,-,2021/9,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-3-5-turbo
80
- mistral-next,Mistral-Next,-,-,-,Proprietary,Mistral,https://chat.mistral.ai/chat
81
- mistral-large-2402,Mistral-Large-2402,-,0.812,-,Proprietary,Mistral,https://mistral.ai/news/mistral-large/
82
- gemma-7b-it,Gemma-7B-it,-,0.643,2024/2,Gemma license,Google,https://huggingface.co/google/gemma-7b-it
83
- gemma-2b-it,Gemma-2B-it,-,0.423,2024/2,Gemma license,Google,https://huggingface.co/google/gemma-2b-it
84
- mistral-7b-instruct-v0.2,Mistral-7B-Instruct-v0.2,7.6,-,2023/12,Apache-2.0,Mistral,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2
85
- claude-3-sonnet-20240229,Claude 3 Sonnet,-,0.790,2023/8,Proprietary,Anthropic,https://www.anthropic.com/news/claude-3-family
86
- claude-3-opus-20240229,Claude 3 Opus,-,0.868,2023/8,Proprietary,Anthropic,https://www.anthropic.com/news/claude-3-family
87
- codellama-70b-instruct,CodeLlama-70B-instruct,-,-,2024/1,Llama 2 Community,Meta,https://huggingface.co/codellama/CodeLlama-70b-hf
88
- olmo-7b-instruct,OLMo-7B-instruct,-,-,2024/2,Apache-2.0,Allen AI,https://huggingface.co/allenai/OLMo-7B-Instruct
89
- claude-3-haiku-20240307,Claude 3 Haiku,-,0.752,2023/8,Proprietary,Anthropic,https://www.anthropic.com/news/claude-3-family
90
- starling-lm-7b-beta,Starling-LM-7B-beta,8.12,-,2024/3,Apache-2.0,Nexusflow,https://huggingface.co/Nexusflow/Starling-LM-7B-beta
91
- command-r,Command R,-,-,2024/3,CC-BY-NC-4.0,Cohere,https://txt.cohere.com/command-r
92
- qwen1.5-14b-chat,Qwen1.5-14B-Chat,7.91,0.676,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5/
93
- qwen1.5-32b-chat,Qwen1.5-32B-Chat,8.30,0.734,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5-32b/
94
- command-r-plus,Command R+,-,-,2024/3,CC-BY-NC-4.0,Cohere,https://txt.cohere.com/command-r-plus-microsoft-azure/
95
- gemma-1.1-7b-it,Gemma-1.1-7B-it,-,0.643,2024/2,Gemma license,Google,https://huggingface.co/google/gemma-1.1-7b-it