update
app.py CHANGED
@@ -25,9 +25,7 @@ def make_default_md(arena_df, elo_results):
 | [Vote](https://chat.lmsys.org) | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
 
 LMSYS [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) is a crowdsourced open platform for LLM evals.
-We've collected over **500,000** human preference votes to rank LLMs with the Elo ranking system.
-
-Code to recreate leaderboard tables and plots in this [notebook]({notebook_url}) and more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
+We've collected over **500,000** human preference votes to rank LLMs with the Elo ranking system.
 """
     return leaderboard_md
 
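The "Elo ranking system" mentioned in this header text rates models from pairwise human votes. As a rough illustration only (the leaderboard's actual computation lives in the linked notebook), a textbook online Elo update from a single vote looks like the sketch below; the K-factor and the 1000-point starting rating are assumptions:

```python
# Textbook online Elo update from one pairwise human vote.
# Sketch only: the K-factor and 1000-point starting rating are assumptions;
# the leaderboard's real computation is in the linked notebook.

def expected_score(r_a: float, r_b: float) -> float:
    # Probability that model A beats model B under the Elo model.
    return 1.0 / (1.0 + 10.0 ** ((r_b - r_a) / 400.0))

def elo_update(r_a: float, r_b: float, a_wins: bool, k: float = 4.0):
    # Shift both ratings toward the observed outcome of the vote.
    delta = k * ((1.0 if a_wins else 0.0) - expected_score(r_a, r_b))
    return r_a + delta, r_b - delta

r_a, r_b = 1000.0, 1000.0
r_a, r_b = elo_update(r_a, r_b, a_wins=True)  # winner gains, loser drops
```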
@@ -37,9 +35,10 @@ def make_arena_leaderboard_md(arena_df):
     total_models = len(arena_df)
     space = " "
     leaderboard_md = f"""
-Total #models: **{total_models}**.{space} Total #votes: **{"{:,}".format(total_votes)}**.{space} Last updated: April
+Total #models: **{total_models}**.{space} Total #votes: **{"{:,}".format(total_votes)}**.{space} Last updated: April 11, 2024.
 
 📣 **NEW!** View leaderboard for different categories (e.g., coding, long user query)!
+Code to recreate leaderboard tables and plots in this [notebook]({notebook_url}). Cast your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)!
 """
     return leaderboard_md
 
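A small point in the hunk above: because `leaderboard_md` is an f-string, the vote count is rendered with the nested call `"{:,}".format(total_votes)`, which is Python's standard comma-grouping format spec. A quick standalone illustration with a made-up count:

```python
total_votes = 543210  # made-up value for illustration
print("{:,}".format(total_votes))            # -> 543,210
print(f"Total #votes: **{total_votes:,}**")  # equivalent inline f-string form
```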
@@ -405,7 +404,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
     gr.Markdown(
         f"""Note: we take the 95% confidence interval into account when determining a model's ranking.
 A model is ranked higher only if its lower bound of model score is higher than the upper bound of the other model's score.
-See Figure 3 below for visualization of the confidence intervals.
+See Figure 3 below for visualization of the confidence intervals. More details in [notebook]({notebook_url}).
 """,
         elem_id="leaderboard_markdown"
     )
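The note in this last hunk states a concrete ranking rule: model X outranks model Y only when X's lower confidence bound exceeds Y's upper bound, so models with overlapping intervals share a rank. A minimal sketch of that rule, with invented interval values and a hypothetical `ci_rank` helper:

```python
# Sketch of the CI-aware ranking rule quoted above: a model's rank is
# 1 plus the number of models whose lower bound beats its upper bound.
# The interval values below are invented for illustration.
intervals = {
    "model-a": (1190.0, 1210.0),  # (lower, upper) bound of model score
    "model-b": (1160.0, 1195.0),
    "model-c": (1100.0, 1120.0),
}

def ci_rank(name: str) -> int:
    _, upper = intervals[name]
    return 1 + sum(
        1 for other, (lo, _) in intervals.items() if other != name and lo > upper
    )

for model in intervals:
    print(model, ci_rank(model))
# model-a and model-b share rank 1 (their intervals overlap); model-c is rank 3.
```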