Upload arena_hard_leaderboard_20240716.csv
#44
by
connorchenn
- opened
arena_hard_leaderboard_20240716.csv
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model,score,rating_q025,rating_q975,CI,avg_tokens,date
|
2 |
+
gpt-4-turbo-2024-04-09,82.63,80.46,84.54,"(-2.17, +1.91)",662.0,2024-07-16
|
3 |
+
claude-3-5-sonnet-20240620,79.35,77.39,81.06,"(-1.96, +1.71)",567.0,2024-07-16
|
4 |
+
gpt-4o-2024-05-13,79.21,77.49,80.94,"(-1.72, +1.73)",696.0,2024-07-16
|
5 |
+
gpt-4-0125-preview,77.96,76.14,79.9,"(-1.82, +1.94)",619.0,2024-07-16
|
6 |
+
gemini-1.5-pro-api-0514,71.95,69.38,74.07,"(-2.57, +2.12)",676.0,2024-07-16
|
7 |
+
yi-large-preview,71.47,69.28,73.64,"(-2.19, +2.17)",720.0,2024-07-16
|
8 |
+
glm-4-0520,63.84,61.54,66.2,"(-2.30, +2.36)",636.0,2024-07-16
|
9 |
+
yi-large,63.7,60.82,66.48,"(-2.88, +2.78)",626.0,2024-07-16
|
10 |
+
deepseek-coder-v2,62.3,60.09,64.7,"(-2.21, +2.40)",578.0,2024-07-16
|
11 |
+
claude-3-opus-20240229,60.35,57.24,62.66,"(-3.11, +2.31)",541.0,2024-07-16
|
12 |
+
gemma-2-27b-it,57.51,55.1,59.64,"(-2.41, +2.13)",577.0,2024-07-16
|
13 |
+
glm-4-0116,55.72,54.04,58.39,"(-1.68, +2.67)",622.0,2024-07-16
|
14 |
+
gemini-1.5-pro-api-0409-preview,53.37,51.12,56.17,"(-2.25, +2.80)",478.0,2024-07-16
|
15 |
+
glm-4-air,50.88,48.09,52.93,"(-2.79, +2.05)",619.0,2024-07-16
|
16 |
+
gpt-4-0314,50.0,50.0,50.0,"(-0.00, +0.00)",423.0,2024-07-16
|
17 |
+
gemini-1.5-flash-api-0514,49.61,47.12,52.24,"(-2.49, +2.63)",642.0,2024-07-16
|
18 |
+
qwen2-72b-instruct,46.86,44.73,48.7,"(-2.13, +1.84)",515.0,2024-07-16
|
19 |
+
claude-3-sonnet-20240229,46.8,43.82,49.13,"(-2.98, +2.33)",552.0,2024-07-16
|
20 |
+
llama-3-70b-instruct,46.57,44.22,49.35,"(-2.35, +2.78)",591.0,2024-07-16
|
21 |
+
claude-3-haiku-20240307,41.47,38.95,44.09,"(-2.52, +2.62)",505.0,2024-07-16
|
22 |
+
gpt-4-0613,37.9,35.45,40.33,"(-2.45, +2.43)",354.0,2024-07-16
|
23 |
+
mistral-large-2402,37.71,35.42,40.14,"(-2.29, +2.43)",400.0,2024-07-16
|
24 |
+
mixtral-8x22b-instruct-v0.1,36.36,34.3,39.26,"(-2.06, +2.90)",430.0,2024-07-16
|
25 |
+
qwen1.5-72b-chat,36.12,34.08,38.27,"(-2.04, +2.15)",474.0,2024-07-16
|
26 |
+
phi-3-medium-4k-instruct,33.37,31.29,35.97,"(-2.08, +2.60)",517.0,2024-07-16
|
27 |
+
command-r-plus,33.07,30.8,35.29,"(-2.27, +2.22)",541.0,2024-07-16
|
28 |
+
mistral-medium,31.9,29.44,33.91,"(-2.46, +2.01)",485.0,2024-07-16
|
29 |
+
phi-3-small-8k-instruct,29.77,27.47,32.1,"(-2.30, +2.33)",568.0,2024-07-16
|
30 |
+
mistral-next,27.37,25.34,29.64,"(-2.03, +2.27)",297.0,2024-07-16
|
31 |
+
gpt-3.5-turbo-0613,24.82,23.0,26.99,"(-1.82, +2.17)",401.0,2024-07-16
|
32 |
+
dbrx-instruct-preview,24.63,22.94,26.63,"(-1.69, +2.00)",415.0,2024-07-16
|
33 |
+
claude-2.0,23.99,22.16,25.78,"(-1.83, +1.79)",295.0,2024-07-16
|
34 |
+
mixtral-8x7b-instruct-v0.1,23.4,21.55,25.4,"(-1.85, +2.00)",457.0,2024-07-16
|
35 |
+
gpt-3.5-turbo-0125,23.34,21.31,24.86,"(-2.03, +1.52)",329.0,2024-07-16
|
36 |
+
yi-34b-chat,23.15,21.26,25.17,"(-1.89, +2.02)",611.0,2024-07-16
|
37 |
+
starling-lm-7b-beta,23.02,20.7,24.97,"(-2.32, +1.95)",530.0,2024-07-16
|
38 |
+
claude-2.1,22.77,20.93,24.33,"(-1.84, +1.56)",290.0,2024-07-16
|
39 |
+
snorkel-mistral-pairrm-dpo,20.73,18.75,22.79,"(-1.98, +2.06)",564.0,2024-07-16
|
40 |
+
llama-3-8b-instruct,20.56,18.67,22.27,"(-1.89, +1.71)",585.0,2024-07-16
|
41 |
+
gpt-3.5-turbo-1106,18.87,17.16,20.4,"(-1.71, +1.53)",285.0,2024-07-16
|
42 |
+
gpt-3.5-turbo-0314,18.05,16.21,20.07,"(-1.84, +2.02)",334.0,2024-07-16
|
43 |
+
gemini-pro,17.8,16.11,19.61,"(-1.69, +1.81)",322.0,2024-07-16
|
44 |
+
snowflake-arctic-instruct,17.61,15.8,19.38,"(-1.81, +1.77)",365.0,2024-07-16
|
45 |
+
command-r,17.02,15.63,18.76,"(-1.39, +1.74)",432.0,2024-07-16
|
46 |
+
phi-3-mini-128k-instruct,15.42,13.94,17.05,"(-1.48, +1.63)",609.0,2024-07-16
|
47 |
+
tulu-2-dpo-70b,14.99,13.37,16.93,"(-1.62, +1.94)",550.0,2024-07-16
|
48 |
+
starling-lm-7b-alpha,12.8,11.45,14.33,"(-1.35, +1.53)",483.0,2024-07-16
|
49 |
+
mistral-7b-instruct,12.58,11.08,13.64,"(-1.50, +1.06)",541.0,2024-07-16
|
50 |
+
gemma-1.1-7b-it,12.09,10.7,13.16,"(-1.39, +1.07)",341.0,2024-07-16
|
51 |
+
llama-2-70b-chat,11.55,10.27,12.72,"(-1.28, +1.17)",595.0,2024-07-16
|
52 |
+
vicuna-33b,8.63,7.57,9.76,"(-1.06, +1.13)",451.0,2024-07-16
|
53 |
+
gemma-7b-it,7.47,6.29,8.64,"(-1.18, +1.17)",378.0,2024-07-16
|
54 |
+
gemma-1.1-2b-it,3.36,2.78,4.17,"(-0.58, +0.81)",316.0,2024-07-16
|
55 |
+
gemma-2b-it,3.0,2.28,3.67,"(-0.72, +0.67)",369.0,2024-07-16
|