Upload arena_hard_leaderboard_20240716.csv

#44
Files changed (1) hide show
  1. arena_hard_leaderboard_20240716.csv +55 -0
arena_hard_leaderboard_20240716.csv ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model,score,rating_q025,rating_q975,CI,avg_tokens,date
2
+ gpt-4-turbo-2024-04-09,82.63,80.46,84.54,"(-2.17, +1.91)",662.0,2024-07-16
3
+ claude-3-5-sonnet-20240620,79.35,77.39,81.06,"(-1.96, +1.71)",567.0,2024-07-16
4
+ gpt-4o-2024-05-13,79.21,77.49,80.94,"(-1.72, +1.73)",696.0,2024-07-16
5
+ gpt-4-0125-preview,77.96,76.14,79.9,"(-1.82, +1.94)",619.0,2024-07-16
6
+ gemini-1.5-pro-api-0514,71.95,69.38,74.07,"(-2.57, +2.12)",676.0,2024-07-16
7
+ yi-large-preview,71.47,69.28,73.64,"(-2.19, +2.17)",720.0,2024-07-16
8
+ glm-4-0520,63.84,61.54,66.2,"(-2.30, +2.36)",636.0,2024-07-16
9
+ yi-large,63.7,60.82,66.48,"(-2.88, +2.78)",626.0,2024-07-16
10
+ deepseek-coder-v2,62.3,60.09,64.7,"(-2.21, +2.40)",578.0,2024-07-16
11
+ claude-3-opus-20240229,60.35,57.24,62.66,"(-3.11, +2.31)",541.0,2024-07-16
12
+ gemma-2-27b-it,57.51,55.1,59.64,"(-2.41, +2.13)",577.0,2024-07-16
13
+ glm-4-0116,55.72,54.04,58.39,"(-1.68, +2.67)",622.0,2024-07-16
14
+ gemini-1.5-pro-api-0409-preview,53.37,51.12,56.17,"(-2.25, +2.80)",478.0,2024-07-16
15
+ glm-4-air,50.88,48.09,52.93,"(-2.79, +2.05)",619.0,2024-07-16
16
+ gpt-4-0314,50.0,50.0,50.0,"(-0.00, +0.00)",423.0,2024-07-16
17
+ gemini-1.5-flash-api-0514,49.61,47.12,52.24,"(-2.49, +2.63)",642.0,2024-07-16
18
+ qwen2-72b-instruct,46.86,44.73,48.7,"(-2.13, +1.84)",515.0,2024-07-16
19
+ claude-3-sonnet-20240229,46.8,43.82,49.13,"(-2.98, +2.33)",552.0,2024-07-16
20
+ llama-3-70b-instruct,46.57,44.22,49.35,"(-2.35, +2.78)",591.0,2024-07-16
21
+ claude-3-haiku-20240307,41.47,38.95,44.09,"(-2.52, +2.62)",505.0,2024-07-16
22
+ gpt-4-0613,37.9,35.45,40.33,"(-2.45, +2.43)",354.0,2024-07-16
23
+ mistral-large-2402,37.71,35.42,40.14,"(-2.29, +2.43)",400.0,2024-07-16
24
+ mixtral-8x22b-instruct-v0.1,36.36,34.3,39.26,"(-2.06, +2.90)",430.0,2024-07-16
25
+ qwen1.5-72b-chat,36.12,34.08,38.27,"(-2.04, +2.15)",474.0,2024-07-16
26
+ phi-3-medium-4k-instruct,33.37,31.29,35.97,"(-2.08, +2.60)",517.0,2024-07-16
27
+ command-r-plus,33.07,30.8,35.29,"(-2.27, +2.22)",541.0,2024-07-16
28
+ mistral-medium,31.9,29.44,33.91,"(-2.46, +2.01)",485.0,2024-07-16
29
+ phi-3-small-8k-instruct,29.77,27.47,32.1,"(-2.30, +2.33)",568.0,2024-07-16
30
+ mistral-next,27.37,25.34,29.64,"(-2.03, +2.27)",297.0,2024-07-16
31
+ gpt-3.5-turbo-0613,24.82,23.0,26.99,"(-1.82, +2.17)",401.0,2024-07-16
32
+ dbrx-instruct-preview,24.63,22.94,26.63,"(-1.69, +2.00)",415.0,2024-07-16
33
+ claude-2.0,23.99,22.16,25.78,"(-1.83, +1.79)",295.0,2024-07-16
34
+ mixtral-8x7b-instruct-v0.1,23.4,21.55,25.4,"(-1.85, +2.00)",457.0,2024-07-16
35
+ gpt-3.5-turbo-0125,23.34,21.31,24.86,"(-2.03, +1.52)",329.0,2024-07-16
36
+ yi-34b-chat,23.15,21.26,25.17,"(-1.89, +2.02)",611.0,2024-07-16
37
+ starling-lm-7b-beta,23.02,20.7,24.97,"(-2.32, +1.95)",530.0,2024-07-16
38
+ claude-2.1,22.77,20.93,24.33,"(-1.84, +1.56)",290.0,2024-07-16
39
+ snorkel-mistral-pairrm-dpo,20.73,18.75,22.79,"(-1.98, +2.06)",564.0,2024-07-16
40
+ llama-3-8b-instruct,20.56,18.67,22.27,"(-1.89, +1.71)",585.0,2024-07-16
41
+ gpt-3.5-turbo-1106,18.87,17.16,20.4,"(-1.71, +1.53)",285.0,2024-07-16
42
+ gpt-3.5-turbo-0314,18.05,16.21,20.07,"(-1.84, +2.02)",334.0,2024-07-16
43
+ gemini-pro,17.8,16.11,19.61,"(-1.69, +1.81)",322.0,2024-07-16
44
+ snowflake-arctic-instruct,17.61,15.8,19.38,"(-1.81, +1.77)",365.0,2024-07-16
45
+ command-r,17.02,15.63,18.76,"(-1.39, +1.74)",432.0,2024-07-16
46
+ phi-3-mini-128k-instruct,15.42,13.94,17.05,"(-1.48, +1.63)",609.0,2024-07-16
47
+ tulu-2-dpo-70b,14.99,13.37,16.93,"(-1.62, +1.94)",550.0,2024-07-16
48
+ starling-lm-7b-alpha,12.8,11.45,14.33,"(-1.35, +1.53)",483.0,2024-07-16
49
+ mistral-7b-instruct,12.58,11.08,13.64,"(-1.50, +1.06)",541.0,2024-07-16
50
+ gemma-1.1-7b-it,12.09,10.7,13.16,"(-1.39, +1.07)",341.0,2024-07-16
51
+ llama-2-70b-chat,11.55,10.27,12.72,"(-1.28, +1.17)",595.0,2024-07-16
52
+ vicuna-33b,8.63,7.57,9.76,"(-1.06, +1.13)",451.0,2024-07-16
53
+ gemma-7b-it,7.47,6.29,8.64,"(-1.18, +1.17)",378.0,2024-07-16
54
+ gemma-1.1-2b-it,3.36,2.78,4.17,"(-0.58, +0.81)",316.0,2024-07-16
55
+ gemma-2b-it,3.0,2.28,3.67,"(-0.72, +0.67)",369.0,2024-07-16