Spaces:
Running
Running
hi-melnikov
commited on
Commit
•
7555fc7
1
Parent(s):
49498de
moved data into persistent dataset
Browse files- data/arena-hard-v0.1/question.jsonl +0 -0
- data/arena_hard_battles.jsonl +0 -0
- data/bootstrapping_results.jsonl +0 -100
- data/leaderboard.json +0 -329
- data/leaderboard_logs/README.md +0 -3
- src/envs.py +2 -1
- src/gen/arena_hard_leaderboard_20240514.json +0 -329
- src/gen/arena_hard_leaderboard_20240515.json +0 -329
- src/gen/show_result.py +2 -2
- src/leaderboard/build_leaderboard.py +5 -12
data/arena-hard-v0.1/question.jsonl
DELETED
The diff for this file is too large to render.
See raw diff
|
|
data/arena_hard_battles.jsonl
DELETED
The diff for this file is too large to render.
See raw diff
|
|
data/bootstrapping_results.jsonl
DELETED
@@ -1,100 +0,0 @@
|
|
1 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.5644665503,"gigachat_lite":726.6208252619}
|
2 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":859.0709454157,"gigachat_lite":738.5741612323}
|
3 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":865.0434024226,"gigachat_lite":734.1011761886}
|
4 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":860.399655762,"gigachat_lite":729.5571514643}
|
5 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.1731508697,"gigachat_lite":728.758372467}
|
6 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.5326400531,"gigachat_lite":733.7900136425}
|
7 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":866.7819454641,"gigachat_lite":719.043685497}
|
8 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":858.5219875589,"gigachat_lite":714.8370789545}
|
9 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":861.4603125434,"gigachat_lite":725.8752720444}
|
10 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":859.8350548067,"gigachat_lite":715.266084892}
|
11 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":862.7609222876,"gigachat_lite":727.2017077065}
|
12 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":854.2414273092,"gigachat_lite":739.3798608124}
|
13 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":862.374147169,"gigachat_lite":719.6304899658}
|
14 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":863.1792770928,"gigachat_lite":734.0546251412}
|
15 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":865.2996605704,"gigachat_lite":718.4924449088}
|
16 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":864.8988771163,"gigachat_lite":721.0729415472}
|
17 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":867.0356240274,"gigachat_lite":738.5699274129}
|
18 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":871.6157440982,"gigachat_lite":723.7105361329}
|
19 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":861.9225322393,"gigachat_lite":728.2971721354}
|
20 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":864.7557130348,"gigachat_lite":737.8461934603}
|
21 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":853.284444198,"gigachat_lite":748.9971545908}
|
22 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":851.7087385877,"gigachat_lite":713.1462726999}
|
23 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":871.482425846,"gigachat_lite":720.2960317186}
|
24 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":866.6122634027,"gigachat_lite":727.2517234335}
|
25 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":852.7157509126,"gigachat_lite":694.2654473149}
|
26 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":859.7938560994,"gigachat_lite":735.6639839406}
|
27 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":874.1682886992,"gigachat_lite":730.5016731736}
|
28 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.4589887037,"gigachat_lite":734.4551919945}
|
29 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":850.0205093168,"gigachat_lite":728.8931636911}
|
30 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":875.7282859976,"gigachat_lite":717.6726330463}
|
31 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":865.3647024942,"gigachat_lite":733.3721052861}
|
32 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":856.1797064852,"gigachat_lite":725.7981758416}
|
33 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":867.6238850835,"gigachat_lite":731.0409312559}
|
34 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":857.7097671655,"gigachat_lite":715.3647090465}
|
35 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":874.4978660071,"gigachat_lite":737.7875979517}
|
36 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":857.5650653089,"gigachat_lite":729.3512200797}
|
37 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":890.8852955482,"gigachat_lite":715.9010959711}
|
38 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.6426165155,"gigachat_lite":722.2116159282}
|
39 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":859.3456423505,"gigachat_lite":724.6752254921}
|
40 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":857.4854945486,"gigachat_lite":718.5749125859}
|
41 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":880.1901418236,"gigachat_lite":723.0132896162}
|
42 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":849.6103242372,"gigachat_lite":732.3587564613}
|
43 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":871.0458800663,"gigachat_lite":740.6268654101}
|
44 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":877.4244267245,"gigachat_lite":724.6297632896}
|
45 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":875.3479511716,"gigachat_lite":743.701641735}
|
46 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":859.1269918194,"gigachat_lite":723.5736702859}
|
47 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":857.8015195801,"gigachat_lite":731.9752231934}
|
48 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":868.2750694028,"gigachat_lite":722.3929635211}
|
49 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":868.0957706924,"gigachat_lite":721.9705147906}
|
50 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":870.6012679715,"gigachat_lite":738.9123529498}
|
51 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":862.269673472,"gigachat_lite":733.7609432817}
|
52 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":864.2488571071,"gigachat_lite":724.1850017217}
|
53 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":874.1624601722,"gigachat_lite":727.8550112565}
|
54 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":863.1194231025,"gigachat_lite":731.3315308989}
|
55 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":857.1192986285,"gigachat_lite":722.5721295254}
|
56 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":862.0030926827,"gigachat_lite":729.8940208849}
|
57 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":861.5474187298,"gigachat_lite":735.9873637973}
|
58 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":880.5566205251,"gigachat_lite":730.6501947523}
|
59 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":861.7223684538,"gigachat_lite":702.8268457509}
|
60 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":874.9512628918,"gigachat_lite":732.6491227137}
|
61 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":858.7260910186,"gigachat_lite":736.225411771}
|
62 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":871.4133525673,"gigachat_lite":745.6156113918}
|
63 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":866.2715335516,"gigachat_lite":721.0912474577}
|
64 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":861.3256361213,"gigachat_lite":736.2254117629}
|
65 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":866.9022358038,"gigachat_lite":732.9674153867}
|
66 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":867.5601382523,"gigachat_lite":723.0966793643}
|
67 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":864.5272121008,"gigachat_lite":718.0704518208}
|
68 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":866.7782194777,"gigachat_lite":722.2852812675}
|
69 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":865.4086246736,"gigachat_lite":745.1185090985}
|
70 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":870.0314924292,"gigachat_lite":736.9690722951}
|
71 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.3587976891,"gigachat_lite":742.6306627437}
|
72 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":851.5511568095,"gigachat_lite":733.1555506911}
|
73 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":863.2094645624,"gigachat_lite":721.7491525609}
|
74 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":861.0624318318,"gigachat_lite":723.0795022704}
|
75 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":848.5397354473,"gigachat_lite":717.9478748234}
|
76 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":857.9432204946,"gigachat_lite":726.703609728}
|
77 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":861.2370229881,"gigachat_lite":725.3073844986}
|
78 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":878.2964116149,"gigachat_lite":722.2116156669}
|
79 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":857.9909782749,"gigachat_lite":720.1865370325}
|
80 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":871.9069179589,"gigachat_lite":731.5240457448}
|
81 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":860.2445059252,"gigachat_lite":737.0781670626}
|
82 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":850.4012745111,"gigachat_lite":708.356058121}
|
83 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":866.7922558028,"gigachat_lite":730.3511179714}
|
84 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":862.2175409513,"gigachat_lite":727.5035049316}
|
85 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":856.8494155845,"gigachat_lite":706.4191731996}
|
86 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":856.4641060792,"gigachat_lite":734.2333848904}
|
87 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":878.905415424,"gigachat_lite":736.5196621633}
|
88 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":851.8853822745,"gigachat_lite":724.9647865416}
|
89 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":859.2360763272,"gigachat_lite":718.7060814362}
|
90 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":869.1579952553,"gigachat_lite":722.5615781913}
|
91 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.2369472583,"gigachat_lite":731.6666527735}
|
92 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":859.2009612357,"gigachat_lite":722.1914533305}
|
93 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":876.2027799847,"gigachat_lite":719.1795542579}
|
94 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":849.6362696273,"gigachat_lite":730.3223324585}
|
95 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":865.1318475963,"gigachat_lite":724.1322488355}
|
96 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.8791178271,"gigachat_lite":734.6332090556}
|
97 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":873.3916447336,"gigachat_lite":716.1292305518}
|
98 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":867.1797828548,"gigachat_lite":726.7846008592}
|
99 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":865.1613697328,"gigachat_lite":717.027778133}
|
100 |
-
{"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":875.1689869302,"gigachat_lite":728.6562483681}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/leaderboard.json
DELETED
@@ -1,329 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"results":[
|
4 |
-
1000.0,
|
5 |
-
1000.0,
|
6 |
-
1000.0,
|
7 |
-
1000.0,
|
8 |
-
1000.0,
|
9 |
-
1000.0,
|
10 |
-
1000.0,
|
11 |
-
1000.0,
|
12 |
-
1000.0,
|
13 |
-
1000.0,
|
14 |
-
1000.0,
|
15 |
-
1000.0,
|
16 |
-
1000.0,
|
17 |
-
1000.0,
|
18 |
-
1000.0,
|
19 |
-
1000.0,
|
20 |
-
1000.0,
|
21 |
-
1000.0,
|
22 |
-
1000.0,
|
23 |
-
1000.0,
|
24 |
-
1000.0,
|
25 |
-
1000.0,
|
26 |
-
1000.0,
|
27 |
-
1000.0,
|
28 |
-
1000.0,
|
29 |
-
1000.0,
|
30 |
-
1000.0,
|
31 |
-
1000.0,
|
32 |
-
1000.0,
|
33 |
-
1000.0,
|
34 |
-
1000.0,
|
35 |
-
1000.0,
|
36 |
-
1000.0,
|
37 |
-
1000.0,
|
38 |
-
1000.0,
|
39 |
-
1000.0,
|
40 |
-
1000.0,
|
41 |
-
1000.0,
|
42 |
-
1000.0,
|
43 |
-
1000.0,
|
44 |
-
1000.0,
|
45 |
-
1000.0,
|
46 |
-
1000.0,
|
47 |
-
1000.0,
|
48 |
-
1000.0,
|
49 |
-
1000.0,
|
50 |
-
1000.0,
|
51 |
-
1000.0,
|
52 |
-
1000.0,
|
53 |
-
1000.0,
|
54 |
-
1000.0,
|
55 |
-
1000.0,
|
56 |
-
1000.0,
|
57 |
-
1000.0,
|
58 |
-
1000.0,
|
59 |
-
1000.0,
|
60 |
-
1000.0,
|
61 |
-
1000.0,
|
62 |
-
1000.0,
|
63 |
-
1000.0,
|
64 |
-
1000.0,
|
65 |
-
1000.0,
|
66 |
-
1000.0,
|
67 |
-
1000.0,
|
68 |
-
1000.0,
|
69 |
-
1000.0,
|
70 |
-
1000.0,
|
71 |
-
1000.0,
|
72 |
-
1000.0,
|
73 |
-
1000.0,
|
74 |
-
1000.0,
|
75 |
-
1000.0,
|
76 |
-
1000.0,
|
77 |
-
1000.0,
|
78 |
-
1000.0,
|
79 |
-
1000.0,
|
80 |
-
1000.0,
|
81 |
-
1000.0,
|
82 |
-
1000.0,
|
83 |
-
1000.0,
|
84 |
-
1000.0,
|
85 |
-
1000.0,
|
86 |
-
1000.0,
|
87 |
-
1000.0,
|
88 |
-
1000.0,
|
89 |
-
1000.0,
|
90 |
-
1000.0,
|
91 |
-
1000.0,
|
92 |
-
1000.0,
|
93 |
-
1000.0,
|
94 |
-
1000.0,
|
95 |
-
1000.0,
|
96 |
-
1000.0,
|
97 |
-
1000.0,
|
98 |
-
1000.0,
|
99 |
-
1000.0,
|
100 |
-
1000.0,
|
101 |
-
1000.0,
|
102 |
-
1000.0,
|
103 |
-
1000.0
|
104 |
-
],
|
105 |
-
"model":"gpt-3.5-turbo-0125",
|
106 |
-
"score":50.0,
|
107 |
-
"lower":50.0,
|
108 |
-
"upper":50.0,
|
109 |
-
"avg_tokens":0.0
|
110 |
-
},
|
111 |
-
{
|
112 |
-
"results":[
|
113 |
-
855.5644665503,
|
114 |
-
859.0709454157,
|
115 |
-
865.0434024226,
|
116 |
-
860.399655762,
|
117 |
-
855.1731508697,
|
118 |
-
855.5326400531,
|
119 |
-
866.7819454641,
|
120 |
-
858.5219875589,
|
121 |
-
861.4603125434,
|
122 |
-
859.8350548067,
|
123 |
-
862.7609222876,
|
124 |
-
854.2414273092,
|
125 |
-
862.374147169,
|
126 |
-
863.1792770928,
|
127 |
-
865.2996605704,
|
128 |
-
864.8988771163,
|
129 |
-
867.0356240274,
|
130 |
-
871.6157440982,
|
131 |
-
861.9225322393,
|
132 |
-
864.7557130348,
|
133 |
-
853.284444198,
|
134 |
-
851.7087385877,
|
135 |
-
871.482425846,
|
136 |
-
866.6122634027,
|
137 |
-
852.7157509126,
|
138 |
-
859.7938560994,
|
139 |
-
874.1682886992,
|
140 |
-
855.4589887037,
|
141 |
-
850.0205093168,
|
142 |
-
875.7282859976,
|
143 |
-
865.3647024942,
|
144 |
-
856.1797064852,
|
145 |
-
867.6238850835,
|
146 |
-
857.7097671655,
|
147 |
-
874.4978660071,
|
148 |
-
857.5650653089,
|
149 |
-
890.8852955482,
|
150 |
-
855.6426165155,
|
151 |
-
859.3456423505,
|
152 |
-
857.4854945486,
|
153 |
-
880.1901418236,
|
154 |
-
849.6103242372,
|
155 |
-
871.0458800663,
|
156 |
-
877.4244267245,
|
157 |
-
875.3479511716,
|
158 |
-
859.1269918194,
|
159 |
-
857.8015195801,
|
160 |
-
868.2750694028,
|
161 |
-
868.0957706924,
|
162 |
-
870.6012679715,
|
163 |
-
862.269673472,
|
164 |
-
864.2488571071,
|
165 |
-
874.1624601722,
|
166 |
-
863.1194231025,
|
167 |
-
857.1192986285,
|
168 |
-
862.0030926827,
|
169 |
-
861.5474187298,
|
170 |
-
880.5566205251,
|
171 |
-
861.7223684538,
|
172 |
-
874.9512628918,
|
173 |
-
858.7260910186,
|
174 |
-
871.4133525673,
|
175 |
-
866.2715335516,
|
176 |
-
861.3256361213,
|
177 |
-
866.9022358038,
|
178 |
-
867.5601382523,
|
179 |
-
864.5272121008,
|
180 |
-
866.7782194777,
|
181 |
-
865.4086246736,
|
182 |
-
870.0314924292,
|
183 |
-
855.3587976891,
|
184 |
-
851.5511568095,
|
185 |
-
863.2094645624,
|
186 |
-
861.0624318318,
|
187 |
-
848.5397354473,
|
188 |
-
857.9432204946,
|
189 |
-
861.2370229881,
|
190 |
-
878.2964116149,
|
191 |
-
857.9909782749,
|
192 |
-
871.9069179589,
|
193 |
-
860.2445059252,
|
194 |
-
850.4012745111,
|
195 |
-
866.7922558028,
|
196 |
-
862.2175409513,
|
197 |
-
856.8494155845,
|
198 |
-
856.4641060792,
|
199 |
-
878.905415424,
|
200 |
-
851.8853822745,
|
201 |
-
859.2360763272,
|
202 |
-
869.1579952553,
|
203 |
-
855.2369472583,
|
204 |
-
859.2009612357,
|
205 |
-
876.2027799847,
|
206 |
-
849.6362696273,
|
207 |
-
865.1318475963,
|
208 |
-
855.8791178271,
|
209 |
-
873.3916447336,
|
210 |
-
867.1797828548,
|
211 |
-
865.1613697328,
|
212 |
-
875.1689869302
|
213 |
-
],
|
214 |
-
"model":"gigachat_pro",
|
215 |
-
"score":31.37,
|
216 |
-
"lower":29.64,
|
217 |
-
"upper":33.33,
|
218 |
-
"avg_tokens":0.0
|
219 |
-
},
|
220 |
-
{
|
221 |
-
"results":[
|
222 |
-
726.6208252619,
|
223 |
-
738.5741612323,
|
224 |
-
734.1011761886,
|
225 |
-
729.5571514643,
|
226 |
-
728.758372467,
|
227 |
-
733.7900136425,
|
228 |
-
719.043685497,
|
229 |
-
714.8370789545,
|
230 |
-
725.8752720444,
|
231 |
-
715.266084892,
|
232 |
-
727.2017077065,
|
233 |
-
739.3798608124,
|
234 |
-
719.6304899658,
|
235 |
-
734.0546251412,
|
236 |
-
718.4924449088,
|
237 |
-
721.0729415472,
|
238 |
-
738.5699274129,
|
239 |
-
723.7105361329,
|
240 |
-
728.2971721354,
|
241 |
-
737.8461934603,
|
242 |
-
748.9971545908,
|
243 |
-
713.1462726999,
|
244 |
-
720.2960317186,
|
245 |
-
727.2517234335,
|
246 |
-
694.2654473149,
|
247 |
-
735.6639839406,
|
248 |
-
730.5016731736,
|
249 |
-
734.4551919945,
|
250 |
-
728.8931636911,
|
251 |
-
717.6726330463,
|
252 |
-
733.3721052861,
|
253 |
-
725.7981758416,
|
254 |
-
731.0409312559,
|
255 |
-
715.3647090465,
|
256 |
-
737.7875979517,
|
257 |
-
729.3512200797,
|
258 |
-
715.9010959711,
|
259 |
-
722.2116159282,
|
260 |
-
724.6752254921,
|
261 |
-
718.5749125859,
|
262 |
-
723.0132896162,
|
263 |
-
732.3587564613,
|
264 |
-
740.6268654101,
|
265 |
-
724.6297632896,
|
266 |
-
743.701641735,
|
267 |
-
723.5736702859,
|
268 |
-
731.9752231934,
|
269 |
-
722.3929635211,
|
270 |
-
721.9705147906,
|
271 |
-
738.9123529498,
|
272 |
-
733.7609432817,
|
273 |
-
724.1850017217,
|
274 |
-
727.8550112565,
|
275 |
-
731.3315308989,
|
276 |
-
722.5721295254,
|
277 |
-
729.8940208849,
|
278 |
-
735.9873637973,
|
279 |
-
730.6501947523,
|
280 |
-
702.8268457509,
|
281 |
-
732.6491227137,
|
282 |
-
736.225411771,
|
283 |
-
745.6156113918,
|
284 |
-
721.0912474577,
|
285 |
-
736.2254117629,
|
286 |
-
732.9674153867,
|
287 |
-
723.0966793643,
|
288 |
-
718.0704518208,
|
289 |
-
722.2852812675,
|
290 |
-
745.1185090985,
|
291 |
-
736.9690722951,
|
292 |
-
742.6306627437,
|
293 |
-
733.1555506911,
|
294 |
-
721.7491525609,
|
295 |
-
723.0795022704,
|
296 |
-
717.9478748234,
|
297 |
-
726.703609728,
|
298 |
-
725.3073844986,
|
299 |
-
722.2116156669,
|
300 |
-
720.1865370325,
|
301 |
-
731.5240457448,
|
302 |
-
737.0781670626,
|
303 |
-
708.356058121,
|
304 |
-
730.3511179714,
|
305 |
-
727.5035049316,
|
306 |
-
706.4191731996,
|
307 |
-
734.2333848904,
|
308 |
-
736.5196621633,
|
309 |
-
724.9647865416,
|
310 |
-
718.7060814362,
|
311 |
-
722.5615781913,
|
312 |
-
731.6666527735,
|
313 |
-
722.1914533305,
|
314 |
-
719.1795542579,
|
315 |
-
730.3223324585,
|
316 |
-
724.1322488355,
|
317 |
-
734.6332090556,
|
318 |
-
716.1292305518,
|
319 |
-
726.7846008592,
|
320 |
-
717.027778133,
|
321 |
-
728.6562483681
|
322 |
-
],
|
323 |
-
"model":"gigachat_lite",
|
324 |
-
"score":17.2,
|
325 |
-
"lower":15.65,
|
326 |
-
"upper":18.68,
|
327 |
-
"avg_tokens":276.0
|
328 |
-
}
|
329 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/leaderboard_logs/README.md
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
# Логи генерации leaderboard
|
2 |
-
Сюда из space отправляются после генерации
|
3 |
-
Сохраняется только последний за день
|
|
|
|
|
|
|
|
src/envs.py
CHANGED
@@ -27,7 +27,8 @@ if not os.access(HF_HOME, os.W_OK):
|
|
27 |
else:
|
28 |
print("Write access confirmed for HF_HOME")
|
29 |
|
30 |
-
|
|
|
31 |
|
32 |
RESET_JUDGEMENT_ENV = "RESET_JUDGEMENT"
|
33 |
|
|
|
27 |
else:
|
28 |
print("Write access confirmed for HF_HOME")
|
29 |
|
30 |
+
DATA_PATH = os.path.join(HF_HOME, "data")
|
31 |
+
DATA_ARENA_PATH = os.path.join(DATA_PATH, "arena-hard-v0.1")
|
32 |
|
33 |
RESET_JUDGEMENT_ENV = "RESET_JUDGEMENT"
|
34 |
|
src/gen/arena_hard_leaderboard_20240514.json
DELETED
@@ -1,329 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"results":[
|
4 |
-
1000.0,
|
5 |
-
1000.0,
|
6 |
-
1000.0,
|
7 |
-
1000.0,
|
8 |
-
1000.0,
|
9 |
-
1000.0,
|
10 |
-
1000.0,
|
11 |
-
1000.0,
|
12 |
-
1000.0,
|
13 |
-
1000.0,
|
14 |
-
1000.0,
|
15 |
-
1000.0,
|
16 |
-
1000.0,
|
17 |
-
1000.0,
|
18 |
-
1000.0,
|
19 |
-
1000.0,
|
20 |
-
1000.0,
|
21 |
-
1000.0,
|
22 |
-
1000.0,
|
23 |
-
1000.0,
|
24 |
-
1000.0,
|
25 |
-
1000.0,
|
26 |
-
1000.0,
|
27 |
-
1000.0,
|
28 |
-
1000.0,
|
29 |
-
1000.0,
|
30 |
-
1000.0,
|
31 |
-
1000.0,
|
32 |
-
1000.0,
|
33 |
-
1000.0,
|
34 |
-
1000.0,
|
35 |
-
1000.0,
|
36 |
-
1000.0,
|
37 |
-
1000.0,
|
38 |
-
1000.0,
|
39 |
-
1000.0,
|
40 |
-
1000.0,
|
41 |
-
1000.0,
|
42 |
-
1000.0,
|
43 |
-
1000.0,
|
44 |
-
1000.0,
|
45 |
-
1000.0,
|
46 |
-
1000.0,
|
47 |
-
1000.0,
|
48 |
-
1000.0,
|
49 |
-
1000.0,
|
50 |
-
1000.0,
|
51 |
-
1000.0,
|
52 |
-
1000.0,
|
53 |
-
1000.0,
|
54 |
-
1000.0,
|
55 |
-
1000.0,
|
56 |
-
1000.0,
|
57 |
-
1000.0,
|
58 |
-
1000.0,
|
59 |
-
1000.0,
|
60 |
-
1000.0,
|
61 |
-
1000.0,
|
62 |
-
1000.0,
|
63 |
-
1000.0,
|
64 |
-
1000.0,
|
65 |
-
1000.0,
|
66 |
-
1000.0,
|
67 |
-
1000.0,
|
68 |
-
1000.0,
|
69 |
-
1000.0,
|
70 |
-
1000.0,
|
71 |
-
1000.0,
|
72 |
-
1000.0,
|
73 |
-
1000.0,
|
74 |
-
1000.0,
|
75 |
-
1000.0,
|
76 |
-
1000.0,
|
77 |
-
1000.0,
|
78 |
-
1000.0,
|
79 |
-
1000.0,
|
80 |
-
1000.0,
|
81 |
-
1000.0,
|
82 |
-
1000.0,
|
83 |
-
1000.0,
|
84 |
-
1000.0,
|
85 |
-
1000.0,
|
86 |
-
1000.0,
|
87 |
-
1000.0,
|
88 |
-
1000.0,
|
89 |
-
1000.0,
|
90 |
-
1000.0,
|
91 |
-
1000.0,
|
92 |
-
1000.0,
|
93 |
-
1000.0,
|
94 |
-
1000.0,
|
95 |
-
1000.0,
|
96 |
-
1000.0,
|
97 |
-
1000.0,
|
98 |
-
1000.0,
|
99 |
-
1000.0,
|
100 |
-
1000.0,
|
101 |
-
1000.0,
|
102 |
-
1000.0,
|
103 |
-
1000.0
|
104 |
-
],
|
105 |
-
"model":"gpt-3.5-turbo-0125",
|
106 |
-
"score":50.0,
|
107 |
-
"lower":50.0,
|
108 |
-
"upper":50.0,
|
109 |
-
"avg_tokens":0.0
|
110 |
-
},
|
111 |
-
{
|
112 |
-
"results":[
|
113 |
-
855.5644665503,
|
114 |
-
859.0709454157,
|
115 |
-
865.0434024226,
|
116 |
-
860.399655762,
|
117 |
-
855.1731508697,
|
118 |
-
855.5326400531,
|
119 |
-
866.7819454641,
|
120 |
-
858.5219875589,
|
121 |
-
861.4603125434,
|
122 |
-
859.8350548067,
|
123 |
-
862.7609222876,
|
124 |
-
854.2414273092,
|
125 |
-
862.374147169,
|
126 |
-
863.1792770928,
|
127 |
-
865.2996605704,
|
128 |
-
864.8988771163,
|
129 |
-
867.0356240274,
|
130 |
-
871.6157440982,
|
131 |
-
861.9225322393,
|
132 |
-
864.7557130348,
|
133 |
-
853.284444198,
|
134 |
-
851.7087385877,
|
135 |
-
871.482425846,
|
136 |
-
866.6122634027,
|
137 |
-
852.7157509126,
|
138 |
-
859.7938560994,
|
139 |
-
874.1682886992,
|
140 |
-
855.4589887037,
|
141 |
-
850.0205093168,
|
142 |
-
875.7282859976,
|
143 |
-
865.3647024942,
|
144 |
-
856.1797064852,
|
145 |
-
867.6238850835,
|
146 |
-
857.7097671655,
|
147 |
-
874.4978660071,
|
148 |
-
857.5650653089,
|
149 |
-
890.8852955482,
|
150 |
-
855.6426165155,
|
151 |
-
859.3456423505,
|
152 |
-
857.4854945486,
|
153 |
-
880.1901418236,
|
154 |
-
849.6103242372,
|
155 |
-
871.0458800663,
|
156 |
-
877.4244267245,
|
157 |
-
875.3479511716,
|
158 |
-
859.1269918194,
|
159 |
-
857.8015195801,
|
160 |
-
868.2750694028,
|
161 |
-
868.0957706924,
|
162 |
-
870.6012679715,
|
163 |
-
862.269673472,
|
164 |
-
864.2488571071,
|
165 |
-
874.1624601722,
|
166 |
-
863.1194231025,
|
167 |
-
857.1192986285,
|
168 |
-
862.0030926827,
|
169 |
-
861.5474187298,
|
170 |
-
880.5566205251,
|
171 |
-
861.7223684538,
|
172 |
-
874.9512628918,
|
173 |
-
858.7260910186,
|
174 |
-
871.4133525673,
|
175 |
-
866.2715335516,
|
176 |
-
861.3256361213,
|
177 |
-
866.9022358038,
|
178 |
-
867.5601382523,
|
179 |
-
864.5272121008,
|
180 |
-
866.7782194777,
|
181 |
-
865.4086246736,
|
182 |
-
870.0314924292,
|
183 |
-
855.3587976891,
|
184 |
-
851.5511568095,
|
185 |
-
863.2094645624,
|
186 |
-
861.0624318318,
|
187 |
-
848.5397354473,
|
188 |
-
857.9432204946,
|
189 |
-
861.2370229881,
|
190 |
-
878.2964116149,
|
191 |
-
857.9909782749,
|
192 |
-
871.9069179589,
|
193 |
-
860.2445059252,
|
194 |
-
850.4012745111,
|
195 |
-
866.7922558028,
|
196 |
-
862.2175409513,
|
197 |
-
856.8494155845,
|
198 |
-
856.4641060792,
|
199 |
-
878.905415424,
|
200 |
-
851.8853822745,
|
201 |
-
859.2360763272,
|
202 |
-
869.1579952553,
|
203 |
-
855.2369472583,
|
204 |
-
859.2009612357,
|
205 |
-
876.2027799847,
|
206 |
-
849.6362696273,
|
207 |
-
865.1318475963,
|
208 |
-
855.8791178271,
|
209 |
-
873.3916447336,
|
210 |
-
867.1797828548,
|
211 |
-
865.1613697328,
|
212 |
-
875.1689869302
|
213 |
-
],
|
214 |
-
"model":"gigachat_pro",
|
215 |
-
"score":31.37,
|
216 |
-
"lower":29.64,
|
217 |
-
"upper":33.33,
|
218 |
-
"avg_tokens":0.0
|
219 |
-
},
|
220 |
-
{
|
221 |
-
"results":[
|
222 |
-
726.6208252619,
|
223 |
-
738.5741612323,
|
224 |
-
734.1011761886,
|
225 |
-
729.5571514643,
|
226 |
-
728.758372467,
|
227 |
-
733.7900136425,
|
228 |
-
719.043685497,
|
229 |
-
714.8370789545,
|
230 |
-
725.8752720444,
|
231 |
-
715.266084892,
|
232 |
-
727.2017077065,
|
233 |
-
739.3798608124,
|
234 |
-
719.6304899658,
|
235 |
-
734.0546251412,
|
236 |
-
718.4924449088,
|
237 |
-
721.0729415472,
|
238 |
-
738.5699274129,
|
239 |
-
723.7105361329,
|
240 |
-
728.2971721354,
|
241 |
-
737.8461934603,
|
242 |
-
748.9971545908,
|
243 |
-
713.1462726999,
|
244 |
-
720.2960317186,
|
245 |
-
727.2517234335,
|
246 |
-
694.2654473149,
|
247 |
-
735.6639839406,
|
248 |
-
730.5016731736,
|
249 |
-
734.4551919945,
|
250 |
-
728.8931636911,
|
251 |
-
717.6726330463,
|
252 |
-
733.3721052861,
|
253 |
-
725.7981758416,
|
254 |
-
731.0409312559,
|
255 |
-
715.3647090465,
|
256 |
-
737.7875979517,
|
257 |
-
729.3512200797,
|
258 |
-
715.9010959711,
|
259 |
-
722.2116159282,
|
260 |
-
724.6752254921,
|
261 |
-
718.5749125859,
|
262 |
-
723.0132896162,
|
263 |
-
732.3587564613,
|
264 |
-
740.6268654101,
|
265 |
-
724.6297632896,
|
266 |
-
743.701641735,
|
267 |
-
723.5736702859,
|
268 |
-
731.9752231934,
|
269 |
-
722.3929635211,
|
270 |
-
721.9705147906,
|
271 |
-
738.9123529498,
|
272 |
-
733.7609432817,
|
273 |
-
724.1850017217,
|
274 |
-
727.8550112565,
|
275 |
-
731.3315308989,
|
276 |
-
722.5721295254,
|
277 |
-
729.8940208849,
|
278 |
-
735.9873637973,
|
279 |
-
730.6501947523,
|
280 |
-
702.8268457509,
|
281 |
-
732.6491227137,
|
282 |
-
736.225411771,
|
283 |
-
745.6156113918,
|
284 |
-
721.0912474577,
|
285 |
-
736.2254117629,
|
286 |
-
732.9674153867,
|
287 |
-
723.0966793643,
|
288 |
-
718.0704518208,
|
289 |
-
722.2852812675,
|
290 |
-
745.1185090985,
|
291 |
-
736.9690722951,
|
292 |
-
742.6306627437,
|
293 |
-
733.1555506911,
|
294 |
-
721.7491525609,
|
295 |
-
723.0795022704,
|
296 |
-
717.9478748234,
|
297 |
-
726.703609728,
|
298 |
-
725.3073844986,
|
299 |
-
722.2116156669,
|
300 |
-
720.1865370325,
|
301 |
-
731.5240457448,
|
302 |
-
737.0781670626,
|
303 |
-
708.356058121,
|
304 |
-
730.3511179714,
|
305 |
-
727.5035049316,
|
306 |
-
706.4191731996,
|
307 |
-
734.2333848904,
|
308 |
-
736.5196621633,
|
309 |
-
724.9647865416,
|
310 |
-
718.7060814362,
|
311 |
-
722.5615781913,
|
312 |
-
731.6666527735,
|
313 |
-
722.1914533305,
|
314 |
-
719.1795542579,
|
315 |
-
730.3223324585,
|
316 |
-
724.1322488355,
|
317 |
-
734.6332090556,
|
318 |
-
716.1292305518,
|
319 |
-
726.7846008592,
|
320 |
-
717.027778133,
|
321 |
-
728.6562483681
|
322 |
-
],
|
323 |
-
"model":"gigachat_lite",
|
324 |
-
"score":17.2,
|
325 |
-
"lower":15.65,
|
326 |
-
"upper":18.68,
|
327 |
-
"avg_tokens":276.0
|
328 |
-
}
|
329 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/gen/arena_hard_leaderboard_20240515.json
DELETED
@@ -1,329 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"results":[
|
4 |
-
1000.0,
|
5 |
-
1000.0,
|
6 |
-
1000.0,
|
7 |
-
1000.0,
|
8 |
-
1000.0,
|
9 |
-
1000.0,
|
10 |
-
1000.0,
|
11 |
-
1000.0,
|
12 |
-
1000.0,
|
13 |
-
1000.0,
|
14 |
-
1000.0,
|
15 |
-
1000.0,
|
16 |
-
1000.0,
|
17 |
-
1000.0,
|
18 |
-
1000.0,
|
19 |
-
1000.0,
|
20 |
-
1000.0,
|
21 |
-
1000.0,
|
22 |
-
1000.0,
|
23 |
-
1000.0,
|
24 |
-
1000.0,
|
25 |
-
1000.0,
|
26 |
-
1000.0,
|
27 |
-
1000.0,
|
28 |
-
1000.0,
|
29 |
-
1000.0,
|
30 |
-
1000.0,
|
31 |
-
1000.0,
|
32 |
-
1000.0,
|
33 |
-
1000.0,
|
34 |
-
1000.0,
|
35 |
-
1000.0,
|
36 |
-
1000.0,
|
37 |
-
1000.0,
|
38 |
-
1000.0,
|
39 |
-
1000.0,
|
40 |
-
1000.0,
|
41 |
-
1000.0,
|
42 |
-
1000.0,
|
43 |
-
1000.0,
|
44 |
-
1000.0,
|
45 |
-
1000.0,
|
46 |
-
1000.0,
|
47 |
-
1000.0,
|
48 |
-
1000.0,
|
49 |
-
1000.0,
|
50 |
-
1000.0,
|
51 |
-
1000.0,
|
52 |
-
1000.0,
|
53 |
-
1000.0,
|
54 |
-
1000.0,
|
55 |
-
1000.0,
|
56 |
-
1000.0,
|
57 |
-
1000.0,
|
58 |
-
1000.0,
|
59 |
-
1000.0,
|
60 |
-
1000.0,
|
61 |
-
1000.0,
|
62 |
-
1000.0,
|
63 |
-
1000.0,
|
64 |
-
1000.0,
|
65 |
-
1000.0,
|
66 |
-
1000.0,
|
67 |
-
1000.0,
|
68 |
-
1000.0,
|
69 |
-
1000.0,
|
70 |
-
1000.0,
|
71 |
-
1000.0,
|
72 |
-
1000.0,
|
73 |
-
1000.0,
|
74 |
-
1000.0,
|
75 |
-
1000.0,
|
76 |
-
1000.0,
|
77 |
-
1000.0,
|
78 |
-
1000.0,
|
79 |
-
1000.0,
|
80 |
-
1000.0,
|
81 |
-
1000.0,
|
82 |
-
1000.0,
|
83 |
-
1000.0,
|
84 |
-
1000.0,
|
85 |
-
1000.0,
|
86 |
-
1000.0,
|
87 |
-
1000.0,
|
88 |
-
1000.0,
|
89 |
-
1000.0,
|
90 |
-
1000.0,
|
91 |
-
1000.0,
|
92 |
-
1000.0,
|
93 |
-
1000.0,
|
94 |
-
1000.0,
|
95 |
-
1000.0,
|
96 |
-
1000.0,
|
97 |
-
1000.0,
|
98 |
-
1000.0,
|
99 |
-
1000.0,
|
100 |
-
1000.0,
|
101 |
-
1000.0,
|
102 |
-
1000.0,
|
103 |
-
1000.0
|
104 |
-
],
|
105 |
-
"model":"gpt-3.5-turbo-0125",
|
106 |
-
"score":50.0,
|
107 |
-
"lower":50.0,
|
108 |
-
"upper":50.0,
|
109 |
-
"avg_tokens":0.0
|
110 |
-
},
|
111 |
-
{
|
112 |
-
"results":[
|
113 |
-
855.5644665503,
|
114 |
-
859.0709454157,
|
115 |
-
865.0434024226,
|
116 |
-
860.399655762,
|
117 |
-
855.1731508697,
|
118 |
-
855.5326400531,
|
119 |
-
866.7819454641,
|
120 |
-
858.5219875589,
|
121 |
-
861.4603125434,
|
122 |
-
859.8350548067,
|
123 |
-
862.7609222876,
|
124 |
-
854.2414273092,
|
125 |
-
862.374147169,
|
126 |
-
863.1792770928,
|
127 |
-
865.2996605704,
|
128 |
-
864.8988771163,
|
129 |
-
867.0356240274,
|
130 |
-
871.6157440982,
|
131 |
-
861.9225322393,
|
132 |
-
864.7557130348,
|
133 |
-
853.284444198,
|
134 |
-
851.7087385877,
|
135 |
-
871.482425846,
|
136 |
-
866.6122634027,
|
137 |
-
852.7157509126,
|
138 |
-
859.7938560994,
|
139 |
-
874.1682886992,
|
140 |
-
855.4589887037,
|
141 |
-
850.0205093168,
|
142 |
-
875.7282859976,
|
143 |
-
865.3647024942,
|
144 |
-
856.1797064852,
|
145 |
-
867.6238850835,
|
146 |
-
857.7097671655,
|
147 |
-
874.4978660071,
|
148 |
-
857.5650653089,
|
149 |
-
890.8852955482,
|
150 |
-
855.6426165155,
|
151 |
-
859.3456423505,
|
152 |
-
857.4854945486,
|
153 |
-
880.1901418236,
|
154 |
-
849.6103242372,
|
155 |
-
871.0458800663,
|
156 |
-
877.4244267245,
|
157 |
-
875.3479511716,
|
158 |
-
859.1269918194,
|
159 |
-
857.8015195801,
|
160 |
-
868.2750694028,
|
161 |
-
868.0957706924,
|
162 |
-
870.6012679715,
|
163 |
-
862.269673472,
|
164 |
-
864.2488571071,
|
165 |
-
874.1624601722,
|
166 |
-
863.1194231025,
|
167 |
-
857.1192986285,
|
168 |
-
862.0030926827,
|
169 |
-
861.5474187298,
|
170 |
-
880.5566205251,
|
171 |
-
861.7223684538,
|
172 |
-
874.9512628918,
|
173 |
-
858.7260910186,
|
174 |
-
871.4133525673,
|
175 |
-
866.2715335516,
|
176 |
-
861.3256361213,
|
177 |
-
866.9022358038,
|
178 |
-
867.5601382523,
|
179 |
-
864.5272121008,
|
180 |
-
866.7782194777,
|
181 |
-
865.4086246736,
|
182 |
-
870.0314924292,
|
183 |
-
855.3587976891,
|
184 |
-
851.5511568095,
|
185 |
-
863.2094645624,
|
186 |
-
861.0624318318,
|
187 |
-
848.5397354473,
|
188 |
-
857.9432204946,
|
189 |
-
861.2370229881,
|
190 |
-
878.2964116149,
|
191 |
-
857.9909782749,
|
192 |
-
871.9069179589,
|
193 |
-
860.2445059252,
|
194 |
-
850.4012745111,
|
195 |
-
866.7922558028,
|
196 |
-
862.2175409513,
|
197 |
-
856.8494155845,
|
198 |
-
856.4641060792,
|
199 |
-
878.905415424,
|
200 |
-
851.8853822745,
|
201 |
-
859.2360763272,
|
202 |
-
869.1579952553,
|
203 |
-
855.2369472583,
|
204 |
-
859.2009612357,
|
205 |
-
876.2027799847,
|
206 |
-
849.6362696273,
|
207 |
-
865.1318475963,
|
208 |
-
855.8791178271,
|
209 |
-
873.3916447336,
|
210 |
-
867.1797828548,
|
211 |
-
865.1613697328,
|
212 |
-
875.1689869302
|
213 |
-
],
|
214 |
-
"model":"gigachat_pro",
|
215 |
-
"score":31.37,
|
216 |
-
"lower":29.64,
|
217 |
-
"upper":33.33,
|
218 |
-
"avg_tokens":0.0
|
219 |
-
},
|
220 |
-
{
|
221 |
-
"results":[
|
222 |
-
726.6208252619,
|
223 |
-
738.5741612323,
|
224 |
-
734.1011761886,
|
225 |
-
729.5571514643,
|
226 |
-
728.758372467,
|
227 |
-
733.7900136425,
|
228 |
-
719.043685497,
|
229 |
-
714.8370789545,
|
230 |
-
725.8752720444,
|
231 |
-
715.266084892,
|
232 |
-
727.2017077065,
|
233 |
-
739.3798608124,
|
234 |
-
719.6304899658,
|
235 |
-
734.0546251412,
|
236 |
-
718.4924449088,
|
237 |
-
721.0729415472,
|
238 |
-
738.5699274129,
|
239 |
-
723.7105361329,
|
240 |
-
728.2971721354,
|
241 |
-
737.8461934603,
|
242 |
-
748.9971545908,
|
243 |
-
713.1462726999,
|
244 |
-
720.2960317186,
|
245 |
-
727.2517234335,
|
246 |
-
694.2654473149,
|
247 |
-
735.6639839406,
|
248 |
-
730.5016731736,
|
249 |
-
734.4551919945,
|
250 |
-
728.8931636911,
|
251 |
-
717.6726330463,
|
252 |
-
733.3721052861,
|
253 |
-
725.7981758416,
|
254 |
-
731.0409312559,
|
255 |
-
715.3647090465,
|
256 |
-
737.7875979517,
|
257 |
-
729.3512200797,
|
258 |
-
715.9010959711,
|
259 |
-
722.2116159282,
|
260 |
-
724.6752254921,
|
261 |
-
718.5749125859,
|
262 |
-
723.0132896162,
|
263 |
-
732.3587564613,
|
264 |
-
740.6268654101,
|
265 |
-
724.6297632896,
|
266 |
-
743.701641735,
|
267 |
-
723.5736702859,
|
268 |
-
731.9752231934,
|
269 |
-
722.3929635211,
|
270 |
-
721.9705147906,
|
271 |
-
738.9123529498,
|
272 |
-
733.7609432817,
|
273 |
-
724.1850017217,
|
274 |
-
727.8550112565,
|
275 |
-
731.3315308989,
|
276 |
-
722.5721295254,
|
277 |
-
729.8940208849,
|
278 |
-
735.9873637973,
|
279 |
-
730.6501947523,
|
280 |
-
702.8268457509,
|
281 |
-
732.6491227137,
|
282 |
-
736.225411771,
|
283 |
-
745.6156113918,
|
284 |
-
721.0912474577,
|
285 |
-
736.2254117629,
|
286 |
-
732.9674153867,
|
287 |
-
723.0966793643,
|
288 |
-
718.0704518208,
|
289 |
-
722.2852812675,
|
290 |
-
745.1185090985,
|
291 |
-
736.9690722951,
|
292 |
-
742.6306627437,
|
293 |
-
733.1555506911,
|
294 |
-
721.7491525609,
|
295 |
-
723.0795022704,
|
296 |
-
717.9478748234,
|
297 |
-
726.703609728,
|
298 |
-
725.3073844986,
|
299 |
-
722.2116156669,
|
300 |
-
720.1865370325,
|
301 |
-
731.5240457448,
|
302 |
-
737.0781670626,
|
303 |
-
708.356058121,
|
304 |
-
730.3511179714,
|
305 |
-
727.5035049316,
|
306 |
-
706.4191731996,
|
307 |
-
734.2333848904,
|
308 |
-
736.5196621633,
|
309 |
-
724.9647865416,
|
310 |
-
718.7060814362,
|
311 |
-
722.5615781913,
|
312 |
-
731.6666527735,
|
313 |
-
722.1914533305,
|
314 |
-
719.1795542579,
|
315 |
-
730.3223324585,
|
316 |
-
724.1322488355,
|
317 |
-
734.6332090556,
|
318 |
-
716.1292305518,
|
319 |
-
726.7846008592,
|
320 |
-
717.027778133,
|
321 |
-
728.6562483681
|
322 |
-
],
|
323 |
-
"model":"gigachat_lite",
|
324 |
-
"score":17.2,
|
325 |
-
"lower":15.65,
|
326 |
-
"upper":18.68,
|
327 |
-
"avg_tokens":276.0
|
328 |
-
}
|
329 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/gen/show_result.py
CHANGED
@@ -263,13 +263,13 @@ if __name__ == "__main__":
|
|
263 |
huggingface_hub.HfApi().upload_file(
|
264 |
path_or_fileobj=json_file_name,
|
265 |
path_in_repo="data/leaderboard.json",
|
266 |
-
repo_id="Vikhrmodels/leaderboard",
|
267 |
repo_type="space",
|
268 |
)
|
269 |
|
270 |
huggingface_hub.HfApi().upload_file(
|
271 |
path_or_fileobj=json_file_name,
|
272 |
path_in_repo=f"data/leaderboard_logs/{json_file_name}",
|
273 |
-
repo_id="Vikhrmodels/leaderboard",
|
274 |
repo_type="dataset",
|
275 |
)
|
|
|
263 |
huggingface_hub.HfApi().upload_file(
|
264 |
path_or_fileobj=json_file_name,
|
265 |
path_in_repo="data/leaderboard.json",
|
266 |
+
repo_id="Vikhrmodels/arena-leaderboard-metainfo",
|
267 |
repo_type="space",
|
268 |
)
|
269 |
|
270 |
huggingface_hub.HfApi().upload_file(
|
271 |
path_or_fileobj=json_file_name,
|
272 |
path_in_repo=f"data/leaderboard_logs/{json_file_name}",
|
273 |
+
repo_id="Vikhrmodels/arena-leaderboard-metainfo",
|
274 |
repo_type="dataset",
|
275 |
)
|
src/leaderboard/build_leaderboard.py
CHANGED
@@ -1,13 +1,12 @@
|
|
1 |
import json
|
2 |
import logging
|
3 |
import os
|
4 |
-
import subprocess
|
5 |
import time
|
6 |
|
7 |
import pandas as pd
|
8 |
from huggingface_hub import snapshot_download
|
9 |
|
10 |
-
from src.envs import DATA_ARENA_PATH,
|
11 |
|
12 |
# Configure logging
|
13 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
@@ -55,21 +54,15 @@ def download_openbench():
|
|
55 |
"""Downloads pre generated data"""
|
56 |
os.makedirs(DATA_ARENA_PATH, exist_ok=True)
|
57 |
|
|
|
|
|
|
|
58 |
# download answers of different models that we trust
|
59 |
download_dataset("Vikhrmodels/openbench-eval", DATA_ARENA_PATH)
|
60 |
|
61 |
-
print("\nInternal models in openbench-eval:")
|
62 |
-
subprocess.run(["ls", f"{DATA_ARENA_PATH}/model_answers/internal/"], check=False)
|
63 |
-
|
64 |
-
print("\nExternal models in openbench-eval:")
|
65 |
-
subprocess.run(["ls", f"{DATA_ARENA_PATH}/model_answers/external/"], check=False)
|
66 |
-
|
67 |
-
print("\nJudgement in openbench-eval")
|
68 |
-
subprocess.run(["ls", f"{DATA_ARENA_PATH}/model_judgement/gpt-4-1106-preview"], check=False)
|
69 |
-
|
70 |
|
71 |
def build_leadearboard_df():
|
72 |
# Retrieve the leaderboard DataFrame
|
73 |
-
with open(f"{
|
74 |
leaderboard_df = pd.DataFrame.from_records(json.load(eval_file))
|
75 |
return leaderboard_df.copy()
|
|
|
1 |
import json
|
2 |
import logging
|
3 |
import os
|
|
|
4 |
import time
|
5 |
|
6 |
import pandas as pd
|
7 |
from huggingface_hub import snapshot_download
|
8 |
|
9 |
+
from src.envs import DATA_ARENA_PATH, DATA_PATH
|
10 |
|
11 |
# Configure logging
|
12 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
|
|
54 |
"""Downloads pre generated data"""
|
55 |
os.makedirs(DATA_ARENA_PATH, exist_ok=True)
|
56 |
|
57 |
+
# download prev autogenerated leaderboard files
|
58 |
+
download_dataset("Vikhrmodels/arena-leaderboard-metainfo", DATA_PATH)
|
59 |
+
|
60 |
# download answers of different models that we trust
|
61 |
download_dataset("Vikhrmodels/openbench-eval", DATA_ARENA_PATH)
|
62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
def build_leadearboard_df():
|
65 |
# Retrieve the leaderboard DataFrame
|
66 |
+
with open(f"{DATA_PATH}/leaderboard.json", "r", encoding="utf-8") as eval_file:
|
67 |
leaderboard_df = pd.DataFrame.from_records(json.load(eval_file))
|
68 |
return leaderboard_df.copy()
|