hi-melnikov commited on
Commit
7555fc7
1 Parent(s): 49498de

moved data into persistent dataset

Browse files
data/arena-hard-v0.1/question.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
data/arena_hard_battles.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
data/bootstrapping_results.jsonl DELETED
@@ -1,100 +0,0 @@
1
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.5644665503,"gigachat_lite":726.6208252619}
2
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":859.0709454157,"gigachat_lite":738.5741612323}
3
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":865.0434024226,"gigachat_lite":734.1011761886}
4
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":860.399655762,"gigachat_lite":729.5571514643}
5
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.1731508697,"gigachat_lite":728.758372467}
6
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.5326400531,"gigachat_lite":733.7900136425}
7
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":866.7819454641,"gigachat_lite":719.043685497}
8
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":858.5219875589,"gigachat_lite":714.8370789545}
9
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":861.4603125434,"gigachat_lite":725.8752720444}
10
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":859.8350548067,"gigachat_lite":715.266084892}
11
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":862.7609222876,"gigachat_lite":727.2017077065}
12
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":854.2414273092,"gigachat_lite":739.3798608124}
13
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":862.374147169,"gigachat_lite":719.6304899658}
14
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":863.1792770928,"gigachat_lite":734.0546251412}
15
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":865.2996605704,"gigachat_lite":718.4924449088}
16
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":864.8988771163,"gigachat_lite":721.0729415472}
17
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":867.0356240274,"gigachat_lite":738.5699274129}
18
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":871.6157440982,"gigachat_lite":723.7105361329}
19
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":861.9225322393,"gigachat_lite":728.2971721354}
20
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":864.7557130348,"gigachat_lite":737.8461934603}
21
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":853.284444198,"gigachat_lite":748.9971545908}
22
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":851.7087385877,"gigachat_lite":713.1462726999}
23
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":871.482425846,"gigachat_lite":720.2960317186}
24
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":866.6122634027,"gigachat_lite":727.2517234335}
25
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":852.7157509126,"gigachat_lite":694.2654473149}
26
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":859.7938560994,"gigachat_lite":735.6639839406}
27
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":874.1682886992,"gigachat_lite":730.5016731736}
28
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.4589887037,"gigachat_lite":734.4551919945}
29
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":850.0205093168,"gigachat_lite":728.8931636911}
30
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":875.7282859976,"gigachat_lite":717.6726330463}
31
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":865.3647024942,"gigachat_lite":733.3721052861}
32
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":856.1797064852,"gigachat_lite":725.7981758416}
33
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":867.6238850835,"gigachat_lite":731.0409312559}
34
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":857.7097671655,"gigachat_lite":715.3647090465}
35
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":874.4978660071,"gigachat_lite":737.7875979517}
36
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":857.5650653089,"gigachat_lite":729.3512200797}
37
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":890.8852955482,"gigachat_lite":715.9010959711}
38
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.6426165155,"gigachat_lite":722.2116159282}
39
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":859.3456423505,"gigachat_lite":724.6752254921}
40
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":857.4854945486,"gigachat_lite":718.5749125859}
41
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":880.1901418236,"gigachat_lite":723.0132896162}
42
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":849.6103242372,"gigachat_lite":732.3587564613}
43
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":871.0458800663,"gigachat_lite":740.6268654101}
44
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":877.4244267245,"gigachat_lite":724.6297632896}
45
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":875.3479511716,"gigachat_lite":743.701641735}
46
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":859.1269918194,"gigachat_lite":723.5736702859}
47
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":857.8015195801,"gigachat_lite":731.9752231934}
48
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":868.2750694028,"gigachat_lite":722.3929635211}
49
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":868.0957706924,"gigachat_lite":721.9705147906}
50
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":870.6012679715,"gigachat_lite":738.9123529498}
51
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":862.269673472,"gigachat_lite":733.7609432817}
52
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":864.2488571071,"gigachat_lite":724.1850017217}
53
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":874.1624601722,"gigachat_lite":727.8550112565}
54
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":863.1194231025,"gigachat_lite":731.3315308989}
55
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":857.1192986285,"gigachat_lite":722.5721295254}
56
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":862.0030926827,"gigachat_lite":729.8940208849}
57
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":861.5474187298,"gigachat_lite":735.9873637973}
58
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":880.5566205251,"gigachat_lite":730.6501947523}
59
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":861.7223684538,"gigachat_lite":702.8268457509}
60
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":874.9512628918,"gigachat_lite":732.6491227137}
61
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":858.7260910186,"gigachat_lite":736.225411771}
62
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":871.4133525673,"gigachat_lite":745.6156113918}
63
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":866.2715335516,"gigachat_lite":721.0912474577}
64
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":861.3256361213,"gigachat_lite":736.2254117629}
65
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":866.9022358038,"gigachat_lite":732.9674153867}
66
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":867.5601382523,"gigachat_lite":723.0966793643}
67
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":864.5272121008,"gigachat_lite":718.0704518208}
68
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":866.7782194777,"gigachat_lite":722.2852812675}
69
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":865.4086246736,"gigachat_lite":745.1185090985}
70
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":870.0314924292,"gigachat_lite":736.9690722951}
71
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.3587976891,"gigachat_lite":742.6306627437}
72
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":851.5511568095,"gigachat_lite":733.1555506911}
73
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":863.2094645624,"gigachat_lite":721.7491525609}
74
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":861.0624318318,"gigachat_lite":723.0795022704}
75
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":848.5397354473,"gigachat_lite":717.9478748234}
76
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":857.9432204946,"gigachat_lite":726.703609728}
77
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":861.2370229881,"gigachat_lite":725.3073844986}
78
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":878.2964116149,"gigachat_lite":722.2116156669}
79
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":857.9909782749,"gigachat_lite":720.1865370325}
80
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":871.9069179589,"gigachat_lite":731.5240457448}
81
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":860.2445059252,"gigachat_lite":737.0781670626}
82
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":850.4012745111,"gigachat_lite":708.356058121}
83
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":866.7922558028,"gigachat_lite":730.3511179714}
84
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":862.2175409513,"gigachat_lite":727.5035049316}
85
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":856.8494155845,"gigachat_lite":706.4191731996}
86
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":856.4641060792,"gigachat_lite":734.2333848904}
87
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":878.905415424,"gigachat_lite":736.5196621633}
88
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":851.8853822745,"gigachat_lite":724.9647865416}
89
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":859.2360763272,"gigachat_lite":718.7060814362}
90
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":869.1579952553,"gigachat_lite":722.5615781913}
91
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.2369472583,"gigachat_lite":731.6666527735}
92
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":859.2009612357,"gigachat_lite":722.1914533305}
93
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":876.2027799847,"gigachat_lite":719.1795542579}
94
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":849.6362696273,"gigachat_lite":730.3223324585}
95
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":865.1318475963,"gigachat_lite":724.1322488355}
96
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":855.8791178271,"gigachat_lite":734.6332090556}
97
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":873.3916447336,"gigachat_lite":716.1292305518}
98
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":867.1797828548,"gigachat_lite":726.7846008592}
99
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":865.1613697328,"gigachat_lite":717.027778133}
100
- {"gpt-3.5-turbo-0125":1000.0,"gigachat_pro":875.1689869302,"gigachat_lite":728.6562483681}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/leaderboard.json DELETED
@@ -1,329 +0,0 @@
1
- [
2
- {
3
- "results":[
4
- 1000.0,
5
- 1000.0,
6
- 1000.0,
7
- 1000.0,
8
- 1000.0,
9
- 1000.0,
10
- 1000.0,
11
- 1000.0,
12
- 1000.0,
13
- 1000.0,
14
- 1000.0,
15
- 1000.0,
16
- 1000.0,
17
- 1000.0,
18
- 1000.0,
19
- 1000.0,
20
- 1000.0,
21
- 1000.0,
22
- 1000.0,
23
- 1000.0,
24
- 1000.0,
25
- 1000.0,
26
- 1000.0,
27
- 1000.0,
28
- 1000.0,
29
- 1000.0,
30
- 1000.0,
31
- 1000.0,
32
- 1000.0,
33
- 1000.0,
34
- 1000.0,
35
- 1000.0,
36
- 1000.0,
37
- 1000.0,
38
- 1000.0,
39
- 1000.0,
40
- 1000.0,
41
- 1000.0,
42
- 1000.0,
43
- 1000.0,
44
- 1000.0,
45
- 1000.0,
46
- 1000.0,
47
- 1000.0,
48
- 1000.0,
49
- 1000.0,
50
- 1000.0,
51
- 1000.0,
52
- 1000.0,
53
- 1000.0,
54
- 1000.0,
55
- 1000.0,
56
- 1000.0,
57
- 1000.0,
58
- 1000.0,
59
- 1000.0,
60
- 1000.0,
61
- 1000.0,
62
- 1000.0,
63
- 1000.0,
64
- 1000.0,
65
- 1000.0,
66
- 1000.0,
67
- 1000.0,
68
- 1000.0,
69
- 1000.0,
70
- 1000.0,
71
- 1000.0,
72
- 1000.0,
73
- 1000.0,
74
- 1000.0,
75
- 1000.0,
76
- 1000.0,
77
- 1000.0,
78
- 1000.0,
79
- 1000.0,
80
- 1000.0,
81
- 1000.0,
82
- 1000.0,
83
- 1000.0,
84
- 1000.0,
85
- 1000.0,
86
- 1000.0,
87
- 1000.0,
88
- 1000.0,
89
- 1000.0,
90
- 1000.0,
91
- 1000.0,
92
- 1000.0,
93
- 1000.0,
94
- 1000.0,
95
- 1000.0,
96
- 1000.0,
97
- 1000.0,
98
- 1000.0,
99
- 1000.0,
100
- 1000.0,
101
- 1000.0,
102
- 1000.0,
103
- 1000.0
104
- ],
105
- "model":"gpt-3.5-turbo-0125",
106
- "score":50.0,
107
- "lower":50.0,
108
- "upper":50.0,
109
- "avg_tokens":0.0
110
- },
111
- {
112
- "results":[
113
- 855.5644665503,
114
- 859.0709454157,
115
- 865.0434024226,
116
- 860.399655762,
117
- 855.1731508697,
118
- 855.5326400531,
119
- 866.7819454641,
120
- 858.5219875589,
121
- 861.4603125434,
122
- 859.8350548067,
123
- 862.7609222876,
124
- 854.2414273092,
125
- 862.374147169,
126
- 863.1792770928,
127
- 865.2996605704,
128
- 864.8988771163,
129
- 867.0356240274,
130
- 871.6157440982,
131
- 861.9225322393,
132
- 864.7557130348,
133
- 853.284444198,
134
- 851.7087385877,
135
- 871.482425846,
136
- 866.6122634027,
137
- 852.7157509126,
138
- 859.7938560994,
139
- 874.1682886992,
140
- 855.4589887037,
141
- 850.0205093168,
142
- 875.7282859976,
143
- 865.3647024942,
144
- 856.1797064852,
145
- 867.6238850835,
146
- 857.7097671655,
147
- 874.4978660071,
148
- 857.5650653089,
149
- 890.8852955482,
150
- 855.6426165155,
151
- 859.3456423505,
152
- 857.4854945486,
153
- 880.1901418236,
154
- 849.6103242372,
155
- 871.0458800663,
156
- 877.4244267245,
157
- 875.3479511716,
158
- 859.1269918194,
159
- 857.8015195801,
160
- 868.2750694028,
161
- 868.0957706924,
162
- 870.6012679715,
163
- 862.269673472,
164
- 864.2488571071,
165
- 874.1624601722,
166
- 863.1194231025,
167
- 857.1192986285,
168
- 862.0030926827,
169
- 861.5474187298,
170
- 880.5566205251,
171
- 861.7223684538,
172
- 874.9512628918,
173
- 858.7260910186,
174
- 871.4133525673,
175
- 866.2715335516,
176
- 861.3256361213,
177
- 866.9022358038,
178
- 867.5601382523,
179
- 864.5272121008,
180
- 866.7782194777,
181
- 865.4086246736,
182
- 870.0314924292,
183
- 855.3587976891,
184
- 851.5511568095,
185
- 863.2094645624,
186
- 861.0624318318,
187
- 848.5397354473,
188
- 857.9432204946,
189
- 861.2370229881,
190
- 878.2964116149,
191
- 857.9909782749,
192
- 871.9069179589,
193
- 860.2445059252,
194
- 850.4012745111,
195
- 866.7922558028,
196
- 862.2175409513,
197
- 856.8494155845,
198
- 856.4641060792,
199
- 878.905415424,
200
- 851.8853822745,
201
- 859.2360763272,
202
- 869.1579952553,
203
- 855.2369472583,
204
- 859.2009612357,
205
- 876.2027799847,
206
- 849.6362696273,
207
- 865.1318475963,
208
- 855.8791178271,
209
- 873.3916447336,
210
- 867.1797828548,
211
- 865.1613697328,
212
- 875.1689869302
213
- ],
214
- "model":"gigachat_pro",
215
- "score":31.37,
216
- "lower":29.64,
217
- "upper":33.33,
218
- "avg_tokens":0.0
219
- },
220
- {
221
- "results":[
222
- 726.6208252619,
223
- 738.5741612323,
224
- 734.1011761886,
225
- 729.5571514643,
226
- 728.758372467,
227
- 733.7900136425,
228
- 719.043685497,
229
- 714.8370789545,
230
- 725.8752720444,
231
- 715.266084892,
232
- 727.2017077065,
233
- 739.3798608124,
234
- 719.6304899658,
235
- 734.0546251412,
236
- 718.4924449088,
237
- 721.0729415472,
238
- 738.5699274129,
239
- 723.7105361329,
240
- 728.2971721354,
241
- 737.8461934603,
242
- 748.9971545908,
243
- 713.1462726999,
244
- 720.2960317186,
245
- 727.2517234335,
246
- 694.2654473149,
247
- 735.6639839406,
248
- 730.5016731736,
249
- 734.4551919945,
250
- 728.8931636911,
251
- 717.6726330463,
252
- 733.3721052861,
253
- 725.7981758416,
254
- 731.0409312559,
255
- 715.3647090465,
256
- 737.7875979517,
257
- 729.3512200797,
258
- 715.9010959711,
259
- 722.2116159282,
260
- 724.6752254921,
261
- 718.5749125859,
262
- 723.0132896162,
263
- 732.3587564613,
264
- 740.6268654101,
265
- 724.6297632896,
266
- 743.701641735,
267
- 723.5736702859,
268
- 731.9752231934,
269
- 722.3929635211,
270
- 721.9705147906,
271
- 738.9123529498,
272
- 733.7609432817,
273
- 724.1850017217,
274
- 727.8550112565,
275
- 731.3315308989,
276
- 722.5721295254,
277
- 729.8940208849,
278
- 735.9873637973,
279
- 730.6501947523,
280
- 702.8268457509,
281
- 732.6491227137,
282
- 736.225411771,
283
- 745.6156113918,
284
- 721.0912474577,
285
- 736.2254117629,
286
- 732.9674153867,
287
- 723.0966793643,
288
- 718.0704518208,
289
- 722.2852812675,
290
- 745.1185090985,
291
- 736.9690722951,
292
- 742.6306627437,
293
- 733.1555506911,
294
- 721.7491525609,
295
- 723.0795022704,
296
- 717.9478748234,
297
- 726.703609728,
298
- 725.3073844986,
299
- 722.2116156669,
300
- 720.1865370325,
301
- 731.5240457448,
302
- 737.0781670626,
303
- 708.356058121,
304
- 730.3511179714,
305
- 727.5035049316,
306
- 706.4191731996,
307
- 734.2333848904,
308
- 736.5196621633,
309
- 724.9647865416,
310
- 718.7060814362,
311
- 722.5615781913,
312
- 731.6666527735,
313
- 722.1914533305,
314
- 719.1795542579,
315
- 730.3223324585,
316
- 724.1322488355,
317
- 734.6332090556,
318
- 716.1292305518,
319
- 726.7846008592,
320
- 717.027778133,
321
- 728.6562483681
322
- ],
323
- "model":"gigachat_lite",
324
- "score":17.2,
325
- "lower":15.65,
326
- "upper":18.68,
327
- "avg_tokens":276.0
328
- }
329
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/leaderboard_logs/README.md DELETED
@@ -1,3 +0,0 @@
1
- # Логи генерации leaderboard
2
- Сюда из space отправляются после генерации
3
- Сохраняется только последний за день
 
 
 
 
src/envs.py CHANGED
@@ -27,7 +27,8 @@ if not os.access(HF_HOME, os.W_OK):
27
  else:
28
  print("Write access confirmed for HF_HOME")
29
 
30
- DATA_ARENA_PATH = os.path.join(HF_HOME, "data/arena-hard-v0.1")
 
31
 
32
  RESET_JUDGEMENT_ENV = "RESET_JUDGEMENT"
33
 
 
27
  else:
28
  print("Write access confirmed for HF_HOME")
29
 
30
+ DATA_PATH = os.path.join(HF_HOME, "data")
31
+ DATA_ARENA_PATH = os.path.join(DATA_PATH, "arena-hard-v0.1")
32
 
33
  RESET_JUDGEMENT_ENV = "RESET_JUDGEMENT"
34
 
src/gen/arena_hard_leaderboard_20240514.json DELETED
@@ -1,329 +0,0 @@
1
- [
2
- {
3
- "results":[
4
- 1000.0,
5
- 1000.0,
6
- 1000.0,
7
- 1000.0,
8
- 1000.0,
9
- 1000.0,
10
- 1000.0,
11
- 1000.0,
12
- 1000.0,
13
- 1000.0,
14
- 1000.0,
15
- 1000.0,
16
- 1000.0,
17
- 1000.0,
18
- 1000.0,
19
- 1000.0,
20
- 1000.0,
21
- 1000.0,
22
- 1000.0,
23
- 1000.0,
24
- 1000.0,
25
- 1000.0,
26
- 1000.0,
27
- 1000.0,
28
- 1000.0,
29
- 1000.0,
30
- 1000.0,
31
- 1000.0,
32
- 1000.0,
33
- 1000.0,
34
- 1000.0,
35
- 1000.0,
36
- 1000.0,
37
- 1000.0,
38
- 1000.0,
39
- 1000.0,
40
- 1000.0,
41
- 1000.0,
42
- 1000.0,
43
- 1000.0,
44
- 1000.0,
45
- 1000.0,
46
- 1000.0,
47
- 1000.0,
48
- 1000.0,
49
- 1000.0,
50
- 1000.0,
51
- 1000.0,
52
- 1000.0,
53
- 1000.0,
54
- 1000.0,
55
- 1000.0,
56
- 1000.0,
57
- 1000.0,
58
- 1000.0,
59
- 1000.0,
60
- 1000.0,
61
- 1000.0,
62
- 1000.0,
63
- 1000.0,
64
- 1000.0,
65
- 1000.0,
66
- 1000.0,
67
- 1000.0,
68
- 1000.0,
69
- 1000.0,
70
- 1000.0,
71
- 1000.0,
72
- 1000.0,
73
- 1000.0,
74
- 1000.0,
75
- 1000.0,
76
- 1000.0,
77
- 1000.0,
78
- 1000.0,
79
- 1000.0,
80
- 1000.0,
81
- 1000.0,
82
- 1000.0,
83
- 1000.0,
84
- 1000.0,
85
- 1000.0,
86
- 1000.0,
87
- 1000.0,
88
- 1000.0,
89
- 1000.0,
90
- 1000.0,
91
- 1000.0,
92
- 1000.0,
93
- 1000.0,
94
- 1000.0,
95
- 1000.0,
96
- 1000.0,
97
- 1000.0,
98
- 1000.0,
99
- 1000.0,
100
- 1000.0,
101
- 1000.0,
102
- 1000.0,
103
- 1000.0
104
- ],
105
- "model":"gpt-3.5-turbo-0125",
106
- "score":50.0,
107
- "lower":50.0,
108
- "upper":50.0,
109
- "avg_tokens":0.0
110
- },
111
- {
112
- "results":[
113
- 855.5644665503,
114
- 859.0709454157,
115
- 865.0434024226,
116
- 860.399655762,
117
- 855.1731508697,
118
- 855.5326400531,
119
- 866.7819454641,
120
- 858.5219875589,
121
- 861.4603125434,
122
- 859.8350548067,
123
- 862.7609222876,
124
- 854.2414273092,
125
- 862.374147169,
126
- 863.1792770928,
127
- 865.2996605704,
128
- 864.8988771163,
129
- 867.0356240274,
130
- 871.6157440982,
131
- 861.9225322393,
132
- 864.7557130348,
133
- 853.284444198,
134
- 851.7087385877,
135
- 871.482425846,
136
- 866.6122634027,
137
- 852.7157509126,
138
- 859.7938560994,
139
- 874.1682886992,
140
- 855.4589887037,
141
- 850.0205093168,
142
- 875.7282859976,
143
- 865.3647024942,
144
- 856.1797064852,
145
- 867.6238850835,
146
- 857.7097671655,
147
- 874.4978660071,
148
- 857.5650653089,
149
- 890.8852955482,
150
- 855.6426165155,
151
- 859.3456423505,
152
- 857.4854945486,
153
- 880.1901418236,
154
- 849.6103242372,
155
- 871.0458800663,
156
- 877.4244267245,
157
- 875.3479511716,
158
- 859.1269918194,
159
- 857.8015195801,
160
- 868.2750694028,
161
- 868.0957706924,
162
- 870.6012679715,
163
- 862.269673472,
164
- 864.2488571071,
165
- 874.1624601722,
166
- 863.1194231025,
167
- 857.1192986285,
168
- 862.0030926827,
169
- 861.5474187298,
170
- 880.5566205251,
171
- 861.7223684538,
172
- 874.9512628918,
173
- 858.7260910186,
174
- 871.4133525673,
175
- 866.2715335516,
176
- 861.3256361213,
177
- 866.9022358038,
178
- 867.5601382523,
179
- 864.5272121008,
180
- 866.7782194777,
181
- 865.4086246736,
182
- 870.0314924292,
183
- 855.3587976891,
184
- 851.5511568095,
185
- 863.2094645624,
186
- 861.0624318318,
187
- 848.5397354473,
188
- 857.9432204946,
189
- 861.2370229881,
190
- 878.2964116149,
191
- 857.9909782749,
192
- 871.9069179589,
193
- 860.2445059252,
194
- 850.4012745111,
195
- 866.7922558028,
196
- 862.2175409513,
197
- 856.8494155845,
198
- 856.4641060792,
199
- 878.905415424,
200
- 851.8853822745,
201
- 859.2360763272,
202
- 869.1579952553,
203
- 855.2369472583,
204
- 859.2009612357,
205
- 876.2027799847,
206
- 849.6362696273,
207
- 865.1318475963,
208
- 855.8791178271,
209
- 873.3916447336,
210
- 867.1797828548,
211
- 865.1613697328,
212
- 875.1689869302
213
- ],
214
- "model":"gigachat_pro",
215
- "score":31.37,
216
- "lower":29.64,
217
- "upper":33.33,
218
- "avg_tokens":0.0
219
- },
220
- {
221
- "results":[
222
- 726.6208252619,
223
- 738.5741612323,
224
- 734.1011761886,
225
- 729.5571514643,
226
- 728.758372467,
227
- 733.7900136425,
228
- 719.043685497,
229
- 714.8370789545,
230
- 725.8752720444,
231
- 715.266084892,
232
- 727.2017077065,
233
- 739.3798608124,
234
- 719.6304899658,
235
- 734.0546251412,
236
- 718.4924449088,
237
- 721.0729415472,
238
- 738.5699274129,
239
- 723.7105361329,
240
- 728.2971721354,
241
- 737.8461934603,
242
- 748.9971545908,
243
- 713.1462726999,
244
- 720.2960317186,
245
- 727.2517234335,
246
- 694.2654473149,
247
- 735.6639839406,
248
- 730.5016731736,
249
- 734.4551919945,
250
- 728.8931636911,
251
- 717.6726330463,
252
- 733.3721052861,
253
- 725.7981758416,
254
- 731.0409312559,
255
- 715.3647090465,
256
- 737.7875979517,
257
- 729.3512200797,
258
- 715.9010959711,
259
- 722.2116159282,
260
- 724.6752254921,
261
- 718.5749125859,
262
- 723.0132896162,
263
- 732.3587564613,
264
- 740.6268654101,
265
- 724.6297632896,
266
- 743.701641735,
267
- 723.5736702859,
268
- 731.9752231934,
269
- 722.3929635211,
270
- 721.9705147906,
271
- 738.9123529498,
272
- 733.7609432817,
273
- 724.1850017217,
274
- 727.8550112565,
275
- 731.3315308989,
276
- 722.5721295254,
277
- 729.8940208849,
278
- 735.9873637973,
279
- 730.6501947523,
280
- 702.8268457509,
281
- 732.6491227137,
282
- 736.225411771,
283
- 745.6156113918,
284
- 721.0912474577,
285
- 736.2254117629,
286
- 732.9674153867,
287
- 723.0966793643,
288
- 718.0704518208,
289
- 722.2852812675,
290
- 745.1185090985,
291
- 736.9690722951,
292
- 742.6306627437,
293
- 733.1555506911,
294
- 721.7491525609,
295
- 723.0795022704,
296
- 717.9478748234,
297
- 726.703609728,
298
- 725.3073844986,
299
- 722.2116156669,
300
- 720.1865370325,
301
- 731.5240457448,
302
- 737.0781670626,
303
- 708.356058121,
304
- 730.3511179714,
305
- 727.5035049316,
306
- 706.4191731996,
307
- 734.2333848904,
308
- 736.5196621633,
309
- 724.9647865416,
310
- 718.7060814362,
311
- 722.5615781913,
312
- 731.6666527735,
313
- 722.1914533305,
314
- 719.1795542579,
315
- 730.3223324585,
316
- 724.1322488355,
317
- 734.6332090556,
318
- 716.1292305518,
319
- 726.7846008592,
320
- 717.027778133,
321
- 728.6562483681
322
- ],
323
- "model":"gigachat_lite",
324
- "score":17.2,
325
- "lower":15.65,
326
- "upper":18.68,
327
- "avg_tokens":276.0
328
- }
329
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/gen/arena_hard_leaderboard_20240515.json DELETED
@@ -1,329 +0,0 @@
1
- [
2
- {
3
- "results":[
4
- 1000.0,
5
- 1000.0,
6
- 1000.0,
7
- 1000.0,
8
- 1000.0,
9
- 1000.0,
10
- 1000.0,
11
- 1000.0,
12
- 1000.0,
13
- 1000.0,
14
- 1000.0,
15
- 1000.0,
16
- 1000.0,
17
- 1000.0,
18
- 1000.0,
19
- 1000.0,
20
- 1000.0,
21
- 1000.0,
22
- 1000.0,
23
- 1000.0,
24
- 1000.0,
25
- 1000.0,
26
- 1000.0,
27
- 1000.0,
28
- 1000.0,
29
- 1000.0,
30
- 1000.0,
31
- 1000.0,
32
- 1000.0,
33
- 1000.0,
34
- 1000.0,
35
- 1000.0,
36
- 1000.0,
37
- 1000.0,
38
- 1000.0,
39
- 1000.0,
40
- 1000.0,
41
- 1000.0,
42
- 1000.0,
43
- 1000.0,
44
- 1000.0,
45
- 1000.0,
46
- 1000.0,
47
- 1000.0,
48
- 1000.0,
49
- 1000.0,
50
- 1000.0,
51
- 1000.0,
52
- 1000.0,
53
- 1000.0,
54
- 1000.0,
55
- 1000.0,
56
- 1000.0,
57
- 1000.0,
58
- 1000.0,
59
- 1000.0,
60
- 1000.0,
61
- 1000.0,
62
- 1000.0,
63
- 1000.0,
64
- 1000.0,
65
- 1000.0,
66
- 1000.0,
67
- 1000.0,
68
- 1000.0,
69
- 1000.0,
70
- 1000.0,
71
- 1000.0,
72
- 1000.0,
73
- 1000.0,
74
- 1000.0,
75
- 1000.0,
76
- 1000.0,
77
- 1000.0,
78
- 1000.0,
79
- 1000.0,
80
- 1000.0,
81
- 1000.0,
82
- 1000.0,
83
- 1000.0,
84
- 1000.0,
85
- 1000.0,
86
- 1000.0,
87
- 1000.0,
88
- 1000.0,
89
- 1000.0,
90
- 1000.0,
91
- 1000.0,
92
- 1000.0,
93
- 1000.0,
94
- 1000.0,
95
- 1000.0,
96
- 1000.0,
97
- 1000.0,
98
- 1000.0,
99
- 1000.0,
100
- 1000.0,
101
- 1000.0,
102
- 1000.0,
103
- 1000.0
104
- ],
105
- "model":"gpt-3.5-turbo-0125",
106
- "score":50.0,
107
- "lower":50.0,
108
- "upper":50.0,
109
- "avg_tokens":0.0
110
- },
111
- {
112
- "results":[
113
- 855.5644665503,
114
- 859.0709454157,
115
- 865.0434024226,
116
- 860.399655762,
117
- 855.1731508697,
118
- 855.5326400531,
119
- 866.7819454641,
120
- 858.5219875589,
121
- 861.4603125434,
122
- 859.8350548067,
123
- 862.7609222876,
124
- 854.2414273092,
125
- 862.374147169,
126
- 863.1792770928,
127
- 865.2996605704,
128
- 864.8988771163,
129
- 867.0356240274,
130
- 871.6157440982,
131
- 861.9225322393,
132
- 864.7557130348,
133
- 853.284444198,
134
- 851.7087385877,
135
- 871.482425846,
136
- 866.6122634027,
137
- 852.7157509126,
138
- 859.7938560994,
139
- 874.1682886992,
140
- 855.4589887037,
141
- 850.0205093168,
142
- 875.7282859976,
143
- 865.3647024942,
144
- 856.1797064852,
145
- 867.6238850835,
146
- 857.7097671655,
147
- 874.4978660071,
148
- 857.5650653089,
149
- 890.8852955482,
150
- 855.6426165155,
151
- 859.3456423505,
152
- 857.4854945486,
153
- 880.1901418236,
154
- 849.6103242372,
155
- 871.0458800663,
156
- 877.4244267245,
157
- 875.3479511716,
158
- 859.1269918194,
159
- 857.8015195801,
160
- 868.2750694028,
161
- 868.0957706924,
162
- 870.6012679715,
163
- 862.269673472,
164
- 864.2488571071,
165
- 874.1624601722,
166
- 863.1194231025,
167
- 857.1192986285,
168
- 862.0030926827,
169
- 861.5474187298,
170
- 880.5566205251,
171
- 861.7223684538,
172
- 874.9512628918,
173
- 858.7260910186,
174
- 871.4133525673,
175
- 866.2715335516,
176
- 861.3256361213,
177
- 866.9022358038,
178
- 867.5601382523,
179
- 864.5272121008,
180
- 866.7782194777,
181
- 865.4086246736,
182
- 870.0314924292,
183
- 855.3587976891,
184
- 851.5511568095,
185
- 863.2094645624,
186
- 861.0624318318,
187
- 848.5397354473,
188
- 857.9432204946,
189
- 861.2370229881,
190
- 878.2964116149,
191
- 857.9909782749,
192
- 871.9069179589,
193
- 860.2445059252,
194
- 850.4012745111,
195
- 866.7922558028,
196
- 862.2175409513,
197
- 856.8494155845,
198
- 856.4641060792,
199
- 878.905415424,
200
- 851.8853822745,
201
- 859.2360763272,
202
- 869.1579952553,
203
- 855.2369472583,
204
- 859.2009612357,
205
- 876.2027799847,
206
- 849.6362696273,
207
- 865.1318475963,
208
- 855.8791178271,
209
- 873.3916447336,
210
- 867.1797828548,
211
- 865.1613697328,
212
- 875.1689869302
213
- ],
214
- "model":"gigachat_pro",
215
- "score":31.37,
216
- "lower":29.64,
217
- "upper":33.33,
218
- "avg_tokens":0.0
219
- },
220
- {
221
- "results":[
222
- 726.6208252619,
223
- 738.5741612323,
224
- 734.1011761886,
225
- 729.5571514643,
226
- 728.758372467,
227
- 733.7900136425,
228
- 719.043685497,
229
- 714.8370789545,
230
- 725.8752720444,
231
- 715.266084892,
232
- 727.2017077065,
233
- 739.3798608124,
234
- 719.6304899658,
235
- 734.0546251412,
236
- 718.4924449088,
237
- 721.0729415472,
238
- 738.5699274129,
239
- 723.7105361329,
240
- 728.2971721354,
241
- 737.8461934603,
242
- 748.9971545908,
243
- 713.1462726999,
244
- 720.2960317186,
245
- 727.2517234335,
246
- 694.2654473149,
247
- 735.6639839406,
248
- 730.5016731736,
249
- 734.4551919945,
250
- 728.8931636911,
251
- 717.6726330463,
252
- 733.3721052861,
253
- 725.7981758416,
254
- 731.0409312559,
255
- 715.3647090465,
256
- 737.7875979517,
257
- 729.3512200797,
258
- 715.9010959711,
259
- 722.2116159282,
260
- 724.6752254921,
261
- 718.5749125859,
262
- 723.0132896162,
263
- 732.3587564613,
264
- 740.6268654101,
265
- 724.6297632896,
266
- 743.701641735,
267
- 723.5736702859,
268
- 731.9752231934,
269
- 722.3929635211,
270
- 721.9705147906,
271
- 738.9123529498,
272
- 733.7609432817,
273
- 724.1850017217,
274
- 727.8550112565,
275
- 731.3315308989,
276
- 722.5721295254,
277
- 729.8940208849,
278
- 735.9873637973,
279
- 730.6501947523,
280
- 702.8268457509,
281
- 732.6491227137,
282
- 736.225411771,
283
- 745.6156113918,
284
- 721.0912474577,
285
- 736.2254117629,
286
- 732.9674153867,
287
- 723.0966793643,
288
- 718.0704518208,
289
- 722.2852812675,
290
- 745.1185090985,
291
- 736.9690722951,
292
- 742.6306627437,
293
- 733.1555506911,
294
- 721.7491525609,
295
- 723.0795022704,
296
- 717.9478748234,
297
- 726.703609728,
298
- 725.3073844986,
299
- 722.2116156669,
300
- 720.1865370325,
301
- 731.5240457448,
302
- 737.0781670626,
303
- 708.356058121,
304
- 730.3511179714,
305
- 727.5035049316,
306
- 706.4191731996,
307
- 734.2333848904,
308
- 736.5196621633,
309
- 724.9647865416,
310
- 718.7060814362,
311
- 722.5615781913,
312
- 731.6666527735,
313
- 722.1914533305,
314
- 719.1795542579,
315
- 730.3223324585,
316
- 724.1322488355,
317
- 734.6332090556,
318
- 716.1292305518,
319
- 726.7846008592,
320
- 717.027778133,
321
- 728.6562483681
322
- ],
323
- "model":"gigachat_lite",
324
- "score":17.2,
325
- "lower":15.65,
326
- "upper":18.68,
327
- "avg_tokens":276.0
328
- }
329
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/gen/show_result.py CHANGED
@@ -263,13 +263,13 @@ if __name__ == "__main__":
263
  huggingface_hub.HfApi().upload_file(
264
  path_or_fileobj=json_file_name,
265
  path_in_repo="data/leaderboard.json",
266
- repo_id="Vikhrmodels/leaderboard",
267
  repo_type="space",
268
  )
269
 
270
  huggingface_hub.HfApi().upload_file(
271
  path_or_fileobj=json_file_name,
272
  path_in_repo=f"data/leaderboard_logs/{json_file_name}",
273
- repo_id="Vikhrmodels/leaderboard",
274
  repo_type="dataset",
275
  )
 
263
  huggingface_hub.HfApi().upload_file(
264
  path_or_fileobj=json_file_name,
265
  path_in_repo="data/leaderboard.json",
266
+ repo_id="Vikhrmodels/arena-leaderboard-metainfo",
267
  repo_type="space",
268
  )
269
 
270
  huggingface_hub.HfApi().upload_file(
271
  path_or_fileobj=json_file_name,
272
  path_in_repo=f"data/leaderboard_logs/{json_file_name}",
273
+ repo_id="Vikhrmodels/arena-leaderboard-metainfo",
274
  repo_type="dataset",
275
  )
src/leaderboard/build_leaderboard.py CHANGED
@@ -1,13 +1,12 @@
1
  import json
2
  import logging
3
  import os
4
- import subprocess
5
  import time
6
 
7
  import pandas as pd
8
  from huggingface_hub import snapshot_download
9
 
10
- from src.envs import DATA_ARENA_PATH, HF_HOME
11
 
12
  # Configure logging
13
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
@@ -55,21 +54,15 @@ def download_openbench():
55
  """Downloads pre generated data"""
56
  os.makedirs(DATA_ARENA_PATH, exist_ok=True)
57
 
 
 
 
58
  # download answers of different models that we trust
59
  download_dataset("Vikhrmodels/openbench-eval", DATA_ARENA_PATH)
60
 
61
- print("\nInternal models in openbench-eval:")
62
- subprocess.run(["ls", f"{DATA_ARENA_PATH}/model_answers/internal/"], check=False)
63
-
64
- print("\nExternal models in openbench-eval:")
65
- subprocess.run(["ls", f"{DATA_ARENA_PATH}/model_answers/external/"], check=False)
66
-
67
- print("\nJudgement in openbench-eval")
68
- subprocess.run(["ls", f"{DATA_ARENA_PATH}/model_judgement/gpt-4-1106-preview"], check=False)
69
-
70
 
71
  def build_leadearboard_df():
72
  # Retrieve the leaderboard DataFrame
73
- with open(f"{HF_HOME}/data/leaderboard.json", "r", encoding="utf-8") as eval_file:
74
  leaderboard_df = pd.DataFrame.from_records(json.load(eval_file))
75
  return leaderboard_df.copy()
 
1
  import json
2
  import logging
3
  import os
 
4
  import time
5
 
6
  import pandas as pd
7
  from huggingface_hub import snapshot_download
8
 
9
+ from src.envs import DATA_ARENA_PATH, DATA_PATH
10
 
11
  # Configure logging
12
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 
54
  """Downloads pre generated data"""
55
  os.makedirs(DATA_ARENA_PATH, exist_ok=True)
56
 
57
+ # download prev autogenerated leaderboard files
58
+ download_dataset("Vikhrmodels/arena-leaderboard-metainfo", DATA_PATH)
59
+
60
  # download answers of different models that we trust
61
  download_dataset("Vikhrmodels/openbench-eval", DATA_ARENA_PATH)
62
 
 
 
 
 
 
 
 
 
 
63
 
64
  def build_leadearboard_df():
65
  # Retrieve the leaderboard DataFrame
66
+ with open(f"{DATA_PATH}/leaderboard.json", "r", encoding="utf-8") as eval_file:
67
  leaderboard_df = pd.DataFrame.from_records(json.load(eval_file))
68
  return leaderboard_df.copy()