Spestly committed on
Commit
6cb0ca9
·
verified ·
1 Parent(s): 9f2df50

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +698 -377
index.html CHANGED
@@ -1,732 +1,1053 @@
1
  <style type="text/css">
2
- #T_a7af5 td {
3
  overflow-wrap: break-word;
4
  max-width: 1px;
5
  }
6
- #T_a7af5 .col_heading {
7
- width: 25.0%;
8
  }
9
- #T_a7af5_row15_col0, #T_a7af5_row118_col1 {
10
  background-color: #f7cbe4;
11
  color: #000000;
12
  }
13
- #T_a7af5_row15_col1, #T_a7af5_row31_col3 {
14
  background-color: #f9eff4;
15
  color: #000000;
16
  }
17
- #T_a7af5_row15_col2, #T_a7af5_row58_col3, #T_a7af5_row109_col0 {
 
 
 
 
18
  background-color: #f4bfdf;
19
  color: #000000;
20
  }
21
- #T_a7af5_row15_col3, #T_a7af5_row28_col2, #T_a7af5_row46_col2, #T_a7af5_row85_col0, #T_a7af5_row107_col3 {
22
  background-color: #f6c7e3;
23
  color: #000000;
24
  }
25
- #T_a7af5_row19_col0 {
 
 
 
 
26
  background-color: #a9d874;
27
  color: #000000;
28
  }
29
- #T_a7af5_row19_col1 {
30
  background-color: #549825;
31
  color: #f1f1f1;
32
  }
33
- #T_a7af5_row19_col2, #T_a7af5_row70_col0, #T_a7af5_row115_col0 {
 
 
 
 
34
  background-color: #f7f7f6;
35
  color: #000000;
36
  }
37
- #T_a7af5_row19_col3 {
38
  background-color: #f1f6ea;
39
  color: #000000;
40
  }
41
- #T_a7af5_row22_col0 {
 
 
 
 
 
 
 
 
42
  background-color: #f5f7f3;
43
  color: #000000;
44
  }
45
- #T_a7af5_row22_col1 {
46
  background-color: #edf6df;
47
  color: #000000;
48
  }
49
- #T_a7af5_row22_col2, #T_a7af5_row22_col3, #T_a7af5_row37_col2, #T_a7af5_row70_col1 {
 
 
 
 
50
  background-color: #f4f7f0;
51
  color: #000000;
52
  }
53
- #T_a7af5_row25_col0, #T_a7af5_row100_col2, #T_a7af5_row145_col1 {
 
 
 
 
 
 
 
 
54
  background-color: #eeabd2;
55
  color: #000000;
56
  }
57
- #T_a7af5_row25_col1 {
58
  background-color: #eff6e4;
59
  color: #000000;
60
  }
61
- #T_a7af5_row25_col2, #T_a7af5_row40_col2 {
 
 
 
 
62
  background-color: #df7cb1;
63
  color: #f1f1f1;
64
  }
65
- #T_a7af5_row25_col3 {
66
  background-color: #fbd9ec;
67
  color: #000000;
68
  }
69
- #T_a7af5_row28_col0, #T_a7af5_row130_col0 {
 
 
 
 
 
 
 
 
70
  background-color: #fbe7f2;
71
  color: #000000;
72
  }
73
- #T_a7af5_row28_col1, #T_a7af5_row73_col1 {
74
  background-color: #ecf6de;
75
  color: #000000;
76
  }
77
- #T_a7af5_row28_col3 {
 
 
 
 
78
  background-color: #f2badc;
79
  color: #000000;
80
  }
81
- #T_a7af5_row31_col0, #T_a7af5_row105_col3 {
 
 
 
 
 
 
 
 
82
  background-color: #f9f1f5;
83
  color: #000000;
84
  }
85
- #T_a7af5_row31_col1, #T_a7af5_row37_col1 {
86
  background-color: #e8f5d5;
87
  color: #000000;
88
  }
89
- #T_a7af5_row31_col2, #T_a7af5_row40_col1 {
90
- background-color: #f9eef4;
 
 
 
 
91
  color: #000000;
92
  }
93
- #T_a7af5_row34_col0, #T_a7af5_row43_col0, #T_a7af5_row124_col1 {
94
  background-color: #d24c97;
95
  color: #f1f1f1;
96
  }
97
- #T_a7af5_row34_col1 {
98
  background-color: #faeaf2;
99
  color: #000000;
100
  }
101
- #T_a7af5_row34_col2, #T_a7af5_row67_col3 {
 
 
 
 
102
  background-color: #e283b7;
103
  color: #f1f1f1;
104
  }
105
- #T_a7af5_row34_col3 {
106
- background-color: #fbe6f1;
107
  color: #000000;
108
  }
109
- #T_a7af5_row37_col0, #T_a7af5_row49_col0 {
 
 
 
 
110
  background-color: #f7f6f7;
111
  color: #000000;
112
  }
113
- #T_a7af5_row37_col3, #T_a7af5_row46_col3, #T_a7af5_row52_col3 {
114
- background-color: #f8f3f6;
115
  color: #000000;
116
  }
117
- #T_a7af5_row40_col0 {
 
 
 
 
118
  background-color: #e388ba;
119
  color: #f1f1f1;
120
  }
121
- #T_a7af5_row40_col3 {
 
 
 
 
122
  background-color: #eeadd4;
123
  color: #000000;
124
  }
125
- #T_a7af5_row43_col1 {
 
 
 
 
 
 
 
 
126
  background-color: #f6c9e3;
127
  color: #000000;
128
  }
129
- #T_a7af5_row43_col2, #T_a7af5_row67_col2, #T_a7af5_row79_col1 {
 
 
 
 
130
  background-color: #cf4191;
131
  color: #f1f1f1;
132
  }
133
- #T_a7af5_row43_col3 {
134
  background-color: #d34f99;
135
  color: #f1f1f1;
136
  }
137
- #T_a7af5_row46_col0 {
 
 
 
 
 
 
 
 
138
  background-color: #fde2f0;
139
  color: #000000;
140
  }
141
- #T_a7af5_row46_col1 {
142
  background-color: #bbe28a;
143
  color: #000000;
144
  }
145
- #T_a7af5_row49_col1 {
 
 
 
 
 
 
 
 
146
  background-color: #e9f5d6;
147
  color: #000000;
148
  }
149
- #T_a7af5_row49_col2, #T_a7af5_row67_col1, #T_a7af5_row148_col0 {
 
 
 
 
150
  background-color: #ea9fca;
151
  color: #000000;
152
  }
153
- #T_a7af5_row49_col3 {
154
  background-color: #f0b2d7;
155
  color: #000000;
156
  }
157
- #T_a7af5_row52_col0, #T_a7af5_row142_col0 {
158
- background-color: #f6f7f5;
159
  color: #000000;
160
  }
161
- #T_a7af5_row52_col1 {
162
  background-color: #c0e593;
163
  color: #000000;
164
  }
165
- #T_a7af5_row52_col2, #T_a7af5_row58_col1, #T_a7af5_row64_col1, #T_a7af5_row109_col2 {
166
  background-color: #fce5f1;
167
  color: #000000;
168
  }
169
- #T_a7af5_row55_col0, #T_a7af5_row90_col1 {
170
- background-color: #f5c6e2;
171
  color: #000000;
172
  }
173
- #T_a7af5_row55_col1 {
174
- background-color: #fad6ea;
175
- color: #000000;
176
- }
177
- #T_a7af5_row55_col2 {
178
- background-color: #f8cee6;
179
  color: #000000;
180
  }
181
- #T_a7af5_row55_col3 {
182
  background-color: #c82884;
183
  color: #f1f1f1;
184
  }
185
- #T_a7af5_row58_col0, #T_a7af5_row121_col1 {
 
 
 
 
186
  background-color: #e897c4;
187
  color: #000000;
188
  }
189
- #T_a7af5_row58_col2, #T_a7af5_row79_col0, #T_a7af5_row107_col0 {
190
  background-color: #e07eb3;
191
  color: #f1f1f1;
192
  }
193
- #T_a7af5_row61_col0 {
 
 
 
 
194
  background-color: #eba3cd;
195
  color: #000000;
196
  }
197
- #T_a7af5_row61_col1, #T_a7af5_row88_col0, #T_a7af5_row88_col1, #T_a7af5_row88_col2, #T_a7af5_row88_col3, #T_a7af5_row109_col3 {
198
  background-color: #f8f4f6;
199
  color: #000000;
200
  }
201
- #T_a7af5_row61_col2, #T_a7af5_row136_col0 {
 
 
 
 
202
  background-color: #dc70aa;
203
  color: #f1f1f1;
204
  }
205
- #T_a7af5_row61_col3 {
206
- background-color: #d861a2;
207
- color: #f1f1f1;
208
  }
209
- #T_a7af5_row64_col0, #T_a7af5_row145_col3 {
210
  background-color: #e181b5;
211
  color: #f1f1f1;
212
  }
213
- #T_a7af5_row64_col2, #T_a7af5_row94_col3, #T_a7af5_row136_col3 {
214
  background-color: #e48bbc;
215
  color: #f1f1f1;
216
  }
217
- #T_a7af5_row64_col3, #T_a7af5_row138_col3, #T_a7af5_row148_col2 {
218
  background-color: #f9d1e8;
219
  color: #000000;
220
  }
221
- #T_a7af5_row67_col0, #T_a7af5_row76_col1 {
222
  background-color: #cc368b;
223
  color: #f1f1f1;
224
  }
225
- #T_a7af5_row70_col2, #T_a7af5_row73_col0 {
226
- background-color: #f0f6e7;
227
  color: #000000;
228
  }
229
- #T_a7af5_row70_col3, #T_a7af5_row73_col3 {
230
- background-color: #f8f2f5;
231
  color: #000000;
232
  }
233
- #T_a7af5_row73_col2 {
234
- background-color: #f3f6ed;
 
 
 
 
 
 
 
 
 
 
 
 
235
  color: #000000;
236
  }
237
- #T_a7af5_row76_col0, #T_a7af5_row133_col0 {
238
  background-color: #b51370;
239
  color: #f1f1f1;
240
  }
241
- #T_a7af5_row76_col2, #T_a7af5_row97_col2, #T_a7af5_row145_col0 {
242
- background-color: #e590bf;
243
- color: #f1f1f1;
244
  }
245
- #T_a7af5_row76_col3 {
246
  background-color: #e692c1;
247
  color: #000000;
248
  }
249
- #T_a7af5_row79_col2 {
 
 
 
 
 
 
 
 
250
  background-color: #e286b8;
251
  color: #f1f1f1;
252
  }
253
- #T_a7af5_row79_col3 {
254
  background-color: #c72482;
255
  color: #f1f1f1;
256
  }
257
- #T_a7af5_row82_col0 {
 
 
 
 
258
  background-color: #d65a9f;
259
  color: #f1f1f1;
260
  }
261
- #T_a7af5_row82_col1, #T_a7af5_row82_col3, #T_a7af5_row136_col2 {
262
  background-color: #c92b86;
263
  color: #f1f1f1;
264
  }
265
- #T_a7af5_row82_col2 {
266
- background-color: #d14895;
267
  color: #f1f1f1;
268
  }
269
- #T_a7af5_row85_col1, #T_a7af5_row94_col1 {
270
  background-color: #f3bdde;
271
  color: #000000;
272
  }
273
- #T_a7af5_row85_col2, #T_a7af5_row100_col1 {
 
 
 
 
274
  background-color: #f5c2e0;
275
  color: #000000;
276
  }
277
- #T_a7af5_row85_col3, #T_a7af5_row100_col3 {
278
  background-color: #f1b7da;
279
  color: #000000;
280
  }
281
- #T_a7af5_row90_col0, #T_a7af5_row105_col0, #T_a7af5_row136_col1 {
282
  background-color: #f1b5d9;
283
  color: #000000;
284
  }
285
- #T_a7af5_row90_col2 {
286
- background-color: #e89ac6;
287
  color: #000000;
288
  }
289
- #T_a7af5_row90_col3, #T_a7af5_row107_col2 {
290
  background-color: #eda8d1;
291
  color: #000000;
292
  }
293
- #T_a7af5_row94_col0, #T_a7af5_row127_col0 {
294
- background-color: #eba1cb;
295
- color: #000000;
296
- }
297
- #T_a7af5_row94_col2, #T_a7af5_row145_col2 {
298
  background-color: #e58dbe;
299
  color: #f1f1f1;
300
  }
301
- #T_a7af5_row97_col0, #T_a7af5_row107_col1 {
302
- background-color: #f3bcdd;
303
- color: #000000;
304
- }
305
- #T_a7af5_row97_col1 {
306
- background-color: #f7cce5;
307
- color: #000000;
308
- }
309
- #T_a7af5_row97_col3, #T_a7af5_row103_col2 {
310
  background-color: #eca6cf;
311
  color: #000000;
312
  }
313
- #T_a7af5_row100_col0 {
314
- background-color: #efb0d6;
315
- color: #000000;
316
- }
317
- #T_a7af5_row103_col0 {
318
  background-color: #db6ca8;
319
  color: #f1f1f1;
320
  }
321
- #T_a7af5_row103_col1 {
322
  background-color: #e795c3;
323
  color: #000000;
324
  }
325
- #T_a7af5_row103_col3, #T_a7af5_row138_col0 {
326
  background-color: #f5c4e1;
327
  color: #000000;
328
  }
329
- #T_a7af5_row105_col1 {
 
 
 
 
 
 
 
 
330
  background-color: #fbd8eb;
331
  color: #000000;
332
  }
333
- #T_a7af5_row105_col2, #T_a7af5_row127_col1 {
334
  background-color: #fce4f0;
335
  color: #000000;
336
  }
337
- #T_a7af5_row109_col1 {
338
- background-color: #fbe9f2;
339
- color: #000000;
340
  }
341
- #T_a7af5_row111_col0 {
342
- background-color: #e99cc8;
343
  color: #000000;
344
  }
345
- #T_a7af5_row111_col1, #T_a7af5_row138_col1 {
346
- background-color: #fcdbed;
 
 
 
 
347
  color: #000000;
348
  }
349
- #T_a7af5_row111_col2, #T_a7af5_row111_col3, #T_a7af5_row115_col2, #T_a7af5_row115_col3, #T_a7af5_row118_col2, #T_a7af5_row118_col3, #T_a7af5_row121_col2, #T_a7af5_row121_col3, #T_a7af5_row124_col2, #T_a7af5_row124_col3, #T_a7af5_row127_col2, #T_a7af5_row127_col3, #T_a7af5_row130_col2, #T_a7af5_row130_col3, #T_a7af5_row133_col2, #T_a7af5_row133_col3 {
 
 
 
 
 
 
 
 
 
 
 
 
350
  background-color: #8e0152;
351
  color: #f1f1f1;
352
  }
353
- #T_a7af5_row115_col1 {
 
 
 
 
354
  background-color: #ddf1c1;
355
  color: #000000;
356
  }
357
- #T_a7af5_row118_col0 {
 
 
 
 
 
 
 
 
358
  background-color: #d4539b;
359
  color: #f1f1f1;
360
  }
361
- #T_a7af5_row121_col0 {
362
- background-color: #ca2f88;
 
 
 
 
363
  color: #f1f1f1;
364
  }
365
- #T_a7af5_row124_col0 {
 
 
 
 
366
  background-color: #b1116d;
367
  color: #f1f1f1;
368
  }
369
- #T_a7af5_row130_col1 {
 
 
 
 
 
 
 
 
 
 
 
 
370
  background-color: #d0ecad;
371
  color: #000000;
372
  }
373
- #T_a7af5_row133_col1 {
 
 
 
 
374
  background-color: #cb3289;
375
  color: #f1f1f1;
376
  }
377
- #T_a7af5_row138_col2 {
378
- background-color: #f8d0e7;
379
  color: #000000;
380
  }
381
- #T_a7af5_row142_col1 {
382
  background-color: #f3f7ef;
383
  color: #000000;
384
  }
385
- #T_a7af5_row142_col2, #T_a7af5_row142_col3 {
386
  background-color: #f8f5f6;
387
  color: #000000;
388
  }
389
- #T_a7af5_row148_col1 {
390
- background-color: #fad4e9;
391
- color: #000000;
392
- }
393
- #T_a7af5_row148_col3 {
394
- background-color: #fcdded;
395
- color: #000000;
396
- }
397
  </style>
398
- <table id="T_a7af5">
399
  <thead>
400
  <tr>
401
  <th class="blank level0" >&nbsp;</th>
402
- <th id="T_a7af5_level0_col0" class="col_heading level0 col0" >Spestly/Atlas-Pro-1.5B-Preview</th>
403
- <th id="T_a7af5_level0_col1" class="col_heading level0 col1" >Spestly/Atlas-Pro-7B-Preview</th>
404
- <th id="T_a7af5_level0_col2" class="col_heading level0 col2" >deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B</th>
405
- <th id="T_a7af5_level0_col3" class="col_heading level0 col3" >deepseek-ai/DeepSeek-R1-Distill-Qwen-7B</th>
 
 
 
406
  </tr>
407
  </thead>
408
  <tbody>
409
  <tr>
410
- <th id="T_a7af5_level0_row15" class="row_heading level0 row15" >bbh.acc_norm</th>
411
- <td id="T_a7af5_row15_col0" class="data row15 col0" >0.348030</td>
412
- <td id="T_a7af5_row15_col1" class="data row15 col1" >0.465891</td>
413
- <td id="T_a7af5_row15_col2" class="data row15 col2" >0.321298</td>
414
- <td id="T_a7af5_row15_col3" class="data row15 col3" >0.341087</td>
 
 
 
415
  </tr>
416
  <tr>
417
- <th id="T_a7af5_level0_row19" class="row_heading level0 row19" >bbh_boolean_expressions.acc_norm</th>
418
- <td id="T_a7af5_row19_col0" class="data row19 col0" >0.724000</td>
419
- <td id="T_a7af5_row19_col1" class="data row19 col1" >0.884000</td>
420
- <td id="T_a7af5_row19_col2" class="data row19 col2" >0.500000</td>
421
- <td id="T_a7af5_row19_col3" class="data row19 col3" >0.532000</td>
 
 
 
422
  </tr>
423
  <tr>
424
- <th id="T_a7af5_level0_row22" class="row_heading level0 row22" >bbh_causal_judgement.acc_norm</th>
425
- <td id="T_a7af5_row22_col0" class="data row22 col0" >0.508021</td>
426
- <td id="T_a7af5_row22_col1" class="data row22 col1" >0.561497</td>
427
- <td id="T_a7af5_row22_col2" class="data row22 col2" >0.518717</td>
428
- <td id="T_a7af5_row22_col3" class="data row22 col3" >0.518717</td>
 
 
 
429
  </tr>
430
  <tr>
431
- <th id="T_a7af5_level0_row25" class="row_heading level0 row25" >bbh_date_understanding.acc_norm</th>
432
- <td id="T_a7af5_row25_col0" class="data row25 col0" >0.284000</td>
433
- <td id="T_a7af5_row25_col1" class="data row25 col1" >0.548000</td>
434
- <td id="T_a7af5_row25_col2" class="data row25 col2" >0.208000</td>
435
- <td id="T_a7af5_row25_col3" class="data row25 col3" >0.384000</td>
 
 
 
436
  </tr>
437
  <tr>
438
- <th id="T_a7af5_level0_row28" class="row_heading level0 row28" >bbh_disambiguation_qa.acc_norm</th>
439
- <td id="T_a7af5_row28_col0" class="data row28 col0" >0.432000</td>
440
- <td id="T_a7af5_row28_col1" class="data row28 col1" >0.564000</td>
441
- <td id="T_a7af5_row28_col2" class="data row28 col2" >0.340000</td>
442
- <td id="T_a7af5_row28_col3" class="data row28 col3" >0.312000</td>
 
 
 
443
  </tr>
444
  <tr>
445
- <th id="T_a7af5_level0_row31" class="row_heading level0 row31" >bbh_formal_fallacies.acc_norm</th>
446
- <td id="T_a7af5_row31_col0" class="data row31 col0" >0.476000</td>
447
- <td id="T_a7af5_row31_col1" class="data row31 col1" >0.588000</td>
448
- <td id="T_a7af5_row31_col2" class="data row31 col2" >0.464000</td>
449
- <td id="T_a7af5_row31_col3" class="data row31 col3" >0.468000</td>
 
 
 
450
  </tr>
451
  <tr>
452
- <th id="T_a7af5_level0_row34" class="row_heading level0 row34" >bbh_geometric_shapes.acc_norm</th>
453
- <td id="T_a7af5_row34_col0" class="data row34 col0" >0.156000</td>
454
- <td id="T_a7af5_row34_col1" class="data row34 col1" >0.444000</td>
455
- <td id="T_a7af5_row34_col2" class="data row34 col2" >0.220000</td>
456
- <td id="T_a7af5_row34_col3" class="data row34 col3" >0.428000</td>
 
 
 
457
  </tr>
458
  <tr>
459
- <th id="T_a7af5_level0_row37" class="row_heading level0 row37" >bbh_hyperbaton.acc_norm</th>
460
- <td id="T_a7af5_row37_col0" class="data row37 col0" >0.496000</td>
461
- <td id="T_a7af5_row37_col1" class="data row37 col1" >0.588000</td>
462
- <td id="T_a7af5_row37_col2" class="data row37 col2" >0.516000</td>
463
- <td id="T_a7af5_row37_col3" class="data row37 col3" >0.484000</td>
 
 
 
464
  </tr>
465
  <tr>
466
- <th id="T_a7af5_level0_row40" class="row_heading level0 row40" >bbh_logical_deduction_five_objects.acc_norm</th>
467
- <td id="T_a7af5_row40_col0" class="data row40 col0" >0.228000</td>
468
- <td id="T_a7af5_row40_col1" class="data row40 col1" >0.464000</td>
469
- <td id="T_a7af5_row40_col2" class="data row40 col2" >0.208000</td>
470
- <td id="T_a7af5_row40_col3" class="data row40 col3" >0.288000</td>
 
 
 
471
  </tr>
472
  <tr>
473
- <th id="T_a7af5_level0_row43" class="row_heading level0 row43" >bbh_logical_deduction_seven_objects.acc_norm</th>
474
- <td id="T_a7af5_row43_col0" class="data row43 col0" >0.156000</td>
475
- <td id="T_a7af5_row43_col1" class="data row43 col1" >0.344000</td>
476
- <td id="T_a7af5_row43_col2" class="data row43 col2" >0.144000</td>
477
- <td id="T_a7af5_row43_col3" class="data row43 col3" >0.160000</td>
 
 
 
478
  </tr>
479
  <tr>
480
- <th id="T_a7af5_level0_row46" class="row_heading level0 row46" >bbh_logical_deduction_three_objects.acc_norm</th>
481
- <td id="T_a7af5_row46_col0" class="data row46 col0" >0.408000</td>
482
- <td id="T_a7af5_row46_col1" class="data row46 col1" >0.692000</td>
483
- <td id="T_a7af5_row46_col2" class="data row46 col2" >0.340000</td>
484
- <td id="T_a7af5_row46_col3" class="data row46 col3" >0.484000</td>
 
 
 
485
  </tr>
486
  <tr>
487
- <th id="T_a7af5_level0_row49" class="row_heading level0 row49" >bbh_movie_recommendation.acc_norm</th>
488
- <td id="T_a7af5_row49_col0" class="data row49 col0" >0.496000</td>
489
- <td id="T_a7af5_row49_col1" class="data row49 col1" >0.584000</td>
490
- <td id="T_a7af5_row49_col2" class="data row49 col2" >0.264000</td>
491
- <td id="T_a7af5_row49_col3" class="data row49 col3" >0.296000</td>
 
 
 
492
  </tr>
493
  <tr>
494
- <th id="T_a7af5_level0_row52" class="row_heading level0 row52" >bbh_navigate.acc_norm</th>
495
- <td id="T_a7af5_row52_col0" class="data row52 col0" >0.504000</td>
496
- <td id="T_a7af5_row52_col1" class="data row52 col1" >0.680000</td>
497
- <td id="T_a7af5_row52_col2" class="data row52 col2" >0.420000</td>
498
- <td id="T_a7af5_row52_col3" class="data row52 col3" >0.484000</td>
 
 
 
499
  </tr>
500
  <tr>
501
- <th id="T_a7af5_level0_row55" class="row_heading level0 row55" >bbh_object_counting.acc_norm</th>
502
- <td id="T_a7af5_row55_col0" class="data row55 col0" >0.336000</td>
503
- <td id="T_a7af5_row55_col1" class="data row55 col1" >0.376000</td>
504
- <td id="T_a7af5_row55_col2" class="data row55 col2" >0.356000</td>
505
- <td id="T_a7af5_row55_col3" class="data row55 col3" >0.116000</td>
 
 
 
506
  </tr>
507
  <tr>
508
- <th id="T_a7af5_level0_row58" class="row_heading level0 row58" >bbh_penguins_in_a_table.acc_norm</th>
509
- <td id="T_a7af5_row58_col0" class="data row58 col0" >0.253425</td>
510
- <td id="T_a7af5_row58_col1" class="data row58 col1" >0.424658</td>
511
- <td id="T_a7af5_row58_col2" class="data row58 col2" >0.212329</td>
512
- <td id="T_a7af5_row58_col3" class="data row58 col3" >0.321918</td>
 
 
 
513
  </tr>
514
  <tr>
515
- <th id="T_a7af5_level0_row61" class="row_heading level0 row61" >bbh_reasoning_about_colored_objects.acc_norm</th>
516
- <td id="T_a7af5_row61_col0" class="data row61 col0" >0.272000</td>
517
- <td id="T_a7af5_row61_col1" class="data row61 col1" >0.488000</td>
518
- <td id="T_a7af5_row61_col2" class="data row61 col2" >0.192000</td>
519
- <td id="T_a7af5_row61_col3" class="data row61 col3" >0.176000</td>
 
 
 
520
  </tr>
521
  <tr>
522
- <th id="T_a7af5_level0_row64" class="row_heading level0 row64" >bbh_ruin_names.acc_norm</th>
523
- <td id="T_a7af5_row64_col0" class="data row64 col0" >0.216000</td>
524
- <td id="T_a7af5_row64_col1" class="data row64 col1" >0.424000</td>
525
- <td id="T_a7af5_row64_col2" class="data row64 col2" >0.232000</td>
526
- <td id="T_a7af5_row64_col3" class="data row64 col3" >0.364000</td>
 
 
 
527
  </tr>
528
  <tr>
529
- <th id="T_a7af5_level0_row67" class="row_heading level0 row67" >bbh_salient_translation_error_detection.acc_norm</th>
530
- <td id="T_a7af5_row67_col0" class="data row67 col0" >0.132000</td>
531
- <td id="T_a7af5_row67_col1" class="data row67 col1" >0.264000</td>
532
- <td id="T_a7af5_row67_col2" class="data row67 col2" >0.144000</td>
533
- <td id="T_a7af5_row67_col3" class="data row67 col3" >0.220000</td>
 
 
 
534
  </tr>
535
  <tr>
536
- <th id="T_a7af5_level0_row70" class="row_heading level0 row70" >bbh_snarks.acc_norm</th>
537
- <td id="T_a7af5_row70_col0" class="data row70 col0" >0.500000</td>
538
- <td id="T_a7af5_row70_col1" class="data row70 col1" >0.516854</td>
539
- <td id="T_a7af5_row70_col2" class="data row70 col2" >0.539326</td>
540
- <td id="T_a7af5_row70_col3" class="data row70 col3" >0.477528</td>
 
 
 
541
  </tr>
542
  <tr>
543
- <th id="T_a7af5_level0_row73" class="row_heading level0 row73" >bbh_sports_understanding.acc_norm</th>
544
- <td id="T_a7af5_row73_col0" class="data row73 col0" >0.540000</td>
545
- <td id="T_a7af5_row73_col1" class="data row73 col1" >0.564000</td>
546
- <td id="T_a7af5_row73_col2" class="data row73 col2" >0.524000</td>
547
- <td id="T_a7af5_row73_col3" class="data row73 col3" >0.480000</td>
 
 
 
548
  </tr>
549
  <tr>
550
- <th id="T_a7af5_level0_row76" class="row_heading level0 row76" >bbh_temporal_sequences.acc_norm</th>
551
- <td id="T_a7af5_row76_col0" class="data row76 col0" >0.072000</td>
552
- <td id="T_a7af5_row76_col1" class="data row76 col1" >0.132000</td>
553
- <td id="T_a7af5_row76_col2" class="data row76 col2" >0.240000</td>
554
- <td id="T_a7af5_row76_col3" class="data row76 col3" >0.244000</td>
 
 
 
555
  </tr>
556
  <tr>
557
- <th id="T_a7af5_level0_row79" class="row_heading level0 row79" >bbh_tracking_shuffled_objects_five_objects.acc_norm</th>
558
- <td id="T_a7af5_row79_col0" class="data row79 col0" >0.212000</td>
559
- <td id="T_a7af5_row79_col1" class="data row79 col1" >0.144000</td>
560
- <td id="T_a7af5_row79_col2" class="data row79 col2" >0.224000</td>
561
- <td id="T_a7af5_row79_col3" class="data row79 col3" >0.112000</td>
 
 
 
562
  </tr>
563
  <tr>
564
- <th id="T_a7af5_level0_row82" class="row_heading level0 row82" >bbh_tracking_shuffled_objects_seven_objects.acc_norm</th>
565
- <td id="T_a7af5_row82_col0" class="data row82 col0" >0.168000</td>
566
- <td id="T_a7af5_row82_col1" class="data row82 col1" >0.120000</td>
567
- <td id="T_a7af5_row82_col2" class="data row82 col2" >0.152000</td>
568
- <td id="T_a7af5_row82_col3" class="data row82 col3" >0.120000</td>
 
 
 
569
  </tr>
570
  <tr>
571
- <th id="T_a7af5_level0_row85" class="row_heading level0 row85" >bbh_tracking_shuffled_objects_three_objects.acc_norm</th>
572
- <td id="T_a7af5_row85_col0" class="data row85 col0" >0.340000</td>
573
- <td id="T_a7af5_row85_col1" class="data row85 col1" >0.320000</td>
574
- <td id="T_a7af5_row85_col2" class="data row85 col2" >0.332000</td>
575
- <td id="T_a7af5_row85_col3" class="data row85 col3" >0.304000</td>
 
 
 
576
  </tr>
577
  <tr>
578
- <th id="T_a7af5_level0_row88" class="row_heading level0 row88" >bbh_web_of_lies.acc_norm</th>
579
- <td id="T_a7af5_row88_col0" class="data row88 col0" >0.488000</td>
580
- <td id="T_a7af5_row88_col1" class="data row88 col1" >0.488000</td>
581
- <td id="T_a7af5_row88_col2" class="data row88 col2" >0.488000</td>
582
- <td id="T_a7af5_row88_col3" class="data row88 col3" >0.488000</td>
 
 
 
583
  </tr>
584
  <tr>
585
- <th id="T_a7af5_level0_row90" class="row_heading level0 row90" >gpqa.acc_norm</th>
586
- <td id="T_a7af5_row90_col0" class="data row90 col0" >0.296980</td>
587
- <td id="T_a7af5_row90_col1" class="data row90 col1" >0.337248</td>
588
- <td id="T_a7af5_row90_col2" class="data row90 col2" >0.255872</td>
589
- <td id="T_a7af5_row90_col3" class="data row90 col3" >0.279362</td>
 
 
 
590
  </tr>
591
  <tr>
592
- <th id="T_a7af5_level0_row94" class="row_heading level0 row94" >gpqa_diamond.acc_norm</th>
593
- <td id="T_a7af5_row94_col0" class="data row94 col0" >0.267677</td>
594
- <td id="T_a7af5_row94_col1" class="data row94 col1" >0.318182</td>
595
- <td id="T_a7af5_row94_col2" class="data row94 col2" >0.237374</td>
596
- <td id="T_a7af5_row94_col3" class="data row94 col3" >0.232323</td>
 
 
 
597
  </tr>
598
  <tr>
599
- <th id="T_a7af5_level0_row97" class="row_heading level0 row97" >gpqa_extended.acc_norm</th>
600
- <td id="T_a7af5_row97_col0" class="data row97 col0" >0.313187</td>
601
- <td id="T_a7af5_row97_col1" class="data row97 col1" >0.351648</td>
602
- <td id="T_a7af5_row97_col2" class="data row97 col2" >0.239927</td>
603
- <td id="T_a7af5_row97_col3" class="data row97 col3" >0.276557</td>
 
 
 
604
  </tr>
605
  <tr>
606
- <th id="T_a7af5_level0_row100" class="row_heading level0 row100" >gpqa_main.acc_norm</th>
607
- <td id="T_a7af5_row100_col0" class="data row100 col0" >0.290179</td>
608
- <td id="T_a7af5_row100_col1" class="data row100 col1" >0.328125</td>
609
- <td id="T_a7af5_row100_col2" class="data row100 col2" >0.283482</td>
610
- <td id="T_a7af5_row100_col3" class="data row100 col3" >0.303571</td>
 
 
 
611
  </tr>
612
  <tr>
613
- <th id="T_a7af5_level0_row103" class="row_heading level0 row103" >ifeval.prompt_level_strict_acc</th>
614
- <td id="T_a7af5_row103_col0" class="data row103 col0" >0.188540</td>
615
- <td id="T_a7af5_row103_col1" class="data row103 col1" >0.249538</td>
616
- <td id="T_a7af5_row103_col2" class="data row103 col2" >0.275416</td>
617
- <td id="T_a7af5_row103_col3" class="data row103 col3" >0.332717</td>
 
 
 
618
  </tr>
619
  <tr>
620
- <th id="T_a7af5_level0_row105" class="row_heading level0 row105" >ifeval.inst_level_strict_acc</th>
621
- <td id="T_a7af5_row105_col0" class="data row105 col0" >0.297362</td>
622
- <td id="T_a7af5_row105_col1" class="data row105 col1" >0.381295</td>
623
- <td id="T_a7af5_row105_col2" class="data row105 col2" >0.417266</td>
624
- <td id="T_a7af5_row105_col3" class="data row105 col3" >0.474820</td>
 
 
 
625
  </tr>
626
  <tr>
627
- <th id="T_a7af5_level0_row107" class="row_heading level0 row107" >ifeval.prompt_level_loose_acc</th>
628
- <td id="T_a7af5_row107_col0" class="data row107 col0" >0.214418</td>
629
- <td id="T_a7af5_row107_col1" class="data row107 col1" >0.314233</td>
630
- <td id="T_a7af5_row107_col2" class="data row107 col2" >0.280961</td>
631
- <td id="T_a7af5_row107_col3" class="data row107 col3" >0.340111</td>
 
 
 
632
  </tr>
633
  <tr>
634
- <th id="T_a7af5_level0_row109" class="row_heading level0 row109" >ifeval.inst_level_loose_acc</th>
635
- <td id="T_a7af5_row109_col0" class="data row109 col0" >0.323741</td>
636
- <td id="T_a7af5_row109_col1" class="data row109 col1" >0.437650</td>
637
- <td id="T_a7af5_row109_col2" class="data row109 col2" >0.423261</td>
638
- <td id="T_a7af5_row109_col3" class="data row109 col3" >0.484412</td>
 
 
 
639
  </tr>
640
  <tr>
641
- <th id="T_a7af5_level0_row111" class="row_heading level0 row111" >math_hard.exact_match</th>
642
- <td id="T_a7af5_row111_col0" class="data row111 col0" >0.258308</td>
643
- <td id="T_a7af5_row111_col1" class="data row111 col1" >0.388973</td>
644
- <td id="T_a7af5_row111_col2" class="data row111 col2" >0.000000</td>
645
- <td id="T_a7af5_row111_col3" class="data row111 col3" >0.000000</td>
 
 
 
646
  </tr>
647
  <tr>
648
- <th id="T_a7af5_level0_row115" class="row_heading level0 row115" >math_algebra_hard.exact_match</th>
649
- <td id="T_a7af5_row115_col0" class="data row115 col0" >0.501629</td>
650
- <td id="T_a7af5_row115_col1" class="data row115 col1" >0.618893</td>
651
- <td id="T_a7af5_row115_col2" class="data row115 col2" >0.000000</td>
652
- <td id="T_a7af5_row115_col3" class="data row115 col3" >0.000000</td>
 
 
 
653
  </tr>
654
  <tr>
655
- <th id="T_a7af5_level0_row118" class="row_heading level0 row118" >math_counting_and_prob_hard.exact_match</th>
656
- <td id="T_a7af5_row118_col0" class="data row118 col0" >0.162602</td>
657
- <td id="T_a7af5_row118_col1" class="data row118 col1" >0.349593</td>
658
- <td id="T_a7af5_row118_col2" class="data row118 col2" >0.000000</td>
659
- <td id="T_a7af5_row118_col3" class="data row118 col3" >0.000000</td>
 
 
 
660
  </tr>
661
  <tr>
662
- <th id="T_a7af5_level0_row121" class="row_heading level0 row121" >math_geometry_hard.exact_match</th>
663
- <td id="T_a7af5_row121_col0" class="data row121 col0" >0.121212</td>
664
- <td id="T_a7af5_row121_col1" class="data row121 col1" >0.250000</td>
665
- <td id="T_a7af5_row121_col2" class="data row121 col2" >0.000000</td>
666
- <td id="T_a7af5_row121_col3" class="data row121 col3" >0.000000</td>
 
 
 
667
  </tr>
668
  <tr>
669
- <th id="T_a7af5_level0_row124" class="row_heading level0 row124" >math_intermediate_algebra_hard.exact_match</th>
670
- <td id="T_a7af5_row124_col0" class="data row124 col0" >0.064286</td>
671
- <td id="T_a7af5_row124_col1" class="data row124 col1" >0.153571</td>
672
- <td id="T_a7af5_row124_col2" class="data row124 col2" >0.000000</td>
673
- <td id="T_a7af5_row124_col3" class="data row124 col3" >0.000000</td>
 
 
 
674
  </tr>
675
  <tr>
676
- <th id="T_a7af5_level0_row127" class="row_heading level0 row127" >math_num_theory_hard.exact_match</th>
677
- <td id="T_a7af5_row127_col0" class="data row127 col0" >0.266234</td>
678
- <td id="T_a7af5_row127_col1" class="data row127 col1" >0.415584</td>
679
- <td id="T_a7af5_row127_col2" class="data row127 col2" >0.000000</td>
680
- <td id="T_a7af5_row127_col3" class="data row127 col3" >0.000000</td>
 
 
 
681
  </tr>
682
  <tr>
683
- <th id="T_a7af5_level0_row130" class="row_heading level0 row130" >math_prealgebra_hard.exact_match</th>
684
- <td id="T_a7af5_row130_col0" class="data row130 col0" >0.430052</td>
685
- <td id="T_a7af5_row130_col1" class="data row130 col1" >0.647668</td>
686
- <td id="T_a7af5_row130_col2" class="data row130 col2" >0.000000</td>
687
- <td id="T_a7af5_row130_col3" class="data row130 col3" >0.000000</td>
 
 
 
688
  </tr>
689
  <tr>
690
- <th id="T_a7af5_level0_row133" class="row_heading level0 row133" >math_precalculus_hard.exact_match</th>
691
- <td id="T_a7af5_row133_col0" class="data row133 col0" >0.074074</td>
692
- <td id="T_a7af5_row133_col1" class="data row133 col1" >0.125926</td>
693
- <td id="T_a7af5_row133_col2" class="data row133 col2" >0.000000</td>
694
- <td id="T_a7af5_row133_col3" class="data row133 col3" >0.000000</td>
 
 
 
695
  </tr>
696
  <tr>
697
- <th id="T_a7af5_level0_row136" class="row_heading level0 row136" >mmlu_pro.acc</th>
698
- <td id="T_a7af5_row136_col0" class="data row136 col0" >0.192487</td>
699
- <td id="T_a7af5_row136_col1" class="data row136 col1" >0.297041</td>
700
- <td id="T_a7af5_row136_col2" class="data row136 col2" >0.118684</td>
701
- <td id="T_a7af5_row136_col3" class="data row136 col3" >0.232131</td>
 
 
 
702
  </tr>
703
  <tr>
704
- <th id="T_a7af5_level0_row138" class="row_heading level0 row138" >musr.acc_norm</th>
705
- <td id="T_a7af5_row138_col0" class="data row138 col0" >0.334656</td>
706
- <td id="T_a7af5_row138_col1" class="data row138 col1" >0.390212</td>
707
- <td id="T_a7af5_row138_col2" class="data row138 col2" >0.362434</td>
708
- <td id="T_a7af5_row138_col3" class="data row138 col3" >0.365079</td>
 
 
 
709
  </tr>
710
  <tr>
711
- <th id="T_a7af5_level0_row142" class="row_heading level0 row142" >musr_murder_mysteries.acc_norm</th>
712
- <td id="T_a7af5_row142_col0" class="data row142 col0" >0.504000</td>
713
- <td id="T_a7af5_row142_col1" class="data row142 col1" >0.520000</td>
714
- <td id="T_a7af5_row142_col2" class="data row142 col2" >0.492000</td>
715
- <td id="T_a7af5_row142_col3" class="data row142 col3" >0.492000</td>
 
 
 
716
  </tr>
717
  <tr>
718
- <th id="T_a7af5_level0_row145" class="row_heading level0 row145" >musr_object_placements.acc_norm</th>
719
- <td id="T_a7af5_row145_col0" class="data row145 col0" >0.238281</td>
720
- <td id="T_a7af5_row145_col1" class="data row145 col1" >0.281250</td>
721
- <td id="T_a7af5_row145_col2" class="data row145 col2" >0.234375</td>
722
- <td id="T_a7af5_row145_col3" class="data row145 col3" >0.214844</td>
 
 
 
723
  </tr>
724
  <tr>
725
- <th id="T_a7af5_level0_row148" class="row_heading level0 row148" >musr_team_allocation.acc_norm</th>
726
- <td id="T_a7af5_row148_col0" class="data row148 col0" >0.264000</td>
727
- <td id="T_a7af5_row148_col1" class="data row148 col1" >0.372000</td>
728
- <td id="T_a7af5_row148_col2" class="data row148 col2" >0.364000</td>
729
- <td id="T_a7af5_row148_col3" class="data row148 col3" >0.392000</td>
 
 
 
730
  </tr>
731
  </tbody>
732
  </table>
 
1
  <style type="text/css">
2
+ #T_7aef5 td {
3
  overflow-wrap: break-word;
4
  max-width: 1px;
5
  }
6
+ #T_7aef5 .col_heading {
7
+ width: 14.285714285714286%;
8
  }
9
+ #T_7aef5_row15_col0, #T_7aef5_row105_col2, #T_7aef5_row118_col1 {
10
  background-color: #f7cbe4;
11
  color: #000000;
12
  }
13
+ #T_7aef5_row15_col1, #T_7aef5_row31_col4 {
14
  background-color: #f9eff4;
15
  color: #000000;
16
  }
17
+ #T_7aef5_row15_col2, #T_7aef5_row34_col4 {
18
+ background-color: #fbe6f1;
19
+ color: #000000;
20
+ }
21
+ #T_7aef5_row15_col3, #T_7aef5_row58_col4, #T_7aef5_row85_col6, #T_7aef5_row109_col0 {
22
  background-color: #f4bfdf;
23
  color: #000000;
24
  }
25
+ #T_7aef5_row15_col4, #T_7aef5_row28_col3, #T_7aef5_row46_col3, #T_7aef5_row85_col0, #T_7aef5_row107_col4 {
26
  background-color: #f6c7e3;
27
  color: #000000;
28
  }
29
+ #T_7aef5_row15_col5, #T_7aef5_row15_col6, #T_7aef5_row31_col3, #T_7aef5_row40_col1, #T_7aef5_row52_col2 {
30
+ background-color: #f9eef4;
31
+ color: #000000;
32
+ }
33
+ #T_7aef5_row19_col0 {
34
  background-color: #a9d874;
35
  color: #000000;
36
  }
37
+ #T_7aef5_row19_col1 {
38
  background-color: #549825;
39
  color: #f1f1f1;
40
  }
41
+ #T_7aef5_row19_col2 {
42
+ background-color: #cdeaa7;
43
+ color: #000000;
44
+ }
45
+ #T_7aef5_row19_col3, #T_7aef5_row70_col0, #T_7aef5_row115_col0 {
46
  background-color: #f7f7f6;
47
  color: #000000;
48
  }
49
+ #T_7aef5_row19_col4, #T_7aef5_row88_col6 {
50
  background-color: #f1f6ea;
51
  color: #000000;
52
  }
53
+ #T_7aef5_row19_col5 {
54
+ background-color: #7fbc41;
55
+ color: #000000;
56
+ }
57
+ #T_7aef5_row19_col6 {
58
+ background-color: #8fc654;
59
+ color: #000000;
60
+ }
61
+ #T_7aef5_row22_col0, #T_7aef5_row52_col5 {
62
  background-color: #f5f7f3;
63
  color: #000000;
64
  }
65
+ #T_7aef5_row22_col1, #T_7aef5_row31_col5 {
66
  background-color: #edf6df;
67
  color: #000000;
68
  }
69
+ #T_7aef5_row22_col2 {
70
+ background-color: #ebf6db;
71
+ color: #000000;
72
+ }
73
+ #T_7aef5_row22_col3, #T_7aef5_row22_col4, #T_7aef5_row37_col3, #T_7aef5_row70_col1 {
74
  background-color: #f4f7f0;
75
  color: #000000;
76
  }
77
+ #T_7aef5_row22_col5 {
78
+ background-color: #eaf5d9;
79
+ color: #000000;
80
+ }
81
+ #T_7aef5_row22_col6, #T_7aef5_row37_col5 {
82
+ background-color: #d8efb9;
83
+ color: #000000;
84
+ }
85
+ #T_7aef5_row25_col0, #T_7aef5_row100_col3, #T_7aef5_row145_col1 {
86
  background-color: #eeabd2;
87
  color: #000000;
88
  }
89
+ #T_7aef5_row25_col1, #T_7aef5_row49_col6 {
90
  background-color: #eff6e4;
91
  color: #000000;
92
  }
93
+ #T_7aef5_row25_col2 {
94
+ background-color: #fce3f0;
95
+ color: #000000;
96
+ }
97
+ #T_7aef5_row25_col3, #T_7aef5_row40_col3 {
98
  background-color: #df7cb1;
99
  color: #f1f1f1;
100
  }
101
+ #T_7aef5_row25_col4 {
102
  background-color: #fbd9ec;
103
  color: #000000;
104
  }
105
+ #T_7aef5_row25_col5, #T_7aef5_row70_col4, #T_7aef5_row73_col4 {
106
+ background-color: #f8f2f5;
107
+ color: #000000;
108
+ }
109
+ #T_7aef5_row25_col6, #T_7aef5_row61_col6 {
110
+ background-color: #faecf3;
111
+ color: #000000;
112
+ }
113
+ #T_7aef5_row28_col0, #T_7aef5_row130_col0 {
114
  background-color: #fbe7f2;
115
  color: #000000;
116
  }
117
+ #T_7aef5_row28_col1, #T_7aef5_row73_col1 {
118
  background-color: #ecf6de;
119
  color: #000000;
120
  }
121
+ #T_7aef5_row28_col2, #T_7aef5_row64_col6, #T_7aef5_row70_col6 {
122
+ background-color: #e2f3ca;
123
+ color: #000000;
124
+ }
125
+ #T_7aef5_row28_col4 {
126
  background-color: #f2badc;
127
  color: #000000;
128
  }
129
+ #T_7aef5_row28_col5, #T_7aef5_row73_col3, #T_7aef5_row142_col2 {
130
+ background-color: #f3f6ed;
131
+ color: #000000;
132
+ }
133
+ #T_7aef5_row28_col6, #T_7aef5_row100_col0, #T_7aef5_row145_col2 {
134
+ background-color: #efb0d6;
135
+ color: #000000;
136
+ }
137
+ #T_7aef5_row31_col0, #T_7aef5_row105_col4 {
138
  background-color: #f9f1f5;
139
  color: #000000;
140
  }
141
+ #T_7aef5_row31_col1, #T_7aef5_row37_col1 {
142
  background-color: #e8f5d5;
143
  color: #000000;
144
  }
145
+ #T_7aef5_row31_col2, #T_7aef5_row70_col3, #T_7aef5_row73_col0, #T_7aef5_row142_col5 {
146
+ background-color: #f0f6e7;
147
+ color: #000000;
148
+ }
149
+ #T_7aef5_row31_col6, #T_7aef5_row37_col4, #T_7aef5_row46_col4, #T_7aef5_row52_col4 {
150
+ background-color: #f8f3f6;
151
  color: #000000;
152
  }
153
+ #T_7aef5_row34_col0, #T_7aef5_row43_col0, #T_7aef5_row124_col1 {
154
  background-color: #d24c97;
155
  color: #f1f1f1;
156
  }
157
+ #T_7aef5_row34_col1 {
158
  background-color: #faeaf2;
159
  color: #000000;
160
  }
161
+ #T_7aef5_row34_col2, #T_7aef5_row55_col1 {
162
+ background-color: #fad6ea;
163
+ color: #000000;
164
+ }
165
+ #T_7aef5_row34_col3, #T_7aef5_row67_col4 {
166
  background-color: #e283b7;
167
  color: #f1f1f1;
168
  }
169
+ #T_7aef5_row34_col5, #T_7aef5_row55_col0, #T_7aef5_row90_col1 {
170
+ background-color: #f5c6e2;
171
  color: #000000;
172
  }
173
+ #T_7aef5_row34_col6, #T_7aef5_row109_col2, #T_7aef5_row148_col2 {
174
+ background-color: #f9d3e8;
175
+ color: #000000;
176
+ }
177
+ #T_7aef5_row37_col0, #T_7aef5_row49_col0, #T_7aef5_row55_col5, #T_7aef5_row88_col2 {
178
  background-color: #f7f6f7;
179
  color: #000000;
180
  }
181
+ #T_7aef5_row37_col2, #T_7aef5_row46_col6, #T_7aef5_row52_col0, #T_7aef5_row64_col5, #T_7aef5_row142_col0, #T_7aef5_row142_col6 {
182
+ background-color: #f6f7f5;
183
  color: #000000;
184
  }
185
+ #T_7aef5_row37_col6 {
186
+ background-color: #c6e79c;
187
+ color: #000000;
188
+ }
189
+ #T_7aef5_row40_col0, #T_7aef5_row103_col2, #T_7aef5_row145_col6 {
190
  background-color: #e388ba;
191
  color: #f1f1f1;
192
  }
193
+ #T_7aef5_row40_col2, #T_7aef5_row76_col3, #T_7aef5_row97_col3, #T_7aef5_row145_col0 {
194
+ background-color: #e590bf;
195
+ color: #f1f1f1;
196
+ }
197
+ #T_7aef5_row40_col4 {
198
  background-color: #eeadd4;
199
  color: #000000;
200
  }
201
+ #T_7aef5_row40_col5, #T_7aef5_row55_col3 {
202
+ background-color: #f8cee6;
203
+ color: #000000;
204
+ }
205
+ #T_7aef5_row40_col6 {
206
+ background-color: #fbe8f2;
207
+ color: #000000;
208
+ }
209
+ #T_7aef5_row43_col1 {
210
  background-color: #f6c9e3;
211
  color: #000000;
212
  }
213
+ #T_7aef5_row43_col2, #T_7aef5_row61_col4, #T_7aef5_row82_col2 {
214
+ background-color: #d861a2;
215
+ color: #f1f1f1;
216
+ }
217
+ #T_7aef5_row43_col3, #T_7aef5_row67_col3, #T_7aef5_row79_col1, #T_7aef5_row82_col6 {
218
  background-color: #cf4191;
219
  color: #f1f1f1;
220
  }
221
+ #T_7aef5_row43_col4, #T_7aef5_row79_col2, #T_7aef5_row79_col5 {
222
  background-color: #d34f99;
223
  color: #f1f1f1;
224
  }
225
+ #T_7aef5_row43_col5, #T_7aef5_row130_col6, #T_7aef5_row136_col5, #T_7aef5_row148_col6 {
226
+ background-color: #f4c1df;
227
+ color: #000000;
228
+ }
229
+ #T_7aef5_row43_col6, #T_7aef5_row58_col6, #T_7aef5_row111_col1, #T_7aef5_row138_col1 {
230
+ background-color: #fcdbed;
231
+ color: #000000;
232
+ }
233
+ #T_7aef5_row46_col0 {
234
  background-color: #fde2f0;
235
  color: #000000;
236
  }
237
+ #T_7aef5_row46_col1 {
238
  background-color: #bbe28a;
239
  color: #000000;
240
  }
241
+ #T_7aef5_row46_col2 {
242
+ background-color: #fde0ef;
243
+ color: #000000;
244
+ }
245
+ #T_7aef5_row46_col5 {
246
+ background-color: #f5f7f2;
247
+ color: #000000;
248
+ }
249
+ #T_7aef5_row49_col1 {
250
  background-color: #e9f5d6;
251
  color: #000000;
252
  }
253
+ #T_7aef5_row49_col2 {
254
+ background-color: #81bd44;
255
+ color: #000000;
256
+ }
257
+ #T_7aef5_row49_col3, #T_7aef5_row67_col1, #T_7aef5_row148_col0 {
258
  background-color: #ea9fca;
259
  color: #000000;
260
  }
261
+ #T_7aef5_row49_col4, #T_7aef5_row90_col5, #T_7aef5_row97_col6 {
262
  background-color: #f0b2d7;
263
  color: #000000;
264
  }
265
+ #T_7aef5_row49_col5 {
266
+ background-color: #83bf46;
267
  color: #000000;
268
  }
269
+ #T_7aef5_row52_col1 {
270
  background-color: #c0e593;
271
  color: #000000;
272
  }
273
+ #T_7aef5_row52_col3, #T_7aef5_row58_col1, #T_7aef5_row58_col2, #T_7aef5_row64_col1, #T_7aef5_row109_col3 {
274
  background-color: #fce5f1;
275
  color: #000000;
276
  }
277
+ #T_7aef5_row52_col6 {
278
+ background-color: #e1f3c7;
279
  color: #000000;
280
  }
281
+ #T_7aef5_row55_col2, #T_7aef5_row97_col0, #T_7aef5_row107_col1 {
282
+ background-color: #f3bcdd;
 
 
 
 
283
  color: #000000;
284
  }
285
+ #T_7aef5_row55_col4 {
286
  background-color: #c82884;
287
  color: #f1f1f1;
288
  }
289
+ #T_7aef5_row55_col6, #T_7aef5_row138_col3 {
290
+ background-color: #f8d0e7;
291
+ color: #000000;
292
+ }
293
+ #T_7aef5_row58_col0, #T_7aef5_row121_col1 {
294
  background-color: #e897c4;
295
  color: #000000;
296
  }
297
+ #T_7aef5_row58_col3, #T_7aef5_row79_col0, #T_7aef5_row107_col0 {
298
  background-color: #e07eb3;
299
  color: #f1f1f1;
300
  }
301
+ #T_7aef5_row58_col5, #T_7aef5_row64_col2, #T_7aef5_row109_col1 {
302
+ background-color: #fbe9f2;
303
+ color: #000000;
304
+ }
305
+ #T_7aef5_row61_col0, #T_7aef5_row67_col2, #T_7aef5_row100_col2, #T_7aef5_row100_col6 {
306
  background-color: #eba3cd;
307
  color: #000000;
308
  }
309
+ #T_7aef5_row61_col1, #T_7aef5_row88_col0, #T_7aef5_row88_col1, #T_7aef5_row88_col3, #T_7aef5_row88_col4, #T_7aef5_row88_col5, #T_7aef5_row109_col4 {
310
  background-color: #f8f4f6;
311
  color: #000000;
312
  }
313
+ #T_7aef5_row61_col2, #T_7aef5_row97_col5, #T_7aef5_row100_col5 {
314
+ background-color: #f2b8db;
315
+ color: #000000;
316
+ }
317
+ #T_7aef5_row61_col3, #T_7aef5_row136_col0 {
318
  background-color: #dc70aa;
319
  color: #f1f1f1;
320
  }
321
+ #T_7aef5_row61_col5, #T_7aef5_row67_col6, #T_7aef5_row148_col1 {
322
+ background-color: #fad4e9;
323
+ color: #000000;
324
  }
325
+ #T_7aef5_row64_col0, #T_7aef5_row145_col4 {
326
  background-color: #e181b5;
327
  color: #f1f1f1;
328
  }
329
+ #T_7aef5_row64_col3, #T_7aef5_row94_col4, #T_7aef5_row136_col4 {
330
  background-color: #e48bbc;
331
  color: #f1f1f1;
332
  }
333
+ #T_7aef5_row64_col4, #T_7aef5_row138_col4, #T_7aef5_row148_col3 {
334
  background-color: #f9d1e8;
335
  color: #000000;
336
  }
337
+ #T_7aef5_row67_col0, #T_7aef5_row76_col1, #T_7aef5_row118_col6 {
338
  background-color: #cc368b;
339
  color: #f1f1f1;
340
  }
341
+ #T_7aef5_row67_col5 {
342
+ background-color: #fddeee;
343
  color: #000000;
344
  }
345
+ #T_7aef5_row70_col2 {
346
+ background-color: #cbe9a4;
347
  color: #000000;
348
  }
349
+ #T_7aef5_row70_col5 {
350
+ background-color: #d6eeb6;
351
+ color: #000000;
352
+ }
353
+ #T_7aef5_row73_col2 {
354
+ background-color: #a1d26a;
355
+ color: #000000;
356
+ }
357
+ #T_7aef5_row73_col5 {
358
+ background-color: #9acd61;
359
+ color: #000000;
360
+ }
361
+ #T_7aef5_row73_col6 {
362
+ background-color: #b2dd7f;
363
  color: #000000;
364
  }
365
+ #T_7aef5_row76_col0, #T_7aef5_row133_col0 {
366
  background-color: #b51370;
367
  color: #f1f1f1;
368
  }
369
+ #T_7aef5_row76_col2, #T_7aef5_row90_col3, #T_7aef5_row94_col6, #T_7aef5_row148_col5 {
370
+ background-color: #e89ac6;
371
+ color: #000000;
372
  }
373
+ #T_7aef5_row76_col4, #T_7aef5_row94_col5, #T_7aef5_row107_col2 {
374
  background-color: #e692c1;
375
  color: #000000;
376
  }
377
+ #T_7aef5_row76_col5, #T_7aef5_row127_col6 {
378
+ background-color: #c2197a;
379
+ color: #f1f1f1;
380
+ }
381
+ #T_7aef5_row76_col6, #T_7aef5_row111_col0 {
382
+ background-color: #e99cc8;
383
+ color: #000000;
384
+ }
385
+ #T_7aef5_row79_col3 {
386
  background-color: #e286b8;
387
  color: #f1f1f1;
388
  }
389
+ #T_7aef5_row79_col4 {
390
  background-color: #c72482;
391
  color: #f1f1f1;
392
  }
393
+ #T_7aef5_row79_col6, #T_7aef5_row82_col3 {
394
+ background-color: #d14895;
395
+ color: #f1f1f1;
396
+ }
397
+ #T_7aef5_row82_col0, #T_7aef5_row111_col6 {
398
  background-color: #d65a9f;
399
  color: #f1f1f1;
400
  }
401
+ #T_7aef5_row82_col1, #T_7aef5_row82_col4, #T_7aef5_row136_col3 {
402
  background-color: #c92b86;
403
  color: #f1f1f1;
404
  }
405
+ #T_7aef5_row82_col5, #T_7aef5_row121_col0 {
406
+ background-color: #ca2f88;
407
  color: #f1f1f1;
408
  }
409
+ #T_7aef5_row85_col1, #T_7aef5_row94_col1, #T_7aef5_row136_col6 {
410
  background-color: #f3bdde;
411
  color: #000000;
412
  }
413
+ #T_7aef5_row85_col2, #T_7aef5_row97_col1, #T_7aef5_row115_col6, #T_7aef5_row138_col6, #T_7aef5_row145_col5 {
414
+ background-color: #f7cce5;
415
+ color: #000000;
416
+ }
417
+ #T_7aef5_row85_col3, #T_7aef5_row85_col5, #T_7aef5_row100_col1 {
418
  background-color: #f5c2e0;
419
  color: #000000;
420
  }
421
+ #T_7aef5_row85_col4, #T_7aef5_row100_col4 {
422
  background-color: #f1b7da;
423
  color: #000000;
424
  }
425
+ #T_7aef5_row90_col0, #T_7aef5_row105_col0, #T_7aef5_row136_col1, #T_7aef5_row136_col2 {
426
  background-color: #f1b5d9;
427
  color: #000000;
428
  }
429
+ #T_7aef5_row90_col2, #T_7aef5_row94_col0, #T_7aef5_row94_col2, #T_7aef5_row97_col2, #T_7aef5_row127_col0 {
430
+ background-color: #eba1cb;
431
  color: #000000;
432
  }
433
+ #T_7aef5_row90_col4, #T_7aef5_row90_col6, #T_7aef5_row107_col3 {
434
  background-color: #eda8d1;
435
  color: #000000;
436
  }
437
+ #T_7aef5_row94_col3, #T_7aef5_row145_col3 {
 
 
 
 
438
  background-color: #e58dbe;
439
  color: #f1f1f1;
440
  }
441
+ #T_7aef5_row97_col4, #T_7aef5_row103_col3 {
 
 
 
 
 
 
 
 
442
  background-color: #eca6cf;
443
  color: #000000;
444
  }
445
+ #T_7aef5_row103_col0 {
 
 
 
 
446
  background-color: #db6ca8;
447
  color: #f1f1f1;
448
  }
449
+ #T_7aef5_row103_col1 {
450
  background-color: #e795c3;
451
  color: #000000;
452
  }
453
+ #T_7aef5_row103_col4, #T_7aef5_row138_col0 {
454
  background-color: #f5c4e1;
455
  color: #000000;
456
  }
457
+ #T_7aef5_row103_col5 {
458
+ background-color: #b91574;
459
+ color: #f1f1f1;
460
+ }
461
+ #T_7aef5_row103_col6 {
462
+ background-color: #b9e187;
463
+ color: #000000;
464
+ }
465
+ #T_7aef5_row105_col1, #T_7aef5_row138_col5 {
466
  background-color: #fbd8eb;
467
  color: #000000;
468
  }
469
+ #T_7aef5_row105_col3, #T_7aef5_row127_col1 {
470
  background-color: #fce4f0;
471
  color: #000000;
472
  }
473
+ #T_7aef5_row105_col5 {
474
+ background-color: #d75ea1;
475
+ color: #f1f1f1;
476
  }
477
+ #T_7aef5_row105_col6 {
478
+ background-color: #88c24c;
479
  color: #000000;
480
  }
481
+ #T_7aef5_row107_col5 {
482
+ background-color: #c01879;
483
+ color: #f1f1f1;
484
+ }
485
+ #T_7aef5_row107_col6 {
486
+ background-color: #95cb5c;
487
  color: #000000;
488
  }
489
+ #T_7aef5_row109_col5 {
490
+ background-color: #d965a4;
491
+ color: #f1f1f1;
492
+ }
493
+ #T_7aef5_row109_col6 {
494
+ background-color: #71b038;
495
+ color: #f1f1f1;
496
+ }
497
+ #T_7aef5_row111_col2, #T_7aef5_row118_col5 {
498
+ background-color: #970559;
499
+ color: #f1f1f1;
500
+ }
501
+ #T_7aef5_row111_col3, #T_7aef5_row111_col4, #T_7aef5_row115_col3, #T_7aef5_row115_col4, #T_7aef5_row118_col3, #T_7aef5_row118_col4, #T_7aef5_row121_col3, #T_7aef5_row121_col4, #T_7aef5_row124_col3, #T_7aef5_row124_col4, #T_7aef5_row127_col3, #T_7aef5_row127_col4, #T_7aef5_row130_col3, #T_7aef5_row130_col4, #T_7aef5_row133_col3, #T_7aef5_row133_col4 {
502
  background-color: #8e0152;
503
  color: #f1f1f1;
504
  }
505
+ #T_7aef5_row111_col5 {
506
+ background-color: #aa0e68;
507
+ color: #f1f1f1;
508
+ }
509
+ #T_7aef5_row115_col1 {
510
  background-color: #ddf1c1;
511
  color: #000000;
512
  }
513
+ #T_7aef5_row115_col2, #T_7aef5_row118_col2 {
514
+ background-color: #9b075c;
515
+ color: #f1f1f1;
516
+ }
517
+ #T_7aef5_row115_col5 {
518
+ background-color: #c41a7c;
519
+ color: #f1f1f1;
520
+ }
521
+ #T_7aef5_row118_col0 {
522
  background-color: #d4539b;
523
  color: #f1f1f1;
524
  }
525
+ #T_7aef5_row121_col2, #T_7aef5_row124_col6 {
526
+ background-color: #9d085e;
527
+ color: #f1f1f1;
528
+ }
529
+ #T_7aef5_row121_col5, #T_7aef5_row133_col5 {
530
+ background-color: #99065a;
531
  color: #f1f1f1;
532
  }
533
+ #T_7aef5_row121_col6 {
534
+ background-color: #bb1675;
535
+ color: #f1f1f1;
536
+ }
537
+ #T_7aef5_row124_col0 {
538
  background-color: #b1116d;
539
  color: #f1f1f1;
540
  }
541
+ #T_7aef5_row124_col2, #T_7aef5_row124_col5, #T_7aef5_row130_col2, #T_7aef5_row133_col2 {
542
+ background-color: #900254;
543
+ color: #f1f1f1;
544
+ }
545
+ #T_7aef5_row127_col2 {
546
+ background-color: #940457;
547
+ color: #f1f1f1;
548
+ }
549
+ #T_7aef5_row127_col5, #T_7aef5_row133_col6 {
550
+ background-color: #a60c65;
551
+ color: #f1f1f1;
552
+ }
553
+ #T_7aef5_row130_col1 {
554
  background-color: #d0ecad;
555
  color: #000000;
556
  }
557
+ #T_7aef5_row130_col5 {
558
+ background-color: #c51d7e;
559
+ color: #f1f1f1;
560
+ }
561
+ #T_7aef5_row133_col1 {
562
  background-color: #cb3289;
563
  color: #f1f1f1;
564
  }
565
+ #T_7aef5_row138_col2, #T_7aef5_row148_col4 {
566
+ background-color: #fcdded;
567
  color: #000000;
568
  }
569
+ #T_7aef5_row142_col1 {
570
  background-color: #f3f7ef;
571
  color: #000000;
572
  }
573
+ #T_7aef5_row142_col3, #T_7aef5_row142_col4 {
574
  background-color: #f8f5f6;
575
  color: #000000;
576
  }
 
 
 
 
 
 
 
 
577
  </style>
578
+ <table id="T_7aef5">
579
  <thead>
580
  <tr>
581
  <th class="blank level0" >&nbsp;</th>
582
+ <th id="T_7aef5_level0_col0" class="col_heading level0 col0" >Spestly/Atlas-Pro-1.5B-Preview</th>
583
+ <th id="T_7aef5_level0_col1" class="col_heading level0 col1" >Spestly/Atlas-Pro-7B-Preview</th>
584
+ <th id="T_7aef5_level0_col2" class="col_heading level0 col2" >01-ai/Yi-6B</th>
585
+ <th id="T_7aef5_level0_col3" class="col_heading level0 col3" >deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B</th>
586
+ <th id="T_7aef5_level0_col4" class="col_heading level0 col4" >deepseek-ai/DeepSeek-R1-Distill-Qwen-7B</th>
587
+ <th id="T_7aef5_level0_col5" class="col_heading level0 col5" >meta-llama/Meta-Llama-3.1-8B</th>
588
+ <th id="T_7aef5_level0_col6" class="col_heading level0 col6" >meta-llama/Llama-3.2-3B-Instruct</th>
589
  </tr>
590
  </thead>
591
  <tbody>
592
  <tr>
593
+ <th id="T_7aef5_level0_row15" class="row_heading level0 row15" >bbh.acc_norm</th>
594
+ <td id="T_7aef5_row15_col0" class="data row15 col0" >0.348030</td>
595
+ <td id="T_7aef5_row15_col1" class="data row15 col1" >0.465891</td>
596
+ <td id="T_7aef5_row15_col2" class="data row15 col2" >0.426662</td>
597
+ <td id="T_7aef5_row15_col3" class="data row15 col3" >0.321298</td>
598
+ <td id="T_7aef5_row15_col4" class="data row15 col4" >0.341087</td>
599
+ <td id="T_7aef5_row15_col5" class="data row15 col5" >0.463808</td>
600
+ <td id="T_7aef5_row15_col6" class="data row15 col6" >0.458601</td>
601
  </tr>
602
  <tr>
603
+ <th id="T_7aef5_level0_row19" class="row_heading level0 row19" >bbh_boolean_expressions.acc_norm</th>
604
+ <td id="T_7aef5_row19_col0" class="data row19 col0" >0.724000</td>
605
+ <td id="T_7aef5_row19_col1" class="data row19 col1" >0.884000</td>
606
+ <td id="T_7aef5_row19_col2" class="data row19 col2" >0.656000</td>
607
+ <td id="T_7aef5_row19_col3" class="data row19 col3" >0.500000</td>
608
+ <td id="T_7aef5_row19_col4" class="data row19 col4" >0.532000</td>
609
+ <td id="T_7aef5_row19_col5" class="data row19 col5" >0.800000</td>
610
+ <td id="T_7aef5_row19_col6" class="data row19 col6" >0.772000</td>
611
  </tr>
612
  <tr>
613
+ <th id="T_7aef5_level0_row22" class="row_heading level0 row22" >bbh_causal_judgement.acc_norm</th>
614
+ <td id="T_7aef5_row22_col0" class="data row22 col0" >0.508021</td>
615
+ <td id="T_7aef5_row22_col1" class="data row22 col1" >0.561497</td>
616
+ <td id="T_7aef5_row22_col2" class="data row22 col2" >0.572193</td>
617
+ <td id="T_7aef5_row22_col3" class="data row22 col3" >0.518717</td>
618
+ <td id="T_7aef5_row22_col4" class="data row22 col4" >0.518717</td>
619
+ <td id="T_7aef5_row22_col5" class="data row22 col5" >0.577540</td>
620
+ <td id="T_7aef5_row22_col6" class="data row22 col6" >0.631016</td>
621
  </tr>
622
  <tr>
623
+ <th id="T_7aef5_level0_row25" class="row_heading level0 row25" >bbh_date_understanding.acc_norm</th>
624
+ <td id="T_7aef5_row25_col0" class="data row25 col0" >0.284000</td>
625
+ <td id="T_7aef5_row25_col1" class="data row25 col1" >0.548000</td>
626
+ <td id="T_7aef5_row25_col2" class="data row25 col2" >0.412000</td>
627
+ <td id="T_7aef5_row25_col3" class="data row25 col3" >0.208000</td>
628
+ <td id="T_7aef5_row25_col4" class="data row25 col4" >0.384000</td>
629
+ <td id="T_7aef5_row25_col5" class="data row25 col5" >0.480000</td>
630
+ <td id="T_7aef5_row25_col6" class="data row25 col6" >0.452000</td>
631
  </tr>
632
  <tr>
633
+ <th id="T_7aef5_level0_row28" class="row_heading level0 row28" >bbh_disambiguation_qa.acc_norm</th>
634
+ <td id="T_7aef5_row28_col0" class="data row28 col0" >0.432000</td>
635
+ <td id="T_7aef5_row28_col1" class="data row28 col1" >0.564000</td>
636
+ <td id="T_7aef5_row28_col2" class="data row28 col2" >0.608000</td>
637
+ <td id="T_7aef5_row28_col3" class="data row28 col3" >0.340000</td>
638
+ <td id="T_7aef5_row28_col4" class="data row28 col4" >0.312000</td>
639
+ <td id="T_7aef5_row28_col5" class="data row28 col5" >0.524000</td>
640
+ <td id="T_7aef5_row28_col6" class="data row28 col6" >0.292000</td>
641
  </tr>
642
  <tr>
643
+ <th id="T_7aef5_level0_row31" class="row_heading level0 row31" >bbh_formal_fallacies.acc_norm</th>
644
+ <td id="T_7aef5_row31_col0" class="data row31 col0" >0.476000</td>
645
+ <td id="T_7aef5_row31_col1" class="data row31 col1" >0.588000</td>
646
+ <td id="T_7aef5_row31_col2" class="data row31 col2" >0.540000</td>
647
+ <td id="T_7aef5_row31_col3" class="data row31 col3" >0.464000</td>
648
+ <td id="T_7aef5_row31_col4" class="data row31 col4" >0.468000</td>
649
+ <td id="T_7aef5_row31_col5" class="data row31 col5" >0.560000</td>
650
+ <td id="T_7aef5_row31_col6" class="data row31 col6" >0.484000</td>
651
  </tr>
652
  <tr>
653
+ <th id="T_7aef5_level0_row34" class="row_heading level0 row34" >bbh_geometric_shapes.acc_norm</th>
654
+ <td id="T_7aef5_row34_col0" class="data row34 col0" >0.156000</td>
655
+ <td id="T_7aef5_row34_col1" class="data row34 col1" >0.444000</td>
656
+ <td id="T_7aef5_row34_col2" class="data row34 col2" >0.376000</td>
657
+ <td id="T_7aef5_row34_col3" class="data row34 col3" >0.220000</td>
658
+ <td id="T_7aef5_row34_col4" class="data row34 col4" >0.428000</td>
659
+ <td id="T_7aef5_row34_col5" class="data row34 col5" >0.336000</td>
660
+ <td id="T_7aef5_row34_col6" class="data row34 col6" >0.368000</td>
661
  </tr>
662
  <tr>
663
+ <th id="T_7aef5_level0_row37" class="row_heading level0 row37" >bbh_hyperbaton.acc_norm</th>
664
+ <td id="T_7aef5_row37_col0" class="data row37 col0" >0.496000</td>
665
+ <td id="T_7aef5_row37_col1" class="data row37 col1" >0.588000</td>
666
+ <td id="T_7aef5_row37_col2" class="data row37 col2" >0.504000</td>
667
+ <td id="T_7aef5_row37_col3" class="data row37 col3" >0.516000</td>
668
+ <td id="T_7aef5_row37_col4" class="data row37 col4" >0.484000</td>
669
+ <td id="T_7aef5_row37_col5" class="data row37 col5" >0.632000</td>
670
+ <td id="T_7aef5_row37_col6" class="data row37 col6" >0.668000</td>
671
  </tr>
672
  <tr>
673
+ <th id="T_7aef5_level0_row40" class="row_heading level0 row40" >bbh_logical_deduction_five_objects.acc_norm</th>
674
+ <td id="T_7aef5_row40_col0" class="data row40 col0" >0.228000</td>
675
+ <td id="T_7aef5_row40_col1" class="data row40 col1" >0.464000</td>
676
+ <td id="T_7aef5_row40_col2" class="data row40 col2" >0.240000</td>
677
+ <td id="T_7aef5_row40_col3" class="data row40 col3" >0.208000</td>
678
+ <td id="T_7aef5_row40_col4" class="data row40 col4" >0.288000</td>
679
+ <td id="T_7aef5_row40_col5" class="data row40 col5" >0.356000</td>
680
+ <td id="T_7aef5_row40_col6" class="data row40 col6" >0.436000</td>
681
  </tr>
682
  <tr>
683
+ <th id="T_7aef5_level0_row43" class="row_heading level0 row43" >bbh_logical_deduction_seven_objects.acc_norm</th>
684
+ <td id="T_7aef5_row43_col0" class="data row43 col0" >0.156000</td>
685
+ <td id="T_7aef5_row43_col1" class="data row43 col1" >0.344000</td>
686
+ <td id="T_7aef5_row43_col2" class="data row43 col2" >0.176000</td>
687
+ <td id="T_7aef5_row43_col3" class="data row43 col3" >0.144000</td>
688
+ <td id="T_7aef5_row43_col4" class="data row43 col4" >0.160000</td>
689
+ <td id="T_7aef5_row43_col5" class="data row43 col5" >0.328000</td>
690
+ <td id="T_7aef5_row43_col6" class="data row43 col6" >0.388000</td>
691
  </tr>
692
  <tr>
693
+ <th id="T_7aef5_level0_row46" class="row_heading level0 row46" >bbh_logical_deduction_three_objects.acc_norm</th>
694
+ <td id="T_7aef5_row46_col0" class="data row46 col0" >0.408000</td>
695
+ <td id="T_7aef5_row46_col1" class="data row46 col1" >0.692000</td>
696
+ <td id="T_7aef5_row46_col2" class="data row46 col2" >0.400000</td>
697
+ <td id="T_7aef5_row46_col3" class="data row46 col3" >0.340000</td>
698
+ <td id="T_7aef5_row46_col4" class="data row46 col4" >0.484000</td>
699
+ <td id="T_7aef5_row46_col5" class="data row46 col5" >0.512000</td>
700
+ <td id="T_7aef5_row46_col6" class="data row46 col6" >0.504000</td>
701
  </tr>
702
  <tr>
703
+ <th id="T_7aef5_level0_row49" class="row_heading level0 row49" >bbh_movie_recommendation.acc_norm</th>
704
+ <td id="T_7aef5_row49_col0" class="data row49 col0" >0.496000</td>
705
+ <td id="T_7aef5_row49_col1" class="data row49 col1" >0.584000</td>
706
+ <td id="T_7aef5_row49_col2" class="data row49 col2" >0.796000</td>
707
+ <td id="T_7aef5_row49_col3" class="data row49 col3" >0.264000</td>
708
+ <td id="T_7aef5_row49_col4" class="data row49 col4" >0.296000</td>
709
+ <td id="T_7aef5_row49_col5" class="data row49 col5" >0.792000</td>
710
+ <td id="T_7aef5_row49_col6" class="data row49 col6" >0.548000</td>
711
  </tr>
712
  <tr>
713
+ <th id="T_7aef5_level0_row52" class="row_heading level0 row52" >bbh_navigate.acc_norm</th>
714
+ <td id="T_7aef5_row52_col0" class="data row52 col0" >0.504000</td>
715
+ <td id="T_7aef5_row52_col1" class="data row52 col1" >0.680000</td>
716
+ <td id="T_7aef5_row52_col2" class="data row52 col2" >0.464000</td>
717
+ <td id="T_7aef5_row52_col3" class="data row52 col3" >0.420000</td>
718
+ <td id="T_7aef5_row52_col4" class="data row52 col4" >0.484000</td>
719
+ <td id="T_7aef5_row52_col5" class="data row52 col5" >0.508000</td>
720
+ <td id="T_7aef5_row52_col6" class="data row52 col6" >0.612000</td>
721
  </tr>
722
  <tr>
723
+ <th id="T_7aef5_level0_row55" class="row_heading level0 row55" >bbh_object_counting.acc_norm</th>
724
+ <td id="T_7aef5_row55_col0" class="data row55 col0" >0.336000</td>
725
+ <td id="T_7aef5_row55_col1" class="data row55 col1" >0.376000</td>
726
+ <td id="T_7aef5_row55_col2" class="data row55 col2" >0.316000</td>
727
+ <td id="T_7aef5_row55_col3" class="data row55 col3" >0.356000</td>
728
+ <td id="T_7aef5_row55_col4" class="data row55 col4" >0.116000</td>
729
+ <td id="T_7aef5_row55_col5" class="data row55 col5" >0.496000</td>
730
+ <td id="T_7aef5_row55_col6" class="data row55 col6" >0.360000</td>
731
  </tr>
732
  <tr>
733
+ <th id="T_7aef5_level0_row58" class="row_heading level0 row58" >bbh_penguins_in_a_table.acc_norm</th>
734
+ <td id="T_7aef5_row58_col0" class="data row58 col0" >0.253425</td>
735
+ <td id="T_7aef5_row58_col1" class="data row58 col1" >0.424658</td>
736
+ <td id="T_7aef5_row58_col2" class="data row58 col2" >0.424658</td>
737
+ <td id="T_7aef5_row58_col3" class="data row58 col3" >0.212329</td>
738
+ <td id="T_7aef5_row58_col4" class="data row58 col4" >0.321918</td>
739
+ <td id="T_7aef5_row58_col5" class="data row58 col5" >0.438356</td>
740
+ <td id="T_7aef5_row58_col6" class="data row58 col6" >0.390411</td>
741
  </tr>
742
  <tr>
743
+ <th id="T_7aef5_level0_row61" class="row_heading level0 row61" >bbh_reasoning_about_colored_objects.acc_norm</th>
744
+ <td id="T_7aef5_row61_col0" class="data row61 col0" >0.272000</td>
745
+ <td id="T_7aef5_row61_col1" class="data row61 col1" >0.488000</td>
746
+ <td id="T_7aef5_row61_col2" class="data row61 col2" >0.308000</td>
747
+ <td id="T_7aef5_row61_col3" class="data row61 col3" >0.192000</td>
748
+ <td id="T_7aef5_row61_col4" class="data row61 col4" >0.176000</td>
749
+ <td id="T_7aef5_row61_col5" class="data row61 col5" >0.372000</td>
750
+ <td id="T_7aef5_row61_col6" class="data row61 col6" >0.452000</td>
751
  </tr>
752
  <tr>
753
+ <th id="T_7aef5_level0_row64" class="row_heading level0 row64" >bbh_ruin_names.acc_norm</th>
754
+ <td id="T_7aef5_row64_col0" class="data row64 col0" >0.216000</td>
755
+ <td id="T_7aef5_row64_col1" class="data row64 col1" >0.424000</td>
756
+ <td id="T_7aef5_row64_col2" class="data row64 col2" >0.440000</td>
757
+ <td id="T_7aef5_row64_col3" class="data row64 col3" >0.232000</td>
758
+ <td id="T_7aef5_row64_col4" class="data row64 col4" >0.364000</td>
759
+ <td id="T_7aef5_row64_col5" class="data row64 col5" >0.504000</td>
760
+ <td id="T_7aef5_row64_col6" class="data row64 col6" >0.608000</td>
761
  </tr>
762
  <tr>
763
+ <th id="T_7aef5_level0_row67" class="row_heading level0 row67" >bbh_salient_translation_error_detection.acc_norm</th>
764
+ <td id="T_7aef5_row67_col0" class="data row67 col0" >0.132000</td>
765
+ <td id="T_7aef5_row67_col1" class="data row67 col1" >0.264000</td>
766
+ <td id="T_7aef5_row67_col2" class="data row67 col2" >0.272000</td>
767
+ <td id="T_7aef5_row67_col3" class="data row67 col3" >0.144000</td>
768
+ <td id="T_7aef5_row67_col4" class="data row67 col4" >0.220000</td>
769
+ <td id="T_7aef5_row67_col5" class="data row67 col5" >0.396000</td>
770
+ <td id="T_7aef5_row67_col6" class="data row67 col6" >0.372000</td>
771
  </tr>
772
  <tr>
773
+ <th id="T_7aef5_level0_row70" class="row_heading level0 row70" >bbh_snarks.acc_norm</th>
774
+ <td id="T_7aef5_row70_col0" class="data row70 col0" >0.500000</td>
775
+ <td id="T_7aef5_row70_col1" class="data row70 col1" >0.516854</td>
776
+ <td id="T_7aef5_row70_col2" class="data row70 col2" >0.657303</td>
777
+ <td id="T_7aef5_row70_col3" class="data row70 col3" >0.539326</td>
778
+ <td id="T_7aef5_row70_col4" class="data row70 col4" >0.477528</td>
779
+ <td id="T_7aef5_row70_col5" class="data row70 col5" >0.634831</td>
780
+ <td id="T_7aef5_row70_col6" class="data row70 col6" >0.606742</td>
781
  </tr>
782
  <tr>
783
+ <th id="T_7aef5_level0_row73" class="row_heading level0 row73" >bbh_sports_understanding.acc_norm</th>
784
+ <td id="T_7aef5_row73_col0" class="data row73 col0" >0.540000</td>
785
+ <td id="T_7aef5_row73_col1" class="data row73 col1" >0.564000</td>
786
+ <td id="T_7aef5_row73_col2" class="data row73 col2" >0.740000</td>
787
+ <td id="T_7aef5_row73_col3" class="data row73 col3" >0.524000</td>
788
+ <td id="T_7aef5_row73_col4" class="data row73 col4" >0.480000</td>
789
+ <td id="T_7aef5_row73_col5" class="data row73 col5" >0.752000</td>
790
+ <td id="T_7aef5_row73_col6" class="data row73 col6" >0.708000</td>
791
  </tr>
792
  <tr>
793
+ <th id="T_7aef5_level0_row76" class="row_heading level0 row76" >bbh_temporal_sequences.acc_norm</th>
794
+ <td id="T_7aef5_row76_col0" class="data row76 col0" >0.072000</td>
795
+ <td id="T_7aef5_row76_col1" class="data row76 col1" >0.132000</td>
796
+ <td id="T_7aef5_row76_col2" class="data row76 col2" >0.256000</td>
797
+ <td id="T_7aef5_row76_col3" class="data row76 col3" >0.240000</td>
798
+ <td id="T_7aef5_row76_col4" class="data row76 col4" >0.244000</td>
799
+ <td id="T_7aef5_row76_col5" class="data row76 col5" >0.096000</td>
800
+ <td id="T_7aef5_row76_col6" class="data row76 col6" >0.260000</td>
801
  </tr>
802
  <tr>
803
+ <th id="T_7aef5_level0_row79" class="row_heading level0 row79" >bbh_tracking_shuffled_objects_five_objects.acc_norm</th>
804
+ <td id="T_7aef5_row79_col0" class="data row79 col0" >0.212000</td>
805
+ <td id="T_7aef5_row79_col1" class="data row79 col1" >0.144000</td>
806
+ <td id="T_7aef5_row79_col2" class="data row79 col2" >0.160000</td>
807
+ <td id="T_7aef5_row79_col3" class="data row79 col3" >0.224000</td>
808
+ <td id="T_7aef5_row79_col4" class="data row79 col4" >0.112000</td>
809
+ <td id="T_7aef5_row79_col5" class="data row79 col5" >0.160000</td>
810
+ <td id="T_7aef5_row79_col6" class="data row79 col6" >0.152000</td>
811
  </tr>
812
  <tr>
813
+ <th id="T_7aef5_level0_row82" class="row_heading level0 row82" >bbh_tracking_shuffled_objects_seven_objects.acc_norm</th>
814
+ <td id="T_7aef5_row82_col0" class="data row82 col0" >0.168000</td>
815
+ <td id="T_7aef5_row82_col1" class="data row82 col1" >0.120000</td>
816
+ <td id="T_7aef5_row82_col2" class="data row82 col2" >0.176000</td>
817
+ <td id="T_7aef5_row82_col3" class="data row82 col3" >0.152000</td>
818
+ <td id="T_7aef5_row82_col4" class="data row82 col4" >0.120000</td>
819
+ <td id="T_7aef5_row82_col5" class="data row82 col5" >0.124000</td>
820
+ <td id="T_7aef5_row82_col6" class="data row82 col6" >0.144000</td>
821
  </tr>
822
  <tr>
823
+ <th id="T_7aef5_level0_row85" class="row_heading level0 row85" >bbh_tracking_shuffled_objects_three_objects.acc_norm</th>
824
+ <td id="T_7aef5_row85_col0" class="data row85 col0" >0.340000</td>
825
+ <td id="T_7aef5_row85_col1" class="data row85 col1" >0.320000</td>
826
+ <td id="T_7aef5_row85_col2" class="data row85 col2" >0.352000</td>
827
+ <td id="T_7aef5_row85_col3" class="data row85 col3" >0.332000</td>
828
+ <td id="T_7aef5_row85_col4" class="data row85 col4" >0.304000</td>
829
+ <td id="T_7aef5_row85_col5" class="data row85 col5" >0.332000</td>
830
+ <td id="T_7aef5_row85_col6" class="data row85 col6" >0.324000</td>
831
  </tr>
832
  <tr>
833
+ <th id="T_7aef5_level0_row88" class="row_heading level0 row88" >bbh_web_of_lies.acc_norm</th>
834
+ <td id="T_7aef5_row88_col0" class="data row88 col0" >0.488000</td>
835
+ <td id="T_7aef5_row88_col1" class="data row88 col1" >0.488000</td>
836
+ <td id="T_7aef5_row88_col2" class="data row88 col2" >0.496000</td>
837
+ <td id="T_7aef5_row88_col3" class="data row88 col3" >0.488000</td>
838
+ <td id="T_7aef5_row88_col4" class="data row88 col4" >0.488000</td>
839
+ <td id="T_7aef5_row88_col5" class="data row88 col5" >0.488000</td>
840
+ <td id="T_7aef5_row88_col6" class="data row88 col6" >0.532000</td>
841
  </tr>
842
  <tr>
843
+ <th id="T_7aef5_level0_row90" class="row_heading level0 row90" >gpqa.acc_norm</th>
844
+ <td id="T_7aef5_row90_col0" class="data row90 col0" >0.296980</td>
845
+ <td id="T_7aef5_row90_col1" class="data row90 col1" >0.337248</td>
846
+ <td id="T_7aef5_row90_col2" class="data row90 col2" >0.269295</td>
847
+ <td id="T_7aef5_row90_col3" class="data row90 col3" >0.255872</td>
848
+ <td id="T_7aef5_row90_col4" class="data row90 col4" >0.279362</td>
849
+ <td id="T_7aef5_row90_col5" class="data row90 col5" >0.296141</td>
850
+ <td id="T_7aef5_row90_col6" class="data row90 col6" >0.278523</td>
851
  </tr>
852
  <tr>
853
+ <th id="T_7aef5_level0_row94" class="row_heading level0 row94" >gpqa_diamond.acc_norm</th>
854
+ <td id="T_7aef5_row94_col0" class="data row94 col0" >0.267677</td>
855
+ <td id="T_7aef5_row94_col1" class="data row94 col1" >0.318182</td>
856
+ <td id="T_7aef5_row94_col2" class="data row94 col2" >0.267677</td>
857
+ <td id="T_7aef5_row94_col3" class="data row94 col3" >0.237374</td>
858
+ <td id="T_7aef5_row94_col4" class="data row94 col4" >0.232323</td>
859
+ <td id="T_7aef5_row94_col5" class="data row94 col5" >0.242424</td>
860
+ <td id="T_7aef5_row94_col6" class="data row94 col6" >0.257576</td>
861
  </tr>
862
  <tr>
863
+ <th id="T_7aef5_level0_row97" class="row_heading level0 row97" >gpqa_extended.acc_norm</th>
864
+ <td id="T_7aef5_row97_col0" class="data row97 col0" >0.313187</td>
865
+ <td id="T_7aef5_row97_col1" class="data row97 col1" >0.351648</td>
866
+ <td id="T_7aef5_row97_col2" class="data row97 col2" >0.267399</td>
867
+ <td id="T_7aef5_row97_col3" class="data row97 col3" >0.239927</td>
868
+ <td id="T_7aef5_row97_col4" class="data row97 col4" >0.276557</td>
869
+ <td id="T_7aef5_row97_col5" class="data row97 col5" >0.305861</td>
870
+ <td id="T_7aef5_row97_col6" class="data row97 col6" >0.293040</td>
871
  </tr>
872
  <tr>
873
+ <th id="T_7aef5_level0_row100" class="row_heading level0 row100" >gpqa_main.acc_norm</th>
874
+ <td id="T_7aef5_row100_col0" class="data row100 col0" >0.290179</td>
875
+ <td id="T_7aef5_row100_col1" class="data row100 col1" >0.328125</td>
876
+ <td id="T_7aef5_row100_col2" class="data row100 col2" >0.272321</td>
877
+ <td id="T_7aef5_row100_col3" class="data row100 col3" >0.283482</td>
878
+ <td id="T_7aef5_row100_col4" class="data row100 col4" >0.303571</td>
879
+ <td id="T_7aef5_row100_col5" class="data row100 col5" >0.308036</td>
880
+ <td id="T_7aef5_row100_col6" class="data row100 col6" >0.270089</td>
881
  </tr>
882
  <tr>
883
+ <th id="T_7aef5_level0_row103" class="row_heading level0 row103" >ifeval.prompt_level_strict_acc</th>
884
+ <td id="T_7aef5_row103_col0" class="data row103 col0" >0.188540</td>
885
+ <td id="T_7aef5_row103_col1" class="data row103 col1" >0.249538</td>
886
+ <td id="T_7aef5_row103_col2" class="data row103 col2" >0.227357</td>
887
+ <td id="T_7aef5_row103_col3" class="data row103 col3" >0.275416</td>
888
+ <td id="T_7aef5_row103_col4" class="data row103 col4" >0.332717</td>
889
+ <td id="T_7aef5_row103_col5" class="data row103 col5" >0.081331</td>
890
+ <td id="T_7aef5_row103_col6" class="data row103 col6" >0.696858</td>
891
  </tr>
892
  <tr>
893
+ <th id="T_7aef5_level0_row105" class="row_heading level0 row105" >ifeval.inst_level_strict_acc</th>
894
+ <td id="T_7aef5_row105_col0" class="data row105 col0" >0.297362</td>
895
+ <td id="T_7aef5_row105_col1" class="data row105 col1" >0.381295</td>
896
+ <td id="T_7aef5_row105_col2" class="data row105 col2" >0.351319</td>
897
+ <td id="T_7aef5_row105_col3" class="data row105 col3" >0.417266</td>
898
+ <td id="T_7aef5_row105_col4" class="data row105 col4" >0.474820</td>
899
+ <td id="T_7aef5_row105_col5" class="data row105 col5" >0.172662</td>
900
+ <td id="T_7aef5_row105_col6" class="data row105 col6" >0.781775</td>
901
  </tr>
902
  <tr>
903
+ <th id="T_7aef5_level0_row107" class="row_heading level0 row107" >ifeval.prompt_level_loose_acc</th>
904
+ <td id="T_7aef5_row107_col0" class="data row107 col0" >0.214418</td>
905
+ <td id="T_7aef5_row107_col1" class="data row107 col1" >0.314233</td>
906
+ <td id="T_7aef5_row107_col2" class="data row107 col2" >0.243993</td>
907
+ <td id="T_7aef5_row107_col3" class="data row107 col3" >0.280961</td>
908
+ <td id="T_7aef5_row107_col4" class="data row107 col4" >0.340111</td>
909
+ <td id="T_7aef5_row107_col5" class="data row107 col5" >0.092421</td>
910
+ <td id="T_7aef5_row107_col6" class="data row107 col6" >0.757856</td>
911
  </tr>
912
  <tr>
913
+ <th id="T_7aef5_level0_row109" class="row_heading level0 row109" >ifeval.inst_level_loose_acc</th>
914
+ <td id="T_7aef5_row109_col0" class="data row109 col0" >0.323741</td>
915
+ <td id="T_7aef5_row109_col1" class="data row109 col1" >0.437650</td>
916
+ <td id="T_7aef5_row109_col2" class="data row109 col2" >0.368106</td>
917
+ <td id="T_7aef5_row109_col3" class="data row109 col3" >0.423261</td>
918
+ <td id="T_7aef5_row109_col4" class="data row109 col4" >0.484412</td>
919
+ <td id="T_7aef5_row109_col5" class="data row109 col5" >0.179856</td>
920
+ <td id="T_7aef5_row109_col6" class="data row109 col6" >0.826139</td>
921
  </tr>
922
  <tr>
923
+ <th id="T_7aef5_level0_row111" class="row_heading level0 row111" >math_hard.exact_match</th>
924
+ <td id="T_7aef5_row111_col0" class="data row111 col0" >0.258308</td>
925
+ <td id="T_7aef5_row111_col1" class="data row111 col1" >0.388973</td>
926
+ <td id="T_7aef5_row111_col2" class="data row111 col2" >0.015861</td>
927
+ <td id="T_7aef5_row111_col3" class="data row111 col3" >0.000000</td>
928
+ <td id="T_7aef5_row111_col4" class="data row111 col4" >0.000000</td>
929
+ <td id="T_7aef5_row111_col5" class="data row111 col5" >0.051360</td>
930
+ <td id="T_7aef5_row111_col6" class="data row111 col6" >0.171450</td>
931
  </tr>
932
  <tr>
933
+ <th id="T_7aef5_level0_row115" class="row_heading level0 row115" >math_algebra_hard.exact_match</th>
934
+ <td id="T_7aef5_row115_col0" class="data row115 col0" >0.501629</td>
935
+ <td id="T_7aef5_row115_col1" class="data row115 col1" >0.618893</td>
936
+ <td id="T_7aef5_row115_col2" class="data row115 col2" >0.026059</td>
937
+ <td id="T_7aef5_row115_col3" class="data row115 col3" >0.000000</td>
938
+ <td id="T_7aef5_row115_col4" class="data row115 col4" >0.000000</td>
939
+ <td id="T_7aef5_row115_col5" class="data row115 col5" >0.100977</td>
940
+ <td id="T_7aef5_row115_col6" class="data row115 col6" >0.351792</td>
941
  </tr>
942
  <tr>
943
+ <th id="T_7aef5_level0_row118" class="row_heading level0 row118" >math_counting_and_prob_hard.exact_match</th>
944
+ <td id="T_7aef5_row118_col0" class="data row118 col0" >0.162602</td>
945
+ <td id="T_7aef5_row118_col1" class="data row118 col1" >0.349593</td>
946
+ <td id="T_7aef5_row118_col2" class="data row118 col2" >0.024390</td>
947
+ <td id="T_7aef5_row118_col3" class="data row118 col3" >0.000000</td>
948
+ <td id="T_7aef5_row118_col4" class="data row118 col4" >0.000000</td>
949
+ <td id="T_7aef5_row118_col5" class="data row118 col5" >0.016260</td>
950
+ <td id="T_7aef5_row118_col6" class="data row118 col6" >0.130081</td>
951
  </tr>
952
  <tr>
953
+ <th id="T_7aef5_level0_row121" class="row_heading level0 row121" >math_geometry_hard.exact_match</th>
954
+ <td id="T_7aef5_row121_col0" class="data row121 col0" >0.121212</td>
955
+ <td id="T_7aef5_row121_col1" class="data row121 col1" >0.250000</td>
956
+ <td id="T_7aef5_row121_col2" class="data row121 col2" >0.030303</td>
957
+ <td id="T_7aef5_row121_col3" class="data row121 col3" >0.000000</td>
958
+ <td id="T_7aef5_row121_col4" class="data row121 col4" >0.000000</td>
959
+ <td id="T_7aef5_row121_col5" class="data row121 col5" >0.022727</td>
960
+ <td id="T_7aef5_row121_col6" class="data row121 col6" >0.083333</td>
961
  </tr>
962
  <tr>
963
+ <th id="T_7aef5_level0_row124" class="row_heading level0 row124" >math_intermediate_algebra_hard.exact_match</th>
964
+ <td id="T_7aef5_row124_col0" class="data row124 col0" >0.064286</td>
965
+ <td id="T_7aef5_row124_col1" class="data row124 col1" >0.153571</td>
966
+ <td id="T_7aef5_row124_col2" class="data row124 col2" >0.007143</td>
967
+ <td id="T_7aef5_row124_col3" class="data row124 col3" >0.000000</td>
968
+ <td id="T_7aef5_row124_col4" class="data row124 col4" >0.000000</td>
969
+ <td id="T_7aef5_row124_col5" class="data row124 col5" >0.007143</td>
970
+ <td id="T_7aef5_row124_col6" class="data row124 col6" >0.028571</td>
971
  </tr>
972
  <tr>
973
+ <th id="T_7aef5_level0_row127" class="row_heading level0 row127" >math_num_theory_hard.exact_match</th>
974
+ <td id="T_7aef5_row127_col0" class="data row127 col0" >0.266234</td>
975
+ <td id="T_7aef5_row127_col1" class="data row127 col1" >0.415584</td>
976
+ <td id="T_7aef5_row127_col2" class="data row127 col2" >0.012987</td>
977
+ <td id="T_7aef5_row127_col3" class="data row127 col3" >0.000000</td>
978
+ <td id="T_7aef5_row127_col4" class="data row127 col4" >0.000000</td>
979
+ <td id="T_7aef5_row127_col5" class="data row127 col5" >0.045455</td>
980
+ <td id="T_7aef5_row127_col6" class="data row127 col6" >0.097403</td>
981
  </tr>
982
  <tr>
983
+ <th id="T_7aef5_level0_row130" class="row_heading level0 row130" >math_prealgebra_hard.exact_match</th>
984
+ <td id="T_7aef5_row130_col0" class="data row130 col0" >0.430052</td>
985
+ <td id="T_7aef5_row130_col1" class="data row130 col1" >0.647668</td>
986
+ <td id="T_7aef5_row130_col2" class="data row130 col2" >0.005181</td>
987
+ <td id="T_7aef5_row130_col3" class="data row130 col3" >0.000000</td>
988
+ <td id="T_7aef5_row130_col4" class="data row130 col4" >0.000000</td>
989
+ <td id="T_7aef5_row130_col5" class="data row130 col5" >0.103627</td>
990
+ <td id="T_7aef5_row130_col6" class="data row130 col6" >0.326425</td>
991
  </tr>
992
  <tr>
993
+ <th id="T_7aef5_level0_row133" class="row_heading level0 row133" >math_precalculus_hard.exact_match</th>
994
+ <td id="T_7aef5_row133_col0" class="data row133 col0" >0.074074</td>
995
+ <td id="T_7aef5_row133_col1" class="data row133 col1" >0.125926</td>
996
+ <td id="T_7aef5_row133_col2" class="data row133 col2" >0.007407</td>
997
+ <td id="T_7aef5_row133_col3" class="data row133 col3" >0.000000</td>
998
+ <td id="T_7aef5_row133_col4" class="data row133 col4" >0.000000</td>
999
+ <td id="T_7aef5_row133_col5" class="data row133 col5" >0.022222</td>
1000
+ <td id="T_7aef5_row133_col6" class="data row133 col6" >0.044444</td>
1001
  </tr>
1002
  <tr>
1003
+ <th id="T_7aef5_level0_row136" class="row_heading level0 row136" >mmlu_pro.acc</th>
1004
+ <td id="T_7aef5_row136_col0" class="data row136 col0" >0.192487</td>
1005
+ <td id="T_7aef5_row136_col1" class="data row136 col1" >0.297041</td>
1006
+ <td id="T_7aef5_row136_col2" class="data row136 col2" >0.299119</td>
1007
+ <td id="T_7aef5_row136_col3" class="data row136 col3" >0.118684</td>
1008
+ <td id="T_7aef5_row136_col4" class="data row136 col4" >0.232131</td>
1009
+ <td id="T_7aef5_row136_col5" class="data row136 col5" >0.324551</td>
1010
+ <td id="T_7aef5_row136_col6" class="data row136 col6" >0.319481</td>
1011
  </tr>
1012
  <tr>
1013
+ <th id="T_7aef5_level0_row138" class="row_heading level0 row138" >musr.acc_norm</th>
1014
+ <td id="T_7aef5_row138_col0" class="data row138 col0" >0.334656</td>
1015
+ <td id="T_7aef5_row138_col1" class="data row138 col1" >0.390212</td>
1016
+ <td id="T_7aef5_row138_col2" class="data row138 col2" >0.392857</td>
1017
+ <td id="T_7aef5_row138_col3" class="data row138 col3" >0.362434</td>
1018
+ <td id="T_7aef5_row138_col4" class="data row138 col4" >0.365079</td>
1019
+ <td id="T_7aef5_row138_col5" class="data row138 col5" >0.382275</td>
1020
+ <td id="T_7aef5_row138_col6" class="data row138 col6" >0.351852</td>
1021
  </tr>
1022
  <tr>
1023
+ <th id="T_7aef5_level0_row142" class="row_heading level0 row142" >musr_murder_mysteries.acc_norm</th>
1024
+ <td id="T_7aef5_row142_col0" class="data row142 col0" >0.504000</td>
1025
+ <td id="T_7aef5_row142_col1" class="data row142 col1" >0.520000</td>
1026
+ <td id="T_7aef5_row142_col2" class="data row142 col2" >0.524000</td>
1027
+ <td id="T_7aef5_row142_col3" class="data row142 col3" >0.492000</td>
1028
+ <td id="T_7aef5_row142_col4" class="data row142 col4" >0.492000</td>
1029
+ <td id="T_7aef5_row142_col5" class="data row142 col5" >0.540000</td>
1030
+ <td id="T_7aef5_row142_col6" class="data row142 col6" >0.504000</td>
1031
  </tr>
1032
  <tr>
1033
+ <th id="T_7aef5_level0_row145" class="row_heading level0 row145" >musr_object_placements.acc_norm</th>
1034
+ <td id="T_7aef5_row145_col0" class="data row145 col0" >0.238281</td>
1035
+ <td id="T_7aef5_row145_col1" class="data row145 col1" >0.281250</td>
1036
+ <td id="T_7aef5_row145_col2" class="data row145 col2" >0.289062</td>
1037
+ <td id="T_7aef5_row145_col3" class="data row145 col3" >0.234375</td>
1038
+ <td id="T_7aef5_row145_col4" class="data row145 col4" >0.214844</td>
1039
+ <td id="T_7aef5_row145_col5" class="data row145 col5" >0.351562</td>
1040
+ <td id="T_7aef5_row145_col6" class="data row145 col6" >0.226562</td>
1041
  </tr>
1042
  <tr>
1043
+ <th id="T_7aef5_level0_row148" class="row_heading level0 row148" >musr_team_allocation.acc_norm</th>
1044
+ <td id="T_7aef5_row148_col0" class="data row148 col0" >0.264000</td>
1045
+ <td id="T_7aef5_row148_col1" class="data row148 col1" >0.372000</td>
1046
+ <td id="T_7aef5_row148_col2" class="data row148 col2" >0.368000</td>
1047
+ <td id="T_7aef5_row148_col3" class="data row148 col3" >0.364000</td>
1048
+ <td id="T_7aef5_row148_col4" class="data row148 col4" >0.392000</td>
1049
+ <td id="T_7aef5_row148_col5" class="data row148 col5" >0.256000</td>
1050
+ <td id="T_7aef5_row148_col6" class="data row148 col6" >0.328000</td>
1051
  </tr>
1052
  </tbody>
1053
  </table>