File size: 34,538 Bytes
4d9b54f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
{
  "results": {
    "harness|arc:challenge|25": {
      "acc": 0.6049488054607508,
      "acc_stderr": 0.01428589829293817,
      "acc_norm": 0.6459044368600683,
      "acc_norm_stderr": 0.013975454122756564
    },
    "harness|hellaswag|10": {
      "acc": 0.6693885680143398,
      "acc_stderr": 0.004694718918225751,
      "acc_norm": 0.8587930691097391,
      "acc_norm_stderr": 0.003475231889452833
    },
    "harness|truthfulqa:mc|0": {
      "mc1": 0.3561811505507956,
      "mc1_stderr": 0.016763790728446335,
      "mc2": 0.5280473232260097,
      "mc2_stderr": 0.01553022126123046
    },
    "harness|hendrycksTest-abstract_algebra|5": {
      "acc": 0.35,
      "acc_stderr": 0.04793724854411021,
      "acc_norm": 0.35,
      "acc_norm_stderr": 0.04793724854411021
    },
    "harness|hendrycksTest-anatomy|5": {
      "acc": 0.5185185185185185,
      "acc_stderr": 0.043163785995113245,
      "acc_norm": 0.5185185185185185,
      "acc_norm_stderr": 0.043163785995113245
    },
    "harness|hendrycksTest-astronomy|5": {
      "acc": 0.7302631578947368,
      "acc_stderr": 0.03611780560284898,
      "acc_norm": 0.7302631578947368,
      "acc_norm_stderr": 0.03611780560284898
    },
    "harness|hendrycksTest-business_ethics|5": {
      "acc": 0.65,
      "acc_stderr": 0.0479372485441102,
      "acc_norm": 0.65,
      "acc_norm_stderr": 0.0479372485441102
    },
    "harness|hendrycksTest-clinical_knowledge|5": {
      "acc": 0.6377358490566037,
      "acc_stderr": 0.029582245128384303,
      "acc_norm": 0.6377358490566037,
      "acc_norm_stderr": 0.029582245128384303
    },
    "harness|hendrycksTest-college_biology|5": {
      "acc": 0.75,
      "acc_stderr": 0.03621034121889507,
      "acc_norm": 0.75,
      "acc_norm_stderr": 0.03621034121889507
    },
    "harness|hendrycksTest-college_chemistry|5": {
      "acc": 0.48,
      "acc_stderr": 0.050211673156867795,
      "acc_norm": 0.48,
      "acc_norm_stderr": 0.050211673156867795
    },
    "harness|hendrycksTest-college_computer_science|5": {
      "acc": 0.59,
      "acc_stderr": 0.04943110704237101,
      "acc_norm": 0.59,
      "acc_norm_stderr": 0.04943110704237101
    },
    "harness|hendrycksTest-college_mathematics|5": {
      "acc": 0.34,
      "acc_stderr": 0.04760952285695235,
      "acc_norm": 0.34,
      "acc_norm_stderr": 0.04760952285695235
    },
    "harness|hendrycksTest-college_medicine|5": {
      "acc": 0.6011560693641619,
      "acc_stderr": 0.0373362665538351,
      "acc_norm": 0.6011560693641619,
      "acc_norm_stderr": 0.0373362665538351
    },
    "harness|hendrycksTest-college_physics|5": {
      "acc": 0.3333333333333333,
      "acc_stderr": 0.04690650298201943,
      "acc_norm": 0.3333333333333333,
      "acc_norm_stderr": 0.04690650298201943
    },
    "harness|hendrycksTest-computer_security|5": {
      "acc": 0.71,
      "acc_stderr": 0.045604802157206845,
      "acc_norm": 0.71,
      "acc_norm_stderr": 0.045604802157206845
    },
    "harness|hendrycksTest-conceptual_physics|5": {
      "acc": 0.5829787234042553,
      "acc_stderr": 0.032232762667117124,
      "acc_norm": 0.5829787234042553,
      "acc_norm_stderr": 0.032232762667117124
    },
    "harness|hendrycksTest-econometrics|5": {
      "acc": 0.41228070175438597,
      "acc_stderr": 0.04630653203366595,
      "acc_norm": 0.41228070175438597,
      "acc_norm_stderr": 0.04630653203366595
    },
    "harness|hendrycksTest-electrical_engineering|5": {
      "acc": 0.5793103448275863,
      "acc_stderr": 0.0411391498118926,
      "acc_norm": 0.5793103448275863,
      "acc_norm_stderr": 0.0411391498118926
    },
    "harness|hendrycksTest-elementary_mathematics|5": {
      "acc": 0.41005291005291006,
      "acc_stderr": 0.02533120243894442,
      "acc_norm": 0.41005291005291006,
      "acc_norm_stderr": 0.02533120243894442
    },
    "harness|hendrycksTest-formal_logic|5": {
      "acc": 0.4126984126984127,
      "acc_stderr": 0.04403438954768176,
      "acc_norm": 0.4126984126984127,
      "acc_norm_stderr": 0.04403438954768176
    },
    "harness|hendrycksTest-global_facts|5": {
      "acc": 0.43,
      "acc_stderr": 0.049756985195624284,
      "acc_norm": 0.43,
      "acc_norm_stderr": 0.049756985195624284
    },
    "harness|hendrycksTest-high_school_biology|5": {
      "acc": 0.7645161290322581,
      "acc_stderr": 0.02413763242933771,
      "acc_norm": 0.7645161290322581,
      "acc_norm_stderr": 0.02413763242933771
    },
    "harness|hendrycksTest-high_school_chemistry|5": {
      "acc": 0.4630541871921182,
      "acc_stderr": 0.035083705204426656,
      "acc_norm": 0.4630541871921182,
      "acc_norm_stderr": 0.035083705204426656
    },
    "harness|hendrycksTest-high_school_computer_science|5": {
      "acc": 0.65,
      "acc_stderr": 0.047937248544110196,
      "acc_norm": 0.65,
      "acc_norm_stderr": 0.047937248544110196
    },
    "harness|hendrycksTest-high_school_european_history|5": {
      "acc": 0.8181818181818182,
      "acc_stderr": 0.03011768892950359,
      "acc_norm": 0.8181818181818182,
      "acc_norm_stderr": 0.03011768892950359
    },
    "harness|hendrycksTest-high_school_geography|5": {
      "acc": 0.8080808080808081,
      "acc_stderr": 0.02805779167298902,
      "acc_norm": 0.8080808080808081,
      "acc_norm_stderr": 0.02805779167298902
    },
    "harness|hendrycksTest-high_school_government_and_politics|5": {
      "acc": 0.8911917098445595,
      "acc_stderr": 0.022473253332768783,
      "acc_norm": 0.8911917098445595,
      "acc_norm_stderr": 0.022473253332768783
    },
    "harness|hendrycksTest-high_school_macroeconomics|5": {
      "acc": 0.6410256410256411,
      "acc_stderr": 0.02432173848460235,
      "acc_norm": 0.6410256410256411,
      "acc_norm_stderr": 0.02432173848460235
    },
    "harness|hendrycksTest-high_school_mathematics|5": {
      "acc": 0.3,
      "acc_stderr": 0.027940457136228416,
      "acc_norm": 0.3,
      "acc_norm_stderr": 0.027940457136228416
    },
    "harness|hendrycksTest-high_school_microeconomics|5": {
      "acc": 0.6596638655462185,
      "acc_stderr": 0.030778057422931673,
      "acc_norm": 0.6596638655462185,
      "acc_norm_stderr": 0.030778057422931673
    },
    "harness|hendrycksTest-high_school_physics|5": {
      "acc": 0.423841059602649,
      "acc_stderr": 0.04034846678603397,
      "acc_norm": 0.423841059602649,
      "acc_norm_stderr": 0.04034846678603397
    },
    "harness|hendrycksTest-high_school_psychology|5": {
      "acc": 0.8385321100917431,
      "acc_stderr": 0.015776239256163255,
      "acc_norm": 0.8385321100917431,
      "acc_norm_stderr": 0.015776239256163255
    },
    "harness|hendrycksTest-high_school_statistics|5": {
      "acc": 0.48148148148148145,
      "acc_stderr": 0.03407632093854052,
      "acc_norm": 0.48148148148148145,
      "acc_norm_stderr": 0.03407632093854052
    },
    "harness|hendrycksTest-high_school_us_history|5": {
      "acc": 0.8578431372549019,
      "acc_stderr": 0.024509803921568606,
      "acc_norm": 0.8578431372549019,
      "acc_norm_stderr": 0.024509803921568606
    },
    "harness|hendrycksTest-high_school_world_history|5": {
      "acc": 0.8438818565400844,
      "acc_stderr": 0.02362715946031867,
      "acc_norm": 0.8438818565400844,
      "acc_norm_stderr": 0.02362715946031867
    },
    "harness|hendrycksTest-human_aging|5": {
      "acc": 0.726457399103139,
      "acc_stderr": 0.02991858670779883,
      "acc_norm": 0.726457399103139,
      "acc_norm_stderr": 0.02991858670779883
    },
    "harness|hendrycksTest-human_sexuality|5": {
      "acc": 0.7099236641221374,
      "acc_stderr": 0.039800662464677665,
      "acc_norm": 0.7099236641221374,
      "acc_norm_stderr": 0.039800662464677665
    },
    "harness|hendrycksTest-international_law|5": {
      "acc": 0.8016528925619835,
      "acc_stderr": 0.03640118271990946,
      "acc_norm": 0.8016528925619835,
      "acc_norm_stderr": 0.03640118271990946
    },
    "harness|hendrycksTest-jurisprudence|5": {
      "acc": 0.8240740740740741,
      "acc_stderr": 0.036809181416738807,
      "acc_norm": 0.8240740740740741,
      "acc_norm_stderr": 0.036809181416738807
    },
    "harness|hendrycksTest-logical_fallacies|5": {
      "acc": 0.7607361963190185,
      "acc_stderr": 0.033519538795212696,
      "acc_norm": 0.7607361963190185,
      "acc_norm_stderr": 0.033519538795212696
    },
    "harness|hendrycksTest-machine_learning|5": {
      "acc": 0.48214285714285715,
      "acc_stderr": 0.047427623612430116,
      "acc_norm": 0.48214285714285715,
      "acc_norm_stderr": 0.047427623612430116
    },
    "harness|hendrycksTest-management|5": {
      "acc": 0.8058252427184466,
      "acc_stderr": 0.03916667762822584,
      "acc_norm": 0.8058252427184466,
      "acc_norm_stderr": 0.03916667762822584
    },
    "harness|hendrycksTest-marketing|5": {
      "acc": 0.8717948717948718,
      "acc_stderr": 0.02190190511507332,
      "acc_norm": 0.8717948717948718,
      "acc_norm_stderr": 0.02190190511507332
    },
    "harness|hendrycksTest-medical_genetics|5": {
      "acc": 0.65,
      "acc_stderr": 0.047937248544110196,
      "acc_norm": 0.65,
      "acc_norm_stderr": 0.047937248544110196
    },
    "harness|hendrycksTest-miscellaneous|5": {
      "acc": 0.8275862068965517,
      "acc_stderr": 0.013507943909371798,
      "acc_norm": 0.8275862068965517,
      "acc_norm_stderr": 0.013507943909371798
    },
    "harness|hendrycksTest-moral_disputes|5": {
      "acc": 0.7167630057803468,
      "acc_stderr": 0.02425790170532338,
      "acc_norm": 0.7167630057803468,
      "acc_norm_stderr": 0.02425790170532338
    },
    "harness|hendrycksTest-moral_scenarios|5": {
      "acc": 0.39553072625698327,
      "acc_stderr": 0.01635341541007577,
      "acc_norm": 0.39553072625698327,
      "acc_norm_stderr": 0.01635341541007577
    },
    "harness|hendrycksTest-nutrition|5": {
      "acc": 0.6993464052287581,
      "acc_stderr": 0.026256053835718968,
      "acc_norm": 0.6993464052287581,
      "acc_norm_stderr": 0.026256053835718968
    },
    "harness|hendrycksTest-philosophy|5": {
      "acc": 0.7041800643086816,
      "acc_stderr": 0.02592237178881877,
      "acc_norm": 0.7041800643086816,
      "acc_norm_stderr": 0.02592237178881877
    },
    "harness|hendrycksTest-prehistory|5": {
      "acc": 0.7098765432098766,
      "acc_stderr": 0.025251173936495036,
      "acc_norm": 0.7098765432098766,
      "acc_norm_stderr": 0.025251173936495036
    },
    "harness|hendrycksTest-professional_accounting|5": {
      "acc": 0.5070921985815603,
      "acc_stderr": 0.02982449855912901,
      "acc_norm": 0.5070921985815603,
      "acc_norm_stderr": 0.02982449855912901
    },
    "harness|hendrycksTest-professional_law|5": {
      "acc": 0.4771838331160365,
      "acc_stderr": 0.012756933382823694,
      "acc_norm": 0.4771838331160365,
      "acc_norm_stderr": 0.012756933382823694
    },
    "harness|hendrycksTest-professional_medicine|5": {
      "acc": 0.5772058823529411,
      "acc_stderr": 0.030008562845003476,
      "acc_norm": 0.5772058823529411,
      "acc_norm_stderr": 0.030008562845003476
    },
    "harness|hendrycksTest-professional_psychology|5": {
      "acc": 0.6699346405228758,
      "acc_stderr": 0.019023726160724556,
      "acc_norm": 0.6699346405228758,
      "acc_norm_stderr": 0.019023726160724556
    },
    "harness|hendrycksTest-public_relations|5": {
      "acc": 0.6909090909090909,
      "acc_stderr": 0.044262946482000985,
      "acc_norm": 0.6909090909090909,
      "acc_norm_stderr": 0.044262946482000985
    },
    "harness|hendrycksTest-security_studies|5": {
      "acc": 0.7877551020408163,
      "acc_stderr": 0.026176967197866767,
      "acc_norm": 0.7877551020408163,
      "acc_norm_stderr": 0.026176967197866767
    },
    "harness|hendrycksTest-sociology|5": {
      "acc": 0.8706467661691543,
      "acc_stderr": 0.023729830881018526,
      "acc_norm": 0.8706467661691543,
      "acc_norm_stderr": 0.023729830881018526
    },
    "harness|hendrycksTest-us_foreign_policy|5": {
      "acc": 0.87,
      "acc_stderr": 0.03379976689896309,
      "acc_norm": 0.87,
      "acc_norm_stderr": 0.03379976689896309
    },
    "harness|hendrycksTest-virology|5": {
      "acc": 0.5120481927710844,
      "acc_stderr": 0.03891364495835817,
      "acc_norm": 0.5120481927710844,
      "acc_norm_stderr": 0.03891364495835817
    },
    "harness|hendrycksTest-world_religions|5": {
      "acc": 0.8187134502923976,
      "acc_stderr": 0.029547741687640038,
      "acc_norm": 0.8187134502923976,
      "acc_norm_stderr": 0.029547741687640038
    },
    "all": {
      "acc": 0.6390701952816291,
      "acc_stderr": 0.03365809160773111,
      "acc_norm": 0.6390701952816291,
      "acc_norm_stderr": 0.03365809160773111
    }
  },
  "versions": {
    "harness|arc:challenge|25": 0,
    "harness|hellaswag|10": 0,
    "harness|truthfulqa:mc|0": 1,
    "harness|hendrycksTest-abstract_algebra|5": 1,
    "harness|hendrycksTest-anatomy|5": 1,
    "harness|hendrycksTest-astronomy|5": 1,
    "harness|hendrycksTest-business_ethics|5": 1,
    "harness|hendrycksTest-clinical_knowledge|5": 1,
    "harness|hendrycksTest-college_biology|5": 1,
    "harness|hendrycksTest-college_chemistry|5": 1,
    "harness|hendrycksTest-college_computer_science|5": 1,
    "harness|hendrycksTest-college_mathematics|5": 1,
    "harness|hendrycksTest-college_medicine|5": 1,
    "harness|hendrycksTest-college_physics|5": 1,
    "harness|hendrycksTest-computer_security|5": 1,
    "harness|hendrycksTest-conceptual_physics|5": 1,
    "harness|hendrycksTest-econometrics|5": 1,
    "harness|hendrycksTest-electrical_engineering|5": 1,
    "harness|hendrycksTest-elementary_mathematics|5": 1,
    "harness|hendrycksTest-formal_logic|5": 1,
    "harness|hendrycksTest-global_facts|5": 1,
    "harness|hendrycksTest-high_school_biology|5": 1,
    "harness|hendrycksTest-high_school_chemistry|5": 1,
    "harness|hendrycksTest-high_school_computer_science|5": 1,
    "harness|hendrycksTest-high_school_european_history|5": 1,
    "harness|hendrycksTest-high_school_geography|5": 1,
    "harness|hendrycksTest-high_school_government_and_politics|5": 1,
    "harness|hendrycksTest-high_school_macroeconomics|5": 1,
    "harness|hendrycksTest-high_school_mathematics|5": 1,
    "harness|hendrycksTest-high_school_microeconomics|5": 1,
    "harness|hendrycksTest-high_school_physics|5": 1,
    "harness|hendrycksTest-high_school_psychology|5": 1,
    "harness|hendrycksTest-high_school_statistics|5": 1,
    "harness|hendrycksTest-high_school_us_history|5": 1,
    "harness|hendrycksTest-high_school_world_history|5": 1,
    "harness|hendrycksTest-human_aging|5": 1,
    "harness|hendrycksTest-human_sexuality|5": 1,
    "harness|hendrycksTest-international_law|5": 1,
    "harness|hendrycksTest-jurisprudence|5": 1,
    "harness|hendrycksTest-logical_fallacies|5": 1,
    "harness|hendrycksTest-machine_learning|5": 1,
    "harness|hendrycksTest-management|5": 1,
    "harness|hendrycksTest-marketing|5": 1,
    "harness|hendrycksTest-medical_genetics|5": 1,
    "harness|hendrycksTest-miscellaneous|5": 1,
    "harness|hendrycksTest-moral_disputes|5": 1,
    "harness|hendrycksTest-moral_scenarios|5": 1,
    "harness|hendrycksTest-nutrition|5": 1,
    "harness|hendrycksTest-philosophy|5": 1,
    "harness|hendrycksTest-prehistory|5": 1,
    "harness|hendrycksTest-professional_accounting|5": 1,
    "harness|hendrycksTest-professional_law|5": 1,
    "harness|hendrycksTest-professional_medicine|5": 1,
    "harness|hendrycksTest-professional_psychology|5": 1,
    "harness|hendrycksTest-public_relations|5": 1,
    "harness|hendrycksTest-security_studies|5": 1,
    "harness|hendrycksTest-sociology|5": 1,
    "harness|hendrycksTest-us_foreign_policy|5": 1,
    "harness|hendrycksTest-virology|5": 1,
    "harness|hendrycksTest-world_religions|5": 1,
    "all": 0
  },
  "config": {
    "model_name": "meta-llama/Llama-2-70b-chat-hf",
    "model_sha": "7f54101c0fbb67a8143ca23eb8bd09b71f269c74",
    "model_dtype": "torch.float16",
    "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937",
    "num_few_shot_default": 0,
    "num_fewshot_seeds": 1,
    "override_batch_size": 1,
    "max_samples": null
  },
  "task_config": {
    "harness|arc:challenge": "LM Harness task",
    "harness|hellaswag": "LM Harness task",
    "harness|truthfulqa:mc": "LM Harness task",
    "harness|hendrycksTest-abstract_algebra": "LM Harness task",
    "harness|hendrycksTest-anatomy": "LM Harness task",
    "harness|hendrycksTest-astronomy": "LM Harness task",
    "harness|hendrycksTest-business_ethics": "LM Harness task",
    "harness|hendrycksTest-clinical_knowledge": "LM Harness task",
    "harness|hendrycksTest-college_biology": "LM Harness task",
    "harness|hendrycksTest-college_chemistry": "LM Harness task",
    "harness|hendrycksTest-college_computer_science": "LM Harness task",
    "harness|hendrycksTest-college_mathematics": "LM Harness task",
    "harness|hendrycksTest-college_medicine": "LM Harness task",
    "harness|hendrycksTest-college_physics": "LM Harness task",
    "harness|hendrycksTest-computer_security": "LM Harness task",
    "harness|hendrycksTest-conceptual_physics": "LM Harness task",
    "harness|hendrycksTest-econometrics": "LM Harness task",
    "harness|hendrycksTest-electrical_engineering": "LM Harness task",
    "harness|hendrycksTest-elementary_mathematics": "LM Harness task",
    "harness|hendrycksTest-formal_logic": "LM Harness task",
    "harness|hendrycksTest-global_facts": "LM Harness task",
    "harness|hendrycksTest-high_school_biology": "LM Harness task",
    "harness|hendrycksTest-high_school_chemistry": "LM Harness task",
    "harness|hendrycksTest-high_school_computer_science": "LM Harness task",
    "harness|hendrycksTest-high_school_european_history": "LM Harness task",
    "harness|hendrycksTest-high_school_geography": "LM Harness task",
    "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task",
    "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task",
    "harness|hendrycksTest-high_school_mathematics": "LM Harness task",
    "harness|hendrycksTest-high_school_microeconomics": "LM Harness task",
    "harness|hendrycksTest-high_school_physics": "LM Harness task",
    "harness|hendrycksTest-high_school_psychology": "LM Harness task",
    "harness|hendrycksTest-high_school_statistics": "LM Harness task",
    "harness|hendrycksTest-high_school_us_history": "LM Harness task",
    "harness|hendrycksTest-high_school_world_history": "LM Harness task",
    "harness|hendrycksTest-human_aging": "LM Harness task",
    "harness|hendrycksTest-human_sexuality": "LM Harness task",
    "harness|hendrycksTest-international_law": "LM Harness task",
    "harness|hendrycksTest-jurisprudence": "LM Harness task",
    "harness|hendrycksTest-logical_fallacies": "LM Harness task",
    "harness|hendrycksTest-machine_learning": "LM Harness task",
    "harness|hendrycksTest-management": "LM Harness task",
    "harness|hendrycksTest-marketing": "LM Harness task",
    "harness|hendrycksTest-medical_genetics": "LM Harness task",
    "harness|hendrycksTest-miscellaneous": "LM Harness task",
    "harness|hendrycksTest-moral_disputes": "LM Harness task",
    "harness|hendrycksTest-moral_scenarios": "LM Harness task",
    "harness|hendrycksTest-nutrition": "LM Harness task",
    "harness|hendrycksTest-philosophy": "LM Harness task",
    "harness|hendrycksTest-prehistory": "LM Harness task",
    "harness|hendrycksTest-professional_accounting": "LM Harness task",
    "harness|hendrycksTest-professional_law": "LM Harness task",
    "harness|hendrycksTest-professional_medicine": "LM Harness task",
    "harness|hendrycksTest-professional_psychology": "LM Harness task",
    "harness|hendrycksTest-public_relations": "LM Harness task",
    "harness|hendrycksTest-security_studies": "LM Harness task",
    "harness|hendrycksTest-sociology": "LM Harness task",
    "harness|hendrycksTest-us_foreign_policy": "LM Harness task",
    "harness|hendrycksTest-virology": "LM Harness task",
    "harness|hendrycksTest-world_religions": "LM Harness task"

  },
  "hashes": {
    "harness|arc:challenge|25": {
      "hash_examples": "fb8c51b1872daeda",
      "hash_full_prompts": "045cbb916e5145c6",
      "hash_input_tokens": "fab18a8dbccd885e",
      "hash_cont_tokens": "e8abf848493b50f7"
    },
    "harness|hellaswag|10": {
      "hash_examples": "e1768ecb99d7ecf0",
      "hash_full_prompts": "0b4c16983130f84f",
      "hash_input_tokens": "fd3d11be48664a7e",
      "hash_cont_tokens": "9fe0a5c42e1532db"
    },
    "harness|truthfulqa:mc|0": {
      "hash_examples": "23176c0531c7b867",
      "hash_full_prompts": "36a6d90e75d92d4a",
      "hash_input_tokens": "e3c2231820d87234",
      "hash_cont_tokens": "f5da56a132aab151"
    },
    "harness|hendrycksTest-abstract_algebra|5": {
      "hash_examples": "280f9f325b40559a",
      "hash_full_prompts": "2f776a367d23aea2",
      "hash_input_tokens": "c3792fce2534965f",
      "hash_cont_tokens": "50421e30bef398f9"
    },
    "harness|hendrycksTest-anatomy|5": {
      "hash_examples": "2f83a4f1cab4ba18",
      "hash_full_prompts": "516f74bef25df620",
      "hash_input_tokens": "1bfeea5736b995ee",
      "hash_cont_tokens": "f11971a765cb609f"
    },
    "harness|hendrycksTest-astronomy|5": {
      "hash_examples": "7d587b908da4d762",
      "hash_full_prompts": "faf4e80f65de93ca",
      "hash_input_tokens": "c4b2f1160f746871",
      "hash_cont_tokens": "440a970fadecdc7b"
    },
    "harness|hendrycksTest-business_ethics|5": {
      "hash_examples": "33e51740670de686",
      "hash_full_prompts": "db01c3ef8e1479d4",
      "hash_input_tokens": "b98d6ef1d1e2e17b",
      "hash_cont_tokens": "50421e30bef398f9"
    },
    "harness|hendrycksTest-clinical_knowledge|5": {
      "hash_examples": "f3366dbe7eefffa4",
      "hash_full_prompts": "49654f71d94b65c3",
      "hash_input_tokens": "9851119dacda883c",
      "hash_cont_tokens": "7ecd60c25b9bfe5b"
    },
    "harness|hendrycksTest-college_biology|5": {
      "hash_examples": "ca2b6753a0193e7f",
      "hash_full_prompts": "2b460b75f1fdfefd",
      "hash_input_tokens": "81a92a54cddefc2f",
      "hash_cont_tokens": "875cde3af7a0ee14"
    },
    "harness|hendrycksTest-college_chemistry|5": {
      "hash_examples": "22ff85f1d34f42d1",
      "hash_full_prompts": "242c9be6da583e95",
      "hash_input_tokens": "fd4c0cebdc2c1c3d",
      "hash_cont_tokens": "50421e30bef398f9"
    },
    "harness|hendrycksTest-college_computer_science|5": {
      "hash_examples": "30318289d717a5cf",
      "hash_full_prompts": "ed2bdb4e87c4b371",
      "hash_input_tokens": "49f6021f4c075e0d",
      "hash_cont_tokens": "50421e30bef398f9"
    },
    "harness|hendrycksTest-college_mathematics|5": {
      "hash_examples": "4944d1f0b6b5d911",
      "hash_full_prompts": "770bc4281c973190",
      "hash_input_tokens": "db61bad69399bfe8",
      "hash_cont_tokens": "50421e30bef398f9"
    },
    "harness|hendrycksTest-college_medicine|5": {
      "hash_examples": "dd69cc33381275af",
      "hash_full_prompts": "ad2a53e5250ab46e",
      "hash_input_tokens": "c458392f38424d77",
      "hash_cont_tokens": "702fb6d82ff0d6ac"
    },
    "harness|hendrycksTest-college_physics|5": {
      "hash_examples": "875dd26d22655b0d",
      "hash_full_prompts": "833a0d7b55aed500",
      "hash_input_tokens": "49cf4d8d8696b588",
      "hash_cont_tokens": "f7b8097afc16a47c"
    },
    "harness|hendrycksTest-computer_security|5": {
      "hash_examples": "006451eedc0ededb",
      "hash_full_prompts": "94034c97e85d8f46",
      "hash_input_tokens": "e81d46ca85fa2b7c",
      "hash_cont_tokens": "50421e30bef398f9"
    },
    "harness|hendrycksTest-conceptual_physics|5": {
      "hash_examples": "8874ece872d2ca4c",
      "hash_full_prompts": "e40d15a34640d6fa",
      "hash_input_tokens": "d5e231a26622e7d5",
      "hash_cont_tokens": "aa0e8bc655f2f641"
    },
    "harness|hendrycksTest-econometrics|5": {
      "hash_examples": "64d3623b0bfaa43f",
      "hash_full_prompts": "612f340fae41338d",
      "hash_input_tokens": "afa3603fd1622706",
      "hash_cont_tokens": "b1cc6e7e9fcd3827"
    },
    "harness|hendrycksTest-electrical_engineering|5": {
      "hash_examples": "e98f51780c674d7e",
      "hash_full_prompts": "10275b312d812ae6",
      "hash_input_tokens": "e0c62cf84ed22e7e",
      "hash_cont_tokens": "2425a3f084a591ef"
    },
    "harness|hendrycksTest-elementary_mathematics|5": {
      "hash_examples": "fc48208a5ac1c0ce",
      "hash_full_prompts": "5ec274c6c82aca23",
      "hash_input_tokens": "303123d2b857f30b",
      "hash_cont_tokens": "bd87bf0c060fd925"
    },
    "harness|hendrycksTest-formal_logic|5": {
      "hash_examples": "5a6525665f63ea72",
      "hash_full_prompts": "07b92638c4a6b500",
      "hash_input_tokens": "3fd8073b90b9736d",
      "hash_cont_tokens": "eb8932890e0605db"
    },
    "harness|hendrycksTest-global_facts|5": {
      "hash_examples": "371d70d743b2b89b",
      "hash_full_prompts": "332fdee50a1921b4",
      "hash_input_tokens": "f65051acd3210902",
      "hash_cont_tokens": "50421e30bef398f9"
    },
    "harness|hendrycksTest-high_school_biology|5": {
      "hash_examples": "a79e1018b1674052",
      "hash_full_prompts": "e624e26ede922561",
      "hash_input_tokens": "264263fc8c2123bc",
      "hash_cont_tokens": "1ddcb86d28cde266"
    },
    "harness|hendrycksTest-high_school_chemistry|5": {
      "hash_examples": "44bfc25c389f0e03",
      "hash_full_prompts": "0e3e5f5d9246482a",
      "hash_input_tokens": "42e1a18523b075e7",
      "hash_cont_tokens": "176c8dcff38c5f8f"
    },
    "harness|hendrycksTest-high_school_computer_science|5": {
      "hash_examples": "8b8cdb1084f24169",
      "hash_full_prompts": "c00487e67c1813cc",
      "hash_input_tokens": "6f109fbd505d364b",
      "hash_cont_tokens": "50421e30bef398f9"
    },
    "harness|hendrycksTest-high_school_european_history|5": {
      "hash_examples": "11cd32d0ef440171",
      "hash_full_prompts": "318f4513c537c6bf",
      "hash_input_tokens": "f1f73dd687da18d7",
      "hash_cont_tokens": "674fc454bdc5ac93"
    },
    "harness|hendrycksTest-high_school_geography|5": {
      "hash_examples": "b60019b9e80b642f",
      "hash_full_prompts": "ee5789fcc1a81b1e",
      "hash_input_tokens": "575ea4d290807e79",
      "hash_cont_tokens": "03a5012b916274ea"
    },
    "harness|hendrycksTest-high_school_government_and_politics|5": {
      "hash_examples": "d221ec983d143dc3",
      "hash_full_prompts": "ac42d888e1ce1155",
      "hash_input_tokens": "5954aff17f30959c",
      "hash_cont_tokens": "873d2aab226ba1d8"
    },
    "harness|hendrycksTest-high_school_macroeconomics|5": {
      "hash_examples": "59c2915cacfd3fbb",
      "hash_full_prompts": "c6bd9d25158abd0e",
      "hash_input_tokens": "cc4bb974def176ee",
      "hash_cont_tokens": "c583432ad27fcfe0"
    },
    "harness|hendrycksTest-high_school_mathematics|5": {
      "hash_examples": "1f8ac897608de342",
      "hash_full_prompts": "5d88f41fc2d643a8",
      "hash_input_tokens": "94100bcb23e1a13e",
      "hash_cont_tokens": "d7907b61bcb8c123"
    },
    "harness|hendrycksTest-high_school_microeconomics|5": {
      "hash_examples": "ead6a0f2f6c83370",
      "hash_full_prompts": "bfc393381298609e",
      "hash_input_tokens": "129c79724487131d",
      "hash_cont_tokens": "f47f041de50333b9"
    },
    "harness|hendrycksTest-high_school_physics|5": {
      "hash_examples": "c3f2025990afec64",
      "hash_full_prompts": "fc78b4997e436734",
      "hash_input_tokens": "82c2ac81ad5b141c",
      "hash_cont_tokens": "0d56317b3e5eedb5"
    },
    "harness|hendrycksTest-high_school_psychology|5": {
      "hash_examples": "21f8aab618f6d636",
      "hash_full_prompts": "d5c76aa40b9dbc43",
      "hash_input_tokens": "422b8bb7add88cc5",
      "hash_cont_tokens": "09ba1243e7390c0f"
    },
    "harness|hendrycksTest-high_school_statistics|5": {
      "hash_examples": "2386a60a11fc5de3",
      "hash_full_prompts": "4c5c8be5aafac432",
      "hash_input_tokens": "d3e6f7198120fbdc",
      "hash_cont_tokens": "9cc29889c3d3f77d"
    },
    "harness|hendrycksTest-high_school_us_history|5": {
      "hash_examples": "74961543be40f04f",
      "hash_full_prompts": "5d5ca4840131ba21",
      "hash_input_tokens": "50c9ff438c85a69e",
      "hash_cont_tokens": "cdd0b3dc06d933e5"
    },
    "harness|hendrycksTest-high_school_world_history|5": {
      "hash_examples": "2ad2f6b7198b2234",
      "hash_full_prompts": "11845057459afd72",
      "hash_input_tokens": "054824cc474caef5",
      "hash_cont_tokens": "e02816433ff28daf"
    },
    "harness|hendrycksTest-human_aging|5": {
      "hash_examples": "1a7199dc733e779b",
      "hash_full_prompts": "756b9096b8eaf892",
      "hash_input_tokens": "151f31a573d81257",
      "hash_cont_tokens": "142a4a8a1138a214"
    },
    "harness|hendrycksTest-human_sexuality|5": {
      "hash_examples": "7acb8fdad97f88a6",
      "hash_full_prompts": "731a52ff15b8cfdb",
      "hash_input_tokens": "b77763767fb18cc4",
      "hash_cont_tokens": "bc54813e809b796d"
    },
    "harness|hendrycksTest-international_law|5": {
      "hash_examples": "1300bfd0dfc59114",
      "hash_full_prompts": "db2aefbff5eec996",
      "hash_input_tokens": "a4e52c47400b8bca",
      "hash_cont_tokens": "8ea8c5ff76a15bca"
    },
    "harness|hendrycksTest-jurisprudence|5": {
      "hash_examples": "083b1e4904c48dc2",
      "hash_full_prompts": "0f89ee3fe03d6a21",
      "hash_input_tokens": "69644001a800b0f7",
      "hash_cont_tokens": "e3a8cd951b6e3469"
    },
    "harness|hendrycksTest-logical_fallacies|5": {
      "hash_examples": "709128f9926a634c",
      "hash_full_prompts": "98a04b1f8f841069",
      "hash_input_tokens": "332ca144a888ad7f",
      "hash_cont_tokens": "3e9e0bdc248fd88a"
    },
    "harness|hendrycksTest-machine_learning|5": {
      "hash_examples": "88f22a636029ae47",
      "hash_full_prompts": "2e1c8d4b1e0cc921",
      "hash_input_tokens": "a27f6dd3c2837ded",
      "hash_cont_tokens": "55b12fb138c6a064"
    },
    "harness|hendrycksTest-management|5": {
      "hash_examples": "8c8a1e07a2151dca",
      "hash_full_prompts": "f51611f514b265b0",
      "hash_input_tokens": "9f72696f5f9c4c80",
      "hash_cont_tokens": "a01d6d39a83c4597"
    },
    "harness|hendrycksTest-marketing|5": {
      "hash_examples": "2668953431f91e96",
      "hash_full_prompts": "77562bef997c7650",
      "hash_input_tokens": "0d9707022133f086",
      "hash_cont_tokens": "6aeaed4d823c98aa"
    },
    "harness|hendrycksTest-medical_genetics|5": {
      "hash_examples": "9c2dda34a2ea4fd2",
      "hash_full_prompts": "202139046daa118f",
      "hash_input_tokens": "e957962a583e58a2",
      "hash_cont_tokens": "50421e30bef398f9"
    },
    "harness|hendrycksTest-miscellaneous|5": {
      "hash_examples": "41adb694024809c2",
      "hash_full_prompts": "bffec9fc237bcf93",
      "hash_input_tokens": "46fe4585062aa36a",
      "hash_cont_tokens": "9b0ab02a64603081"
    },
    "harness|hendrycksTest-moral_disputes|5": {
      "hash_examples": "3171c13ba3c594c4",
      "hash_full_prompts": "170831fc36f1d59e",
      "hash_input_tokens": "cf9834b2c07721dc",
      "hash_cont_tokens": "3b8bbe9108e55ce9"
    },
    "harness|hendrycksTest-moral_scenarios|5": {
      "hash_examples": "9873e077e83e0546",
      "hash_full_prompts": "08f4ceba3131a068",
      "hash_input_tokens": "f257b7cce9ddb541",
      "hash_cont_tokens": "3e9bfc0362e97330"
    },
    "harness|hendrycksTest-nutrition|5": {
      "hash_examples": "7db1d8142ec14323",
      "hash_full_prompts": "4c0e68e3586cb453",
      "hash_input_tokens": "8650a7e901b42458",
      "hash_cont_tokens": "23b2dc6ee2da4cfc"
    },
    "harness|hendrycksTest-philosophy|5": {
      "hash_examples": "9b455b7d72811cc8",
      "hash_full_prompts": "e467f822d8a0d3ff",
      "hash_input_tokens": "4ba4c1d13e1040ec",
      "hash_cont_tokens": "9f6ff69d23a48783"
    },
    "harness|hendrycksTest-prehistory|5": {
      "hash_examples": "8be90d0f538f1560",
      "hash_full_prompts": "152187949bcd0921",
      "hash_input_tokens": "7431d7b2d5c13409",
      "hash_cont_tokens": "d6458d743d875837"
    },
    "harness|hendrycksTest-professional_accounting|5": {
      "hash_examples": "8d377597916cd07e",
      "hash_full_prompts": "0eb7345d6144ee0d",
      "hash_input_tokens": "e7bbb4a15e991424",
      "hash_cont_tokens": "922a195f53a35662"
    },
    "harness|hendrycksTest-professional_law|5": {
      "hash_examples": "cd9dbc52b3c932d6",
      "hash_full_prompts": "36ac764272bfb182",
      "hash_input_tokens": "9178e10bd0763ec4",
      "hash_cont_tokens": "2e590029ef41fbcd"
    },
    "harness|hendrycksTest-professional_medicine|5": {
      "hash_examples": "b20e4e816c1e383e",
      "hash_full_prompts": "7b8d69ea2acaf2f7",
      "hash_input_tokens": "f5a22012a54f70ea",
      "hash_cont_tokens": "7cfee54dbddd5a98"
    },
    "harness|hendrycksTest-professional_psychology|5": {
      "hash_examples": "d45b73b22f9cc039",
      "hash_full_prompts": "fe8937e9ffc99771",
      "hash_input_tokens": "8eeb91b3a7cbea0a",
      "hash_cont_tokens": "a86677b2a45c20e1"
    },
    "harness|hendrycksTest-public_relations|5": {
      "hash_examples": "0d25072e1761652a",
      "hash_full_prompts": "f9adc39cfa9f42ba",
      "hash_input_tokens": "bdfc559a40a1e8ec",
      "hash_cont_tokens": "0d756ccaae031757"
    },
    "harness|hendrycksTest-security_studies|5": {
      "hash_examples": "62bb8197e63d60d4",
      "hash_full_prompts": "869c9c3ae196b7c3",
      "hash_input_tokens": "d49711415961ced7",
      "hash_cont_tokens": "b2229bc2cfbf594b"
    },
    "harness|hendrycksTest-sociology|5": {
      "hash_examples": "e7959df87dea8672",
      "hash_full_prompts": "1a1fc00e17b3a52a",
      "hash_input_tokens": "f9a00c6fc5e9cea7",
      "hash_cont_tokens": "c3a3bdfd177eed5b"
    },
    "harness|hendrycksTest-us_foreign_policy|5": {
      "hash_examples": "4a56a01ddca44dca",
      "hash_full_prompts": "0c7a7081c71c07b6",
      "hash_input_tokens": "647f2d7d9075afaa",
      "hash_cont_tokens": "50421e30bef398f9"
    },
    "harness|hendrycksTest-virology|5": {
      "hash_examples": "451cc86a8c4f4fe9",
      "hash_full_prompts": "01e95325d8b738e4",
      "hash_input_tokens": "784f75f0ad6e0698",
      "hash_cont_tokens": "af8b3658088cb37f"
    },
    "harness|hendrycksTest-world_religions|5": {
      "hash_examples": "3b29cfaf1a81c379",
      "hash_full_prompts": "e0d79a15083dfdff",
      "hash_input_tokens": "17766ebe38853371",
      "hash_cont_tokens": "060118bef6de4e0a"
    }
  }
}