{
"results": {
"copa": {
"acc": 0.74,
"acc_stderr": 0.04408440022768078
},
"piqa": {
"acc": 0.7149075081610446,
"acc_stderr": 0.010533270588738935,
"acc_norm": 0.7116430903155604,
"acc_norm_stderr": 0.010569190399220661
},
"rte": {
"acc": 0.5342960288808665,
"acc_stderr": 0.03002557981936643
},
"winogrande": {
"acc": 0.531965272296764,
"acc_stderr": 0.01402373922116638
},
"hendrycksTest-abstract_algebra": {
"acc": 0.23,
"acc_stderr": 0.04229525846816508,
"acc_norm": 0.23,
"acc_norm_stderr": 0.04229525846816506
},
"hendrycksTest-anatomy": {
"acc": 0.2074074074074074,
"acc_stderr": 0.03502553170678316,
"acc_norm": 0.2,
"acc_norm_stderr": 0.03455473702325436
},
"hendrycksTest-astronomy": {
"acc": 0.2565789473684211,
"acc_stderr": 0.03554180368025689,
"acc_norm": 0.3026315789473684,
"acc_norm_stderr": 0.0373852067611967
},
"hendrycksTest-business_ethics": {
"acc": 0.38,
"acc_stderr": 0.04878317312145633,
"acc_norm": 0.32,
"acc_norm_stderr": 0.046882617226215034
},
"hendrycksTest-clinical_knowledge": {
"acc": 0.22641509433962265,
"acc_stderr": 0.025757559893106758,
"acc_norm": 0.3132075471698113,
"acc_norm_stderr": 0.02854479331905533
},
"hendrycksTest-college_biology": {
"acc": 0.2361111111111111,
"acc_stderr": 0.03551446610810826,
"acc_norm": 0.2222222222222222,
"acc_norm_stderr": 0.03476590104304134
},
"hendrycksTest-college_chemistry": {
"acc": 0.28,
"acc_stderr": 0.04512608598542128,
"acc_norm": 0.34,
"acc_norm_stderr": 0.04760952285695236
},
"hendrycksTest-college_computer_science": {
"acc": 0.32,
"acc_stderr": 0.046882617226215034,
"acc_norm": 0.3,
"acc_norm_stderr": 0.046056618647183814
},
"hendrycksTest-college_mathematics": {
"acc": 0.19,
"acc_stderr": 0.03942772444036624,
"acc_norm": 0.28,
"acc_norm_stderr": 0.045126085985421296
},
"hendrycksTest-college_medicine": {
"acc": 0.2254335260115607,
"acc_stderr": 0.031862098516411454,
"acc_norm": 0.20809248554913296,
"acc_norm_stderr": 0.030952890217749895
},
"hendrycksTest-college_physics": {
"acc": 0.18627450980392157,
"acc_stderr": 0.038739587141493524,
"acc_norm": 0.24509803921568626,
"acc_norm_stderr": 0.04280105837364396
},
"hendrycksTest-computer_security": {
"acc": 0.25,
"acc_stderr": 0.04351941398892446,
"acc_norm": 0.33,
"acc_norm_stderr": 0.047258156262526045
},
"hendrycksTest-conceptual_physics": {
"acc": 0.2553191489361702,
"acc_stderr": 0.028504856470514192,
"acc_norm": 0.2170212765957447,
"acc_norm_stderr": 0.02694748312149622
},
"hendrycksTest-econometrics": {
"acc": 0.30701754385964913,
"acc_stderr": 0.0433913832257986,
"acc_norm": 0.23684210526315788,
"acc_norm_stderr": 0.03999423879281336
},
"hendrycksTest-electrical_engineering": {
"acc": 0.2827586206896552,
"acc_stderr": 0.037528339580033376,
"acc_norm": 0.2827586206896552,
"acc_norm_stderr": 0.037528339580033376
},
"hendrycksTest-elementary_mathematics": {
"acc": 0.2222222222222222,
"acc_stderr": 0.021411684393694185,
"acc_norm": 0.25132275132275134,
"acc_norm_stderr": 0.022340482339643895
},
"hendrycksTest-formal_logic": {
"acc": 0.3333333333333333,
"acc_stderr": 0.042163702135578345,
"acc_norm": 0.30158730158730157,
"acc_norm_stderr": 0.04104947269903394
},
"hendrycksTest-global_facts": {
"acc": 0.22,
"acc_stderr": 0.041633319989322674,
"acc_norm": 0.2,
"acc_norm_stderr": 0.04020151261036846
},
"hendrycksTest-high_school_biology": {
"acc": 0.23225806451612904,
"acc_stderr": 0.024022256130308235,
"acc_norm": 0.2838709677419355,
"acc_norm_stderr": 0.02564938106302926
},
"hendrycksTest-high_school_chemistry": {
"acc": 0.2019704433497537,
"acc_stderr": 0.028247350122180267,
"acc_norm": 0.270935960591133,
"acc_norm_stderr": 0.031270907132976984
},
"hendrycksTest-high_school_computer_science": {
"acc": 0.2,
"acc_stderr": 0.04020151261036843,
"acc_norm": 0.24,
"acc_norm_stderr": 0.042923469599092816
},
"hendrycksTest-high_school_european_history": {
"acc": 0.21212121212121213,
"acc_stderr": 0.031922715695482974,
"acc_norm": 0.2787878787878788,
"acc_norm_stderr": 0.03501438706296781
},
"hendrycksTest-high_school_geography": {
"acc": 0.23737373737373738,
"acc_stderr": 0.0303137105381989,
"acc_norm": 0.30808080808080807,
"acc_norm_stderr": 0.032894773300986155
},
"hendrycksTest-high_school_government_and_politics": {
"acc": 0.24352331606217617,
"acc_stderr": 0.03097543638684543,
"acc_norm": 0.27461139896373055,
"acc_norm_stderr": 0.03221024508041154
},
"hendrycksTest-high_school_macroeconomics": {
"acc": 0.26153846153846155,
"acc_stderr": 0.022282141204204426,
"acc_norm": 0.26666666666666666,
"acc_norm_stderr": 0.022421273612923707
},
"hendrycksTest-high_school_mathematics": {
"acc": 0.2111111111111111,
"acc_stderr": 0.02488211685765507,
"acc_norm": 0.29259259259259257,
"acc_norm_stderr": 0.02773896963217609
},
"hendrycksTest-high_school_microeconomics": {
"acc": 0.24789915966386555,
"acc_stderr": 0.028047967224176892,
"acc_norm": 0.3277310924369748,
"acc_norm_stderr": 0.030489911417673227
},
"hendrycksTest-high_school_physics": {
"acc": 0.2185430463576159,
"acc_stderr": 0.03374235550425694,
"acc_norm": 0.23178807947019867,
"acc_norm_stderr": 0.03445406271987053
},
"hendrycksTest-high_school_psychology": {
"acc": 0.21467889908256882,
"acc_stderr": 0.01760430414925649,
"acc_norm": 0.21467889908256882,
"acc_norm_stderr": 0.017604304149256483
},
"hendrycksTest-high_school_statistics": {
"acc": 0.25925925925925924,
"acc_stderr": 0.02988691054762696,
"acc_norm": 0.3101851851851852,
"acc_norm_stderr": 0.0315469628565663
},
"hendrycksTest-high_school_us_history": {
"acc": 0.25,
"acc_stderr": 0.03039153369274154,
"acc_norm": 0.2549019607843137,
"acc_norm_stderr": 0.030587591351604246
},
"hendrycksTest-high_school_world_history": {
"acc": 0.25738396624472576,
"acc_stderr": 0.0284588209914603,
"acc_norm": 0.25316455696202533,
"acc_norm_stderr": 0.02830465794303529
},
"hendrycksTest-human_aging": {
"acc": 0.3452914798206278,
"acc_stderr": 0.031911001928357954,
"acc_norm": 0.24663677130044842,
"acc_norm_stderr": 0.028930413120910874
},
"hendrycksTest-human_sexuality": {
"acc": 0.366412213740458,
"acc_stderr": 0.04225875451969638,
"acc_norm": 0.3053435114503817,
"acc_norm_stderr": 0.04039314978724562
},
"hendrycksTest-international_law": {
"acc": 0.2396694214876033,
"acc_stderr": 0.03896878985070417,
"acc_norm": 0.48760330578512395,
"acc_norm_stderr": 0.04562951548180765
},
"hendrycksTest-jurisprudence": {
"acc": 0.25,
"acc_stderr": 0.04186091791394607,
"acc_norm": 0.37037037037037035,
"acc_norm_stderr": 0.04668408033024931
},
"hendrycksTest-logical_fallacies": {
"acc": 0.2147239263803681,
"acc_stderr": 0.03226219377286774,
"acc_norm": 0.294478527607362,
"acc_norm_stderr": 0.03581165790474082
},
"hendrycksTest-machine_learning": {
"acc": 0.21428571428571427,
"acc_stderr": 0.038946411200447915,
"acc_norm": 0.23214285714285715,
"acc_norm_stderr": 0.040073418097558045
},
"hendrycksTest-management": {
"acc": 0.23300970873786409,
"acc_stderr": 0.04185832598928315,
"acc_norm": 0.32038834951456313,
"acc_norm_stderr": 0.04620284082280039
},
"hendrycksTest-marketing": {
"acc": 0.2905982905982906,
"acc_stderr": 0.029745048572674047,
"acc_norm": 0.3076923076923077,
"acc_norm_stderr": 0.030236389942173092
},
"hendrycksTest-medical_genetics": {
"acc": 0.28,
"acc_stderr": 0.045126085985421255,
"acc_norm": 0.36,
"acc_norm_stderr": 0.048241815132442176
},
"hendrycksTest-miscellaneous": {
"acc": 0.2707535121328225,
"acc_stderr": 0.01588988836256049,
"acc_norm": 0.2669220945083014,
"acc_norm_stderr": 0.015818450894777562
},
"hendrycksTest-moral_disputes": {
"acc": 0.2745664739884393,
"acc_stderr": 0.02402774515526502,
"acc_norm": 0.315028901734104,
"acc_norm_stderr": 0.025009313790069706
},
"hendrycksTest-moral_scenarios": {
"acc": 0.22346368715083798,
"acc_stderr": 0.01393206863857977,
"acc_norm": 0.27150837988826815,
"acc_norm_stderr": 0.014874252168095273
},
"hendrycksTest-nutrition": {
"acc": 0.3137254901960784,
"acc_stderr": 0.026568921015457155,
"acc_norm": 0.3888888888888889,
"acc_norm_stderr": 0.027914055510467998
},
"hendrycksTest-philosophy": {
"acc": 0.24437299035369775,
"acc_stderr": 0.024406162094668907,
"acc_norm": 0.3086816720257235,
"acc_norm_stderr": 0.026236965881153252
},
"hendrycksTest-prehistory": {
"acc": 0.23148148148148148,
"acc_stderr": 0.023468429832451152,
"acc_norm": 0.20987654320987653,
"acc_norm_stderr": 0.022658344085981375
},
"hendrycksTest-professional_accounting": {
"acc": 0.24113475177304963,
"acc_stderr": 0.025518731049537762,
"acc_norm": 0.26595744680851063,
"acc_norm_stderr": 0.026358065698880585
},
"hendrycksTest-professional_law": {
"acc": 0.25554106910039115,
"acc_stderr": 0.011139857833598514,
"acc_norm": 0.2900912646675359,
"acc_norm_stderr": 0.011590375554733095
},
"hendrycksTest-professional_medicine": {
"acc": 0.25735294117647056,
"acc_stderr": 0.02655651947004151,
"acc_norm": 0.23161764705882354,
"acc_norm_stderr": 0.025626533803777565
},
"hendrycksTest-professional_psychology": {
"acc": 0.2565359477124183,
"acc_stderr": 0.017667841612379002,
"acc_norm": 0.2565359477124183,
"acc_norm_stderr": 0.01766784161237899
},
"hendrycksTest-public_relations": {
"acc": 0.24545454545454545,
"acc_stderr": 0.041220665028782834,
"acc_norm": 0.2,
"acc_norm_stderr": 0.03831305140884603
},
"hendrycksTest-security_studies": {
"acc": 0.39183673469387753,
"acc_stderr": 0.031251275910891656,
"acc_norm": 0.2938775510204082,
"acc_norm_stderr": 0.029162738410249776
},
"hendrycksTest-sociology": {
"acc": 0.263681592039801,
"acc_stderr": 0.031157150869355558,
"acc_norm": 0.23880597014925373,
"acc_norm_stderr": 0.03014777593540922
},
"hendrycksTest-us_foreign_policy": {
"acc": 0.36,
"acc_stderr": 0.04824181513244218,
"acc_norm": 0.36,
"acc_norm_stderr": 0.04824181513244218
},
"hendrycksTest-virology": {
"acc": 0.3072289156626506,
"acc_stderr": 0.035915667978246635,
"acc_norm": 0.2891566265060241,
"acc_norm_stderr": 0.035294868015111155
},
"hendrycksTest-world_religions": {
"acc": 0.2982456140350877,
"acc_stderr": 0.035087719298245654,
"acc_norm": 0.3684210526315789,
"acc_norm_stderr": 0.036996580176568775
}
},
"versions": {
"copa": 0,
"piqa": 0,
"rte": 0,
"winogrande": 0,
"hendrycksTest-abstract_algebra": 0,
"hendrycksTest-anatomy": 0,
"hendrycksTest-astronomy": 0,
"hendrycksTest-business_ethics": 0,
"hendrycksTest-clinical_knowledge": 0,
"hendrycksTest-college_biology": 0,
"hendrycksTest-college_chemistry": 0,
"hendrycksTest-college_computer_science": 0,
"hendrycksTest-college_mathematics": 0,
"hendrycksTest-college_medicine": 0,
"hendrycksTest-college_physics": 0,
"hendrycksTest-computer_security": 0,
"hendrycksTest-conceptual_physics": 0,
"hendrycksTest-econometrics": 0,
"hendrycksTest-electrical_engineering": 0,
"hendrycksTest-elementary_mathematics": 0,
"hendrycksTest-formal_logic": 0,
"hendrycksTest-global_facts": 0,
"hendrycksTest-high_school_biology": 0,
"hendrycksTest-high_school_chemistry": 0,
"hendrycksTest-high_school_computer_science": 0,
"hendrycksTest-high_school_european_history": 0,
"hendrycksTest-high_school_geography": 0,
"hendrycksTest-high_school_government_and_politics": 0,
"hendrycksTest-high_school_macroeconomics": 0,
"hendrycksTest-high_school_mathematics": 0,
"hendrycksTest-high_school_microeconomics": 0,
"hendrycksTest-high_school_physics": 0,
"hendrycksTest-high_school_psychology": 0,
"hendrycksTest-high_school_statistics": 0,
"hendrycksTest-high_school_us_history": 0,
"hendrycksTest-high_school_world_history": 0,
"hendrycksTest-human_aging": 0,
"hendrycksTest-human_sexuality": 0,
"hendrycksTest-international_law": 0,
"hendrycksTest-jurisprudence": 0,
"hendrycksTest-logical_fallacies": 0,
"hendrycksTest-machine_learning": 0,
"hendrycksTest-management": 0,
"hendrycksTest-marketing": 0,
"hendrycksTest-medical_genetics": 0,
"hendrycksTest-miscellaneous": 0,
"hendrycksTest-moral_disputes": 0,
"hendrycksTest-moral_scenarios": 0,
"hendrycksTest-nutrition": 0,
"hendrycksTest-philosophy": 0,
"hendrycksTest-prehistory": 0,
"hendrycksTest-professional_accounting": 0,
"hendrycksTest-professional_law": 0,
"hendrycksTest-professional_medicine": 0,
"hendrycksTest-professional_psychology": 0,
"hendrycksTest-public_relations": 0,
"hendrycksTest-security_studies": 0,
"hendrycksTest-sociology": 0,
"hendrycksTest-us_foreign_policy": 0,
"hendrycksTest-virology": 0,
"hendrycksTest-world_religions": 0
}
}