lm1-misc-oscar / 3b977b77b /evaluation /lm1-3b9-77b-oscar-results_lm-eval_global_step73814_2022-12-03-13-13-55.json
Muennighoff's picture
Add
8ec84fa
raw
history blame
16.6 kB
{
"results": {
"copa": {
"acc": 0.74,
"acc_stderr": 0.04408440022768078
},
"piqa": {
"acc": 0.7149075081610446,
"acc_stderr": 0.010533270588738935,
"acc_norm": 0.7116430903155604,
"acc_norm_stderr": 0.010569190399220661
},
"rte": {
"acc": 0.5342960288808665,
"acc_stderr": 0.03002557981936643
},
"winogrande": {
"acc": 0.531965272296764,
"acc_stderr": 0.01402373922116638
},
"hendrycksTest-abstract_algebra": {
"acc": 0.23,
"acc_stderr": 0.04229525846816508,
"acc_norm": 0.23,
"acc_norm_stderr": 0.04229525846816506
},
"hendrycksTest-anatomy": {
"acc": 0.2074074074074074,
"acc_stderr": 0.03502553170678316,
"acc_norm": 0.2,
"acc_norm_stderr": 0.03455473702325436
},
"hendrycksTest-astronomy": {
"acc": 0.2565789473684211,
"acc_stderr": 0.03554180368025689,
"acc_norm": 0.3026315789473684,
"acc_norm_stderr": 0.0373852067611967
},
"hendrycksTest-business_ethics": {
"acc": 0.38,
"acc_stderr": 0.04878317312145633,
"acc_norm": 0.32,
"acc_norm_stderr": 0.046882617226215034
},
"hendrycksTest-clinical_knowledge": {
"acc": 0.22641509433962265,
"acc_stderr": 0.025757559893106758,
"acc_norm": 0.3132075471698113,
"acc_norm_stderr": 0.02854479331905533
},
"hendrycksTest-college_biology": {
"acc": 0.2361111111111111,
"acc_stderr": 0.03551446610810826,
"acc_norm": 0.2222222222222222,
"acc_norm_stderr": 0.03476590104304134
},
"hendrycksTest-college_chemistry": {
"acc": 0.28,
"acc_stderr": 0.04512608598542128,
"acc_norm": 0.34,
"acc_norm_stderr": 0.04760952285695236
},
"hendrycksTest-college_computer_science": {
"acc": 0.32,
"acc_stderr": 0.046882617226215034,
"acc_norm": 0.3,
"acc_norm_stderr": 0.046056618647183814
},
"hendrycksTest-college_mathematics": {
"acc": 0.19,
"acc_stderr": 0.03942772444036624,
"acc_norm": 0.28,
"acc_norm_stderr": 0.045126085985421296
},
"hendrycksTest-college_medicine": {
"acc": 0.2254335260115607,
"acc_stderr": 0.031862098516411454,
"acc_norm": 0.20809248554913296,
"acc_norm_stderr": 0.030952890217749895
},
"hendrycksTest-college_physics": {
"acc": 0.18627450980392157,
"acc_stderr": 0.038739587141493524,
"acc_norm": 0.24509803921568626,
"acc_norm_stderr": 0.04280105837364396
},
"hendrycksTest-computer_security": {
"acc": 0.25,
"acc_stderr": 0.04351941398892446,
"acc_norm": 0.33,
"acc_norm_stderr": 0.047258156262526045
},
"hendrycksTest-conceptual_physics": {
"acc": 0.2553191489361702,
"acc_stderr": 0.028504856470514192,
"acc_norm": 0.2170212765957447,
"acc_norm_stderr": 0.02694748312149622
},
"hendrycksTest-econometrics": {
"acc": 0.30701754385964913,
"acc_stderr": 0.0433913832257986,
"acc_norm": 0.23684210526315788,
"acc_norm_stderr": 0.03999423879281336
},
"hendrycksTest-electrical_engineering": {
"acc": 0.2827586206896552,
"acc_stderr": 0.037528339580033376,
"acc_norm": 0.2827586206896552,
"acc_norm_stderr": 0.037528339580033376
},
"hendrycksTest-elementary_mathematics": {
"acc": 0.2222222222222222,
"acc_stderr": 0.021411684393694185,
"acc_norm": 0.25132275132275134,
"acc_norm_stderr": 0.022340482339643895
},
"hendrycksTest-formal_logic": {
"acc": 0.3333333333333333,
"acc_stderr": 0.042163702135578345,
"acc_norm": 0.30158730158730157,
"acc_norm_stderr": 0.04104947269903394
},
"hendrycksTest-global_facts": {
"acc": 0.22,
"acc_stderr": 0.041633319989322674,
"acc_norm": 0.2,
"acc_norm_stderr": 0.04020151261036846
},
"hendrycksTest-high_school_biology": {
"acc": 0.23225806451612904,
"acc_stderr": 0.024022256130308235,
"acc_norm": 0.2838709677419355,
"acc_norm_stderr": 0.02564938106302926
},
"hendrycksTest-high_school_chemistry": {
"acc": 0.2019704433497537,
"acc_stderr": 0.028247350122180267,
"acc_norm": 0.270935960591133,
"acc_norm_stderr": 0.031270907132976984
},
"hendrycksTest-high_school_computer_science": {
"acc": 0.2,
"acc_stderr": 0.04020151261036843,
"acc_norm": 0.24,
"acc_norm_stderr": 0.042923469599092816
},
"hendrycksTest-high_school_european_history": {
"acc": 0.21212121212121213,
"acc_stderr": 0.031922715695482974,
"acc_norm": 0.2787878787878788,
"acc_norm_stderr": 0.03501438706296781
},
"hendrycksTest-high_school_geography": {
"acc": 0.23737373737373738,
"acc_stderr": 0.0303137105381989,
"acc_norm": 0.30808080808080807,
"acc_norm_stderr": 0.032894773300986155
},
"hendrycksTest-high_school_government_and_politics": {
"acc": 0.24352331606217617,
"acc_stderr": 0.03097543638684543,
"acc_norm": 0.27461139896373055,
"acc_norm_stderr": 0.03221024508041154
},
"hendrycksTest-high_school_macroeconomics": {
"acc": 0.26153846153846155,
"acc_stderr": 0.022282141204204426,
"acc_norm": 0.26666666666666666,
"acc_norm_stderr": 0.022421273612923707
},
"hendrycksTest-high_school_mathematics": {
"acc": 0.2111111111111111,
"acc_stderr": 0.02488211685765507,
"acc_norm": 0.29259259259259257,
"acc_norm_stderr": 0.02773896963217609
},
"hendrycksTest-high_school_microeconomics": {
"acc": 0.24789915966386555,
"acc_stderr": 0.028047967224176892,
"acc_norm": 0.3277310924369748,
"acc_norm_stderr": 0.030489911417673227
},
"hendrycksTest-high_school_physics": {
"acc": 0.2185430463576159,
"acc_stderr": 0.03374235550425694,
"acc_norm": 0.23178807947019867,
"acc_norm_stderr": 0.03445406271987053
},
"hendrycksTest-high_school_psychology": {
"acc": 0.21467889908256882,
"acc_stderr": 0.01760430414925649,
"acc_norm": 0.21467889908256882,
"acc_norm_stderr": 0.017604304149256483
},
"hendrycksTest-high_school_statistics": {
"acc": 0.25925925925925924,
"acc_stderr": 0.02988691054762696,
"acc_norm": 0.3101851851851852,
"acc_norm_stderr": 0.0315469628565663
},
"hendrycksTest-high_school_us_history": {
"acc": 0.25,
"acc_stderr": 0.03039153369274154,
"acc_norm": 0.2549019607843137,
"acc_norm_stderr": 0.030587591351604246
},
"hendrycksTest-high_school_world_history": {
"acc": 0.25738396624472576,
"acc_stderr": 0.0284588209914603,
"acc_norm": 0.25316455696202533,
"acc_norm_stderr": 0.02830465794303529
},
"hendrycksTest-human_aging": {
"acc": 0.3452914798206278,
"acc_stderr": 0.031911001928357954,
"acc_norm": 0.24663677130044842,
"acc_norm_stderr": 0.028930413120910874
},
"hendrycksTest-human_sexuality": {
"acc": 0.366412213740458,
"acc_stderr": 0.04225875451969638,
"acc_norm": 0.3053435114503817,
"acc_norm_stderr": 0.04039314978724562
},
"hendrycksTest-international_law": {
"acc": 0.2396694214876033,
"acc_stderr": 0.03896878985070417,
"acc_norm": 0.48760330578512395,
"acc_norm_stderr": 0.04562951548180765
},
"hendrycksTest-jurisprudence": {
"acc": 0.25,
"acc_stderr": 0.04186091791394607,
"acc_norm": 0.37037037037037035,
"acc_norm_stderr": 0.04668408033024931
},
"hendrycksTest-logical_fallacies": {
"acc": 0.2147239263803681,
"acc_stderr": 0.03226219377286774,
"acc_norm": 0.294478527607362,
"acc_norm_stderr": 0.03581165790474082
},
"hendrycksTest-machine_learning": {
"acc": 0.21428571428571427,
"acc_stderr": 0.038946411200447915,
"acc_norm": 0.23214285714285715,
"acc_norm_stderr": 0.040073418097558045
},
"hendrycksTest-management": {
"acc": 0.23300970873786409,
"acc_stderr": 0.04185832598928315,
"acc_norm": 0.32038834951456313,
"acc_norm_stderr": 0.04620284082280039
},
"hendrycksTest-marketing": {
"acc": 0.2905982905982906,
"acc_stderr": 0.029745048572674047,
"acc_norm": 0.3076923076923077,
"acc_norm_stderr": 0.030236389942173092
},
"hendrycksTest-medical_genetics": {
"acc": 0.28,
"acc_stderr": 0.045126085985421255,
"acc_norm": 0.36,
"acc_norm_stderr": 0.048241815132442176
},
"hendrycksTest-miscellaneous": {
"acc": 0.2707535121328225,
"acc_stderr": 0.01588988836256049,
"acc_norm": 0.2669220945083014,
"acc_norm_stderr": 0.015818450894777562
},
"hendrycksTest-moral_disputes": {
"acc": 0.2745664739884393,
"acc_stderr": 0.02402774515526502,
"acc_norm": 0.315028901734104,
"acc_norm_stderr": 0.025009313790069706
},
"hendrycksTest-moral_scenarios": {
"acc": 0.22346368715083798,
"acc_stderr": 0.01393206863857977,
"acc_norm": 0.27150837988826815,
"acc_norm_stderr": 0.014874252168095273
},
"hendrycksTest-nutrition": {
"acc": 0.3137254901960784,
"acc_stderr": 0.026568921015457155,
"acc_norm": 0.3888888888888889,
"acc_norm_stderr": 0.027914055510467998
},
"hendrycksTest-philosophy": {
"acc": 0.24437299035369775,
"acc_stderr": 0.024406162094668907,
"acc_norm": 0.3086816720257235,
"acc_norm_stderr": 0.026236965881153252
},
"hendrycksTest-prehistory": {
"acc": 0.23148148148148148,
"acc_stderr": 0.023468429832451152,
"acc_norm": 0.20987654320987653,
"acc_norm_stderr": 0.022658344085981375
},
"hendrycksTest-professional_accounting": {
"acc": 0.24113475177304963,
"acc_stderr": 0.025518731049537762,
"acc_norm": 0.26595744680851063,
"acc_norm_stderr": 0.026358065698880585
},
"hendrycksTest-professional_law": {
"acc": 0.25554106910039115,
"acc_stderr": 0.011139857833598514,
"acc_norm": 0.2900912646675359,
"acc_norm_stderr": 0.011590375554733095
},
"hendrycksTest-professional_medicine": {
"acc": 0.25735294117647056,
"acc_stderr": 0.02655651947004151,
"acc_norm": 0.23161764705882354,
"acc_norm_stderr": 0.025626533803777565
},
"hendrycksTest-professional_psychology": {
"acc": 0.2565359477124183,
"acc_stderr": 0.017667841612379002,
"acc_norm": 0.2565359477124183,
"acc_norm_stderr": 0.01766784161237899
},
"hendrycksTest-public_relations": {
"acc": 0.24545454545454545,
"acc_stderr": 0.041220665028782834,
"acc_norm": 0.2,
"acc_norm_stderr": 0.03831305140884603
},
"hendrycksTest-security_studies": {
"acc": 0.39183673469387753,
"acc_stderr": 0.031251275910891656,
"acc_norm": 0.2938775510204082,
"acc_norm_stderr": 0.029162738410249776
},
"hendrycksTest-sociology": {
"acc": 0.263681592039801,
"acc_stderr": 0.031157150869355558,
"acc_norm": 0.23880597014925373,
"acc_norm_stderr": 0.03014777593540922
},
"hendrycksTest-us_foreign_policy": {
"acc": 0.36,
"acc_stderr": 0.04824181513244218,
"acc_norm": 0.36,
"acc_norm_stderr": 0.04824181513244218
},
"hendrycksTest-virology": {
"acc": 0.3072289156626506,
"acc_stderr": 0.035915667978246635,
"acc_norm": 0.2891566265060241,
"acc_norm_stderr": 0.035294868015111155
},
"hendrycksTest-world_religions": {
"acc": 0.2982456140350877,
"acc_stderr": 0.035087719298245654,
"acc_norm": 0.3684210526315789,
"acc_norm_stderr": 0.036996580176568775
}
},
"versions": {
"copa": 0,
"piqa": 0,
"rte": 0,
"winogrande": 0,
"hendrycksTest-abstract_algebra": 0,
"hendrycksTest-anatomy": 0,
"hendrycksTest-astronomy": 0,
"hendrycksTest-business_ethics": 0,
"hendrycksTest-clinical_knowledge": 0,
"hendrycksTest-college_biology": 0,
"hendrycksTest-college_chemistry": 0,
"hendrycksTest-college_computer_science": 0,
"hendrycksTest-college_mathematics": 0,
"hendrycksTest-college_medicine": 0,
"hendrycksTest-college_physics": 0,
"hendrycksTest-computer_security": 0,
"hendrycksTest-conceptual_physics": 0,
"hendrycksTest-econometrics": 0,
"hendrycksTest-electrical_engineering": 0,
"hendrycksTest-elementary_mathematics": 0,
"hendrycksTest-formal_logic": 0,
"hendrycksTest-global_facts": 0,
"hendrycksTest-high_school_biology": 0,
"hendrycksTest-high_school_chemistry": 0,
"hendrycksTest-high_school_computer_science": 0,
"hendrycksTest-high_school_european_history": 0,
"hendrycksTest-high_school_geography": 0,
"hendrycksTest-high_school_government_and_politics": 0,
"hendrycksTest-high_school_macroeconomics": 0,
"hendrycksTest-high_school_mathematics": 0,
"hendrycksTest-high_school_microeconomics": 0,
"hendrycksTest-high_school_physics": 0,
"hendrycksTest-high_school_psychology": 0,
"hendrycksTest-high_school_statistics": 0,
"hendrycksTest-high_school_us_history": 0,
"hendrycksTest-high_school_world_history": 0,
"hendrycksTest-human_aging": 0,
"hendrycksTest-human_sexuality": 0,
"hendrycksTest-international_law": 0,
"hendrycksTest-jurisprudence": 0,
"hendrycksTest-logical_fallacies": 0,
"hendrycksTest-machine_learning": 0,
"hendrycksTest-management": 0,
"hendrycksTest-marketing": 0,
"hendrycksTest-medical_genetics": 0,
"hendrycksTest-miscellaneous": 0,
"hendrycksTest-moral_disputes": 0,
"hendrycksTest-moral_scenarios": 0,
"hendrycksTest-nutrition": 0,
"hendrycksTest-philosophy": 0,
"hendrycksTest-prehistory": 0,
"hendrycksTest-professional_accounting": 0,
"hendrycksTest-professional_law": 0,
"hendrycksTest-professional_medicine": 0,
"hendrycksTest-professional_psychology": 0,
"hendrycksTest-public_relations": 0,
"hendrycksTest-security_studies": 0,
"hendrycksTest-sociology": 0,
"hendrycksTest-us_foreign_policy": 0,
"hendrycksTest-virology": 0,
"hendrycksTest-world_religions": 0
}
}