lm1-misc-oscar
/
3b977b77b
/evaluation
/lm1-3b9-77b-oscar-results_lm-eval_global_step73814_2022-12-03-13-13-55.csv
task,metric,value,err,version | |
copa,acc,0.74,0.04408440022768078,0 | |
hendrycksTest-abstract_algebra,acc,0.23,0.04229525846816508,0 | |
hendrycksTest-abstract_algebra,acc_norm,0.23,0.04229525846816506,0 | |
hendrycksTest-anatomy,acc,0.2074074074074074,0.03502553170678316,0 | |
hendrycksTest-anatomy,acc_norm,0.2,0.03455473702325436,0 | |
hendrycksTest-astronomy,acc,0.2565789473684211,0.03554180368025689,0 | |
hendrycksTest-astronomy,acc_norm,0.3026315789473684,0.0373852067611967,0 | |
hendrycksTest-business_ethics,acc,0.38,0.04878317312145633,0 | |
hendrycksTest-business_ethics,acc_norm,0.32,0.046882617226215034,0 | |
hendrycksTest-clinical_knowledge,acc,0.22641509433962265,0.025757559893106758,0 | |
hendrycksTest-clinical_knowledge,acc_norm,0.3132075471698113,0.02854479331905533,0 | |
hendrycksTest-college_biology,acc,0.2361111111111111,0.03551446610810826,0 | |
hendrycksTest-college_biology,acc_norm,0.2222222222222222,0.03476590104304134,0 | |
hendrycksTest-college_chemistry,acc,0.28,0.04512608598542128,0 | |
hendrycksTest-college_chemistry,acc_norm,0.34,0.04760952285695236,0 | |
hendrycksTest-college_computer_science,acc,0.32,0.046882617226215034,0 | |
hendrycksTest-college_computer_science,acc_norm,0.3,0.046056618647183814,0 | |
hendrycksTest-college_mathematics,acc,0.19,0.03942772444036624,0 | |
hendrycksTest-college_mathematics,acc_norm,0.28,0.045126085985421296,0 | |
hendrycksTest-college_medicine,acc,0.2254335260115607,0.031862098516411454,0 | |
hendrycksTest-college_medicine,acc_norm,0.20809248554913296,0.030952890217749895,0 | |
hendrycksTest-college_physics,acc,0.18627450980392157,0.038739587141493524,0 | |
hendrycksTest-college_physics,acc_norm,0.24509803921568626,0.04280105837364396,0 | |
hendrycksTest-computer_security,acc,0.25,0.04351941398892446,0 | |
hendrycksTest-computer_security,acc_norm,0.33,0.047258156262526045,0 | |
hendrycksTest-conceptual_physics,acc,0.2553191489361702,0.028504856470514192,0 | |
hendrycksTest-conceptual_physics,acc_norm,0.2170212765957447,0.02694748312149622,0 | |
hendrycksTest-econometrics,acc,0.30701754385964913,0.0433913832257986,0 | |
hendrycksTest-econometrics,acc_norm,0.23684210526315788,0.03999423879281336,0 | |
hendrycksTest-electrical_engineering,acc,0.2827586206896552,0.037528339580033376,0 | |
hendrycksTest-electrical_engineering,acc_norm,0.2827586206896552,0.037528339580033376,0 | |
hendrycksTest-elementary_mathematics,acc,0.2222222222222222,0.021411684393694185,0 | |
hendrycksTest-elementary_mathematics,acc_norm,0.25132275132275134,0.022340482339643895,0 | |
hendrycksTest-formal_logic,acc,0.3333333333333333,0.042163702135578345,0 | |
hendrycksTest-formal_logic,acc_norm,0.30158730158730157,0.04104947269903394,0 | |
hendrycksTest-global_facts,acc,0.22,0.041633319989322674,0 | |
hendrycksTest-global_facts,acc_norm,0.2,0.04020151261036846,0 | |
hendrycksTest-high_school_biology,acc,0.23225806451612904,0.024022256130308235,0 | |
hendrycksTest-high_school_biology,acc_norm,0.2838709677419355,0.02564938106302926,0 | |
hendrycksTest-high_school_chemistry,acc,0.2019704433497537,0.028247350122180267,0 | |
hendrycksTest-high_school_chemistry,acc_norm,0.270935960591133,0.031270907132976984,0 | |
hendrycksTest-high_school_computer_science,acc,0.2,0.04020151261036843,0 | |
hendrycksTest-high_school_computer_science,acc_norm,0.24,0.042923469599092816,0 | |
hendrycksTest-high_school_european_history,acc,0.21212121212121213,0.031922715695482974,0 | |
hendrycksTest-high_school_european_history,acc_norm,0.2787878787878788,0.03501438706296781,0 | |
hendrycksTest-high_school_geography,acc,0.23737373737373738,0.0303137105381989,0 | |
hendrycksTest-high_school_geography,acc_norm,0.30808080808080807,0.032894773300986155,0 | |
hendrycksTest-high_school_government_and_politics,acc,0.24352331606217617,0.03097543638684543,0 | |
hendrycksTest-high_school_government_and_politics,acc_norm,0.27461139896373055,0.03221024508041154,0 | |
hendrycksTest-high_school_macroeconomics,acc,0.26153846153846155,0.022282141204204426,0 | |
hendrycksTest-high_school_macroeconomics,acc_norm,0.26666666666666666,0.022421273612923707,0 | |
hendrycksTest-high_school_mathematics,acc,0.2111111111111111,0.02488211685765507,0 | |
hendrycksTest-high_school_mathematics,acc_norm,0.29259259259259257,0.02773896963217609,0 | |
hendrycksTest-high_school_microeconomics,acc,0.24789915966386555,0.028047967224176892,0 | |
hendrycksTest-high_school_microeconomics,acc_norm,0.3277310924369748,0.030489911417673227,0 | |
hendrycksTest-high_school_physics,acc,0.2185430463576159,0.03374235550425694,0 | |
hendrycksTest-high_school_physics,acc_norm,0.23178807947019867,0.03445406271987053,0 | |
hendrycksTest-high_school_psychology,acc,0.21467889908256882,0.01760430414925649,0 | |
hendrycksTest-high_school_psychology,acc_norm,0.21467889908256882,0.017604304149256483,0 | |
hendrycksTest-high_school_statistics,acc,0.25925925925925924,0.02988691054762696,0 | |
hendrycksTest-high_school_statistics,acc_norm,0.3101851851851852,0.0315469628565663,0 | |
hendrycksTest-high_school_us_history,acc,0.25,0.03039153369274154,0 | |
hendrycksTest-high_school_us_history,acc_norm,0.2549019607843137,0.030587591351604246,0 | |
hendrycksTest-high_school_world_history,acc,0.25738396624472576,0.0284588209914603,0 | |
hendrycksTest-high_school_world_history,acc_norm,0.25316455696202533,0.02830465794303529,0 | |
hendrycksTest-human_aging,acc,0.3452914798206278,0.031911001928357954,0 | |
hendrycksTest-human_aging,acc_norm,0.24663677130044842,0.028930413120910874,0 | |
hendrycksTest-human_sexuality,acc,0.366412213740458,0.04225875451969638,0 | |
hendrycksTest-human_sexuality,acc_norm,0.3053435114503817,0.04039314978724562,0 | |
hendrycksTest-international_law,acc,0.2396694214876033,0.03896878985070417,0 | |
hendrycksTest-international_law,acc_norm,0.48760330578512395,0.04562951548180765,0 | |
hendrycksTest-jurisprudence,acc,0.25,0.04186091791394607,0 | |
hendrycksTest-jurisprudence,acc_norm,0.37037037037037035,0.04668408033024931,0 | |
hendrycksTest-logical_fallacies,acc,0.2147239263803681,0.03226219377286774,0 | |
hendrycksTest-logical_fallacies,acc_norm,0.294478527607362,0.03581165790474082,0 | |
hendrycksTest-machine_learning,acc,0.21428571428571427,0.038946411200447915,0 | |
hendrycksTest-machine_learning,acc_norm,0.23214285714285715,0.040073418097558045,0 | |
hendrycksTest-management,acc,0.23300970873786409,0.04185832598928315,0 | |
hendrycksTest-management,acc_norm,0.32038834951456313,0.04620284082280039,0 | |
hendrycksTest-marketing,acc,0.2905982905982906,0.029745048572674047,0 | |
hendrycksTest-marketing,acc_norm,0.3076923076923077,0.030236389942173092,0 | |
hendrycksTest-medical_genetics,acc,0.28,0.045126085985421255,0 | |
hendrycksTest-medical_genetics,acc_norm,0.36,0.048241815132442176,0 | |
hendrycksTest-miscellaneous,acc,0.2707535121328225,0.01588988836256049,0 | |
hendrycksTest-miscellaneous,acc_norm,0.2669220945083014,0.015818450894777562,0 | |
hendrycksTest-moral_disputes,acc,0.2745664739884393,0.02402774515526502,0 | |
hendrycksTest-moral_disputes,acc_norm,0.315028901734104,0.025009313790069706,0 | |
hendrycksTest-moral_scenarios,acc,0.22346368715083798,0.01393206863857977,0 | |
hendrycksTest-moral_scenarios,acc_norm,0.27150837988826815,0.014874252168095273,0 | |
hendrycksTest-nutrition,acc,0.3137254901960784,0.026568921015457155,0 | |
hendrycksTest-nutrition,acc_norm,0.3888888888888889,0.027914055510467998,0 | |
hendrycksTest-philosophy,acc,0.24437299035369775,0.024406162094668907,0 | |
hendrycksTest-philosophy,acc_norm,0.3086816720257235,0.026236965881153252,0 | |
hendrycksTest-prehistory,acc,0.23148148148148148,0.023468429832451152,0 | |
hendrycksTest-prehistory,acc_norm,0.20987654320987653,0.022658344085981375,0 | |
hendrycksTest-professional_accounting,acc,0.24113475177304963,0.025518731049537762,0 | |
hendrycksTest-professional_accounting,acc_norm,0.26595744680851063,0.026358065698880585,0 | |
hendrycksTest-professional_law,acc,0.25554106910039115,0.011139857833598514,0 | |
hendrycksTest-professional_law,acc_norm,0.2900912646675359,0.011590375554733095,0 | |
hendrycksTest-professional_medicine,acc,0.25735294117647056,0.02655651947004151,0 | |
hendrycksTest-professional_medicine,acc_norm,0.23161764705882354,0.025626533803777565,0 | |
hendrycksTest-professional_psychology,acc,0.2565359477124183,0.017667841612379002,0 | |
hendrycksTest-professional_psychology,acc_norm,0.2565359477124183,0.01766784161237899,0 | |
hendrycksTest-public_relations,acc,0.24545454545454545,0.041220665028782834,0 | |
hendrycksTest-public_relations,acc_norm,0.2,0.03831305140884603,0 | |
hendrycksTest-security_studies,acc,0.39183673469387753,0.031251275910891656,0 | |
hendrycksTest-security_studies,acc_norm,0.2938775510204082,0.029162738410249776,0 | |
hendrycksTest-sociology,acc,0.263681592039801,0.031157150869355558,0 | |
hendrycksTest-sociology,acc_norm,0.23880597014925373,0.03014777593540922,0 | |
hendrycksTest-us_foreign_policy,acc,0.36,0.04824181513244218,0 | |
hendrycksTest-us_foreign_policy,acc_norm,0.36,0.04824181513244218,0 | |
hendrycksTest-virology,acc,0.3072289156626506,0.035915667978246635,0 | |
hendrycksTest-virology,acc_norm,0.2891566265060241,0.035294868015111155,0 | |
hendrycksTest-world_religions,acc,0.2982456140350877,0.035087719298245654,0 | |
hendrycksTest-world_religions,acc_norm,0.3684210526315789,0.036996580176568775,0 | |
piqa,acc,0.7149075081610446,0.010533270588738935,0 | |
piqa,acc_norm,0.7116430903155604,0.010569190399220661,0 | |
rte,acc,0.5342960288808665,0.03002557981936643,0 | |
winogrande,acc,0.531965272296764,0.01402373922116638,0 | |