{
    "results": {
        "harness|arc:challenge|25": {
            "acc": 0.6262798634812287,
            "acc_stderr": 0.014137708601759091,
            "acc_norm": 0.6732081911262798,
            "acc_norm_stderr": 0.013706665975587333
        },
        "harness|hellaswag|10": {
            "acc": 0.6760605457080263,
            "acc_stderr": 0.00467020812857923,
            "acc_norm": 0.8733320055765784,
            "acc_norm_stderr": 0.0033192094001351187
        },
        "harness|hendrycksTest-abstract_algebra|5": {
            "acc": 0.33,
            "acc_stderr": 0.04725815626252605,
            "acc_norm": 0.33,
            "acc_norm_stderr": 0.04725815626252605
        },
        "harness|hendrycksTest-anatomy|5": {
            "acc": 0.6296296296296297,
            "acc_stderr": 0.04171654161354544,
            "acc_norm": 0.6296296296296297,
            "acc_norm_stderr": 0.04171654161354544
        },
        "harness|hendrycksTest-astronomy|5": {
            "acc": 0.8092105263157895,
            "acc_stderr": 0.031975658210325,
            "acc_norm": 0.8092105263157895,
            "acc_norm_stderr": 0.031975658210325
        },
        "harness|hendrycksTest-business_ethics|5": {
            "acc": 0.72,
            "acc_stderr": 0.04512608598542127,
            "acc_norm": 0.72,
            "acc_norm_stderr": 0.04512608598542127
        },
        "harness|hendrycksTest-clinical_knowledge|5": {
            "acc": 0.7169811320754716,
            "acc_stderr": 0.027724236492700918,
            "acc_norm": 0.7169811320754716,
            "acc_norm_stderr": 0.027724236492700918
        },
        "harness|hendrycksTest-college_biology|5": {
            "acc": 0.8472222222222222,
            "acc_stderr": 0.030085743248565666,
            "acc_norm": 0.8472222222222222,
            "acc_norm_stderr": 0.030085743248565666
        },
        "harness|hendrycksTest-college_chemistry|5": {
            "acc": 0.51,
            "acc_stderr": 0.05024183937956912,
            "acc_norm": 0.51,
            "acc_norm_stderr": 0.05024183937956912
        },
        "harness|hendrycksTest-college_computer_science|5": {
            "acc": 0.6,
            "acc_stderr": 0.049236596391733084,
            "acc_norm": 0.6,
            "acc_norm_stderr": 0.049236596391733084
        },
        "harness|hendrycksTest-college_mathematics|5": {
            "acc": 0.37,
            "acc_stderr": 0.048523658709391,
            "acc_norm": 0.37,
            "acc_norm_stderr": 0.048523658709391
        },
        "harness|hendrycksTest-college_medicine|5": {
            "acc": 0.6416184971098265,
            "acc_stderr": 0.03656343653353159,
            "acc_norm": 0.6416184971098265,
            "acc_norm_stderr": 0.03656343653353159
        },
        "harness|hendrycksTest-college_physics|5": {
            "acc": 0.37254901960784315,
            "acc_stderr": 0.04810840148082635,
            "acc_norm": 0.37254901960784315,
            "acc_norm_stderr": 0.04810840148082635
        },
        "harness|hendrycksTest-computer_security|5": {
            "acc": 0.77,
            "acc_stderr": 0.04229525846816506,
            "acc_norm": 0.77,
            "acc_norm_stderr": 0.04229525846816506
        },
        "harness|hendrycksTest-conceptual_physics|5": {
            "acc": 0.6638297872340425,
            "acc_stderr": 0.030881618520676942,
            "acc_norm": 0.6638297872340425,
            "acc_norm_stderr": 0.030881618520676942
        },
        "harness|hendrycksTest-econometrics|5": {
            "acc": 0.4473684210526316,
            "acc_stderr": 0.04677473004491199,
            "acc_norm": 0.4473684210526316,
            "acc_norm_stderr": 0.04677473004491199
        },
        "harness|hendrycksTest-electrical_engineering|5": {
            "acc": 0.6551724137931034,
            "acc_stderr": 0.03960933549451207,
            "acc_norm": 0.6551724137931034,
            "acc_norm_stderr": 0.03960933549451207
        },
        "harness|hendrycksTest-elementary_mathematics|5": {
            "acc": 0.43386243386243384,
            "acc_stderr": 0.025525034382474894,
            "acc_norm": 0.43386243386243384,
            "acc_norm_stderr": 0.025525034382474894
        },
        "harness|hendrycksTest-formal_logic|5": {
            "acc": 0.47619047619047616,
            "acc_stderr": 0.04467062628403273,
            "acc_norm": 0.47619047619047616,
            "acc_norm_stderr": 0.04467062628403273
        },
        "harness|hendrycksTest-global_facts|5": {
            "acc": 0.46,
            "acc_stderr": 0.05009082659620332,
            "acc_norm": 0.46,
            "acc_norm_stderr": 0.05009082659620332
        },
        "harness|hendrycksTest-high_school_biology|5": {
            "acc": 0.8193548387096774,
            "acc_stderr": 0.02188617856717253,
            "acc_norm": 0.8193548387096774,
            "acc_norm_stderr": 0.02188617856717253
        },
        "harness|hendrycksTest-high_school_chemistry|5": {
            "acc": 0.5123152709359606,
            "acc_stderr": 0.035169204442208966,
            "acc_norm": 0.5123152709359606,
            "acc_norm_stderr": 0.035169204442208966
        },
        "harness|hendrycksTest-high_school_computer_science|5": {
            "acc": 0.79,
            "acc_stderr": 0.040936018074033256,
            "acc_norm": 0.79,
            "acc_norm_stderr": 0.040936018074033256
        },
        "harness|hendrycksTest-high_school_european_history|5": {
            "acc": 0.8303030303030303,
            "acc_stderr": 0.029311188674983134,
            "acc_norm": 0.8303030303030303,
            "acc_norm_stderr": 0.029311188674983134
        },
        "harness|hendrycksTest-high_school_geography|5": {
            "acc": 0.8787878787878788,
            "acc_stderr": 0.023253157951942084,
            "acc_norm": 0.8787878787878788,
            "acc_norm_stderr": 0.023253157951942084
        },
        "harness|hendrycksTest-high_school_government_and_politics|5": {
            "acc": 0.9430051813471503,
            "acc_stderr": 0.016731085293607555,
            "acc_norm": 0.9430051813471503,
            "acc_norm_stderr": 0.016731085293607555
        },
        "harness|hendrycksTest-high_school_macroeconomics|5": {
            "acc": 0.7410256410256411,
            "acc_stderr": 0.02221110681006167,
            "acc_norm": 0.7410256410256411,
            "acc_norm_stderr": 0.02221110681006167
        },
        "harness|hendrycksTest-high_school_mathematics|5": {
            "acc": 0.35555555555555557,
            "acc_stderr": 0.029185714949857403,
            "acc_norm": 0.35555555555555557,
            "acc_norm_stderr": 0.029185714949857403
        },
        "harness|hendrycksTest-high_school_microeconomics|5": {
            "acc": 0.7647058823529411,
            "acc_stderr": 0.02755361446786381,
            "acc_norm": 0.7647058823529411,
            "acc_norm_stderr": 0.02755361446786381
        },
        "harness|hendrycksTest-high_school_physics|5": {
            "acc": 0.4304635761589404,
            "acc_stderr": 0.04042809961395634,
            "acc_norm": 0.4304635761589404,
            "acc_norm_stderr": 0.04042809961395634
        },
        "harness|hendrycksTest-high_school_psychology|5": {
            "acc": 0.8733944954128441,
            "acc_stderr": 0.014257128686165169,
            "acc_norm": 0.8733944954128441,
            "acc_norm_stderr": 0.014257128686165169
        },
        "harness|hendrycksTest-high_school_statistics|5": {
            "acc": 0.6342592592592593,
            "acc_stderr": 0.032847388576472056,
            "acc_norm": 0.6342592592592593,
            "acc_norm_stderr": 0.032847388576472056
        },
        "harness|hendrycksTest-high_school_us_history|5": {
            "acc": 0.8970588235294118,
            "acc_stderr": 0.02132833757080437,
            "acc_norm": 0.8970588235294118,
            "acc_norm_stderr": 0.02132833757080437
        },
        "harness|hendrycksTest-high_school_world_history|5": {
            "acc": 0.8776371308016878,
            "acc_stderr": 0.021331741829746786,
            "acc_norm": 0.8776371308016878,
            "acc_norm_stderr": 0.021331741829746786
        },
        "harness|hendrycksTest-human_aging|5": {
            "acc": 0.8026905829596412,
            "acc_stderr": 0.02670985334496796,
            "acc_norm": 0.8026905829596412,
            "acc_norm_stderr": 0.02670985334496796
        },
        "harness|hendrycksTest-human_sexuality|5": {
            "acc": 0.8778625954198473,
            "acc_stderr": 0.028718776889342344,
            "acc_norm": 0.8778625954198473,
            "acc_norm_stderr": 0.028718776889342344
        },
        "harness|hendrycksTest-international_law|5": {
            "acc": 0.8760330578512396,
            "acc_stderr": 0.03008309871603521,
            "acc_norm": 0.8760330578512396,
            "acc_norm_stderr": 0.03008309871603521
        },
        "harness|hendrycksTest-jurisprudence|5": {
            "acc": 0.8333333333333334,
            "acc_stderr": 0.03602814176392645,
            "acc_norm": 0.8333333333333334,
            "acc_norm_stderr": 0.03602814176392645
        },
        "harness|hendrycksTest-logical_fallacies|5": {
            "acc": 0.803680981595092,
            "acc_stderr": 0.031207970394709218,
            "acc_norm": 0.803680981595092,
            "acc_norm_stderr": 0.031207970394709218
        },
        "harness|hendrycksTest-machine_learning|5": {
            "acc": 0.5357142857142857,
            "acc_stderr": 0.04733667890053756,
            "acc_norm": 0.5357142857142857,
            "acc_norm_stderr": 0.04733667890053756
        },
        "harness|hendrycksTest-management|5": {
            "acc": 0.8349514563106796,
            "acc_stderr": 0.03675668832233188,
            "acc_norm": 0.8349514563106796,
            "acc_norm_stderr": 0.03675668832233188
        },
        "harness|hendrycksTest-marketing|5": {
            "acc": 0.905982905982906,
            "acc_stderr": 0.01911989279892498,
            "acc_norm": 0.905982905982906,
            "acc_norm_stderr": 0.01911989279892498
        },
        "harness|hendrycksTest-medical_genetics|5": {
            "acc": 0.74,
            "acc_stderr": 0.04408440022768077,
            "acc_norm": 0.74,
            "acc_norm_stderr": 0.04408440022768077
        },
        "harness|hendrycksTest-miscellaneous|5": {
            "acc": 0.8620689655172413,
            "acc_stderr": 0.012331009307795656,
            "acc_norm": 0.8620689655172413,
            "acc_norm_stderr": 0.012331009307795656
        },
        "harness|hendrycksTest-moral_disputes|5": {
            "acc": 0.7774566473988439,
            "acc_stderr": 0.02239421566194282,
            "acc_norm": 0.7774566473988439,
            "acc_norm_stderr": 0.02239421566194282
        },
        "harness|hendrycksTest-moral_scenarios|5": {
            "acc": 0.4547486033519553,
            "acc_stderr": 0.016653875777524012,
            "acc_norm": 0.4547486033519553,
            "acc_norm_stderr": 0.016653875777524012
        },
        "harness|hendrycksTest-nutrition|5": {
            "acc": 0.7810457516339869,
            "acc_stderr": 0.02367908986180772,
            "acc_norm": 0.7810457516339869,
            "acc_norm_stderr": 0.02367908986180772
        },
        "harness|hendrycksTest-philosophy|5": {
            "acc": 0.7877813504823151,
            "acc_stderr": 0.023222756797435115,
            "acc_norm": 0.7877813504823151,
            "acc_norm_stderr": 0.023222756797435115
        },
        "harness|hendrycksTest-prehistory|5": {
            "acc": 0.8364197530864198,
            "acc_stderr": 0.020581466138257114,
            "acc_norm": 0.8364197530864198,
            "acc_norm_stderr": 0.020581466138257114
        },
        "harness|hendrycksTest-professional_accounting|5": {
            "acc": 0.5673758865248227,
            "acc_stderr": 0.02955545423677884,
            "acc_norm": 0.5673758865248227,
            "acc_norm_stderr": 0.02955545423677884
        },
        "harness|hendrycksTest-professional_law|5": {
            "acc": 0.5319426336375489,
            "acc_stderr": 0.012744149704869645,
            "acc_norm": 0.5319426336375489,
            "acc_norm_stderr": 0.012744149704869645
        },
        "harness|hendrycksTest-professional_medicine|5": {
            "acc": 0.75,
            "acc_stderr": 0.026303648393696036,
            "acc_norm": 0.75,
            "acc_norm_stderr": 0.026303648393696036
        },
        "harness|hendrycksTest-professional_psychology|5": {
            "acc": 0.7565359477124183,
            "acc_stderr": 0.01736247376214662,
            "acc_norm": 0.7565359477124183,
            "acc_norm_stderr": 0.01736247376214662
        },
        "harness|hendrycksTest-public_relations|5": {
            "acc": 0.6909090909090909,
            "acc_stderr": 0.044262946482000985,
            "acc_norm": 0.6909090909090909,
            "acc_norm_stderr": 0.044262946482000985
        },
        "harness|hendrycksTest-security_studies|5": {
            "acc": 0.7918367346938775,
            "acc_stderr": 0.0259911176728133,
            "acc_norm": 0.7918367346938775,
            "acc_norm_stderr": 0.0259911176728133
        },
        "harness|hendrycksTest-sociology|5": {
            "acc": 0.900497512437811,
            "acc_stderr": 0.021166216304659393,
            "acc_norm": 0.900497512437811,
            "acc_norm_stderr": 0.021166216304659393
        },
        "harness|hendrycksTest-us_foreign_policy|5": {
            "acc": 0.92,
            "acc_stderr": 0.0272659924344291,
            "acc_norm": 0.92,
            "acc_norm_stderr": 0.0272659924344291
        },
        "harness|hendrycksTest-virology|5": {
            "acc": 0.5301204819277109,
            "acc_stderr": 0.03885425420866767,
            "acc_norm": 0.5301204819277109,
            "acc_norm_stderr": 0.03885425420866767
        },
        "harness|hendrycksTest-world_religions|5": {
            "acc": 0.8538011695906432,
            "acc_stderr": 0.027097290118070806,
            "acc_norm": 0.8538011695906432,
            "acc_norm_stderr": 0.027097290118070806
        },
        "harness|truthfulqa:mc|0": {
            "mc1": 0.3108935128518972,
            "mc1_stderr": 0.016203316673559696,
            "mc2": 0.44923493721887353,
            "mc2_stderr": 0.01390226410719232
        },
        "all": {
            "acc": 0.6967225637378714,
            "acc_stderr": 0.030867069907791145,
            "acc_norm": 0.7008615431872544,
            "acc_norm_stderr": 0.030836865817034945,
            "mc1": 0.3108935128518972,
            "mc1_stderr": 0.016203316673559696,
            "mc2": 0.44923493721887353,
            "mc2_stderr": 0.01390226410719232
        }
    },
    "versions": {
        "harness|arc:challenge|25": 0,
        "harness|hellaswag|10": 0,
        "harness|hendrycksTest-abstract_algebra|5": 1,
        "harness|hendrycksTest-anatomy|5": 1,
        "harness|hendrycksTest-astronomy|5": 1,
        "harness|hendrycksTest-business_ethics|5": 1,
        "harness|hendrycksTest-clinical_knowledge|5": 1,
        "harness|hendrycksTest-college_biology|5": 1,
        "harness|hendrycksTest-college_chemistry|5": 1,
        "harness|hendrycksTest-college_computer_science|5": 1,
        "harness|hendrycksTest-college_mathematics|5": 1,
        "harness|hendrycksTest-college_medicine|5": 1,
        "harness|hendrycksTest-college_physics|5": 1,
        "harness|hendrycksTest-computer_security|5": 1,
        "harness|hendrycksTest-conceptual_physics|5": 1,
        "harness|hendrycksTest-econometrics|5": 1,
        "harness|hendrycksTest-electrical_engineering|5": 1,
        "harness|hendrycksTest-elementary_mathematics|5": 1,
        "harness|hendrycksTest-formal_logic|5": 1,
        "harness|hendrycksTest-global_facts|5": 1,
        "harness|hendrycksTest-high_school_biology|5": 1,
        "harness|hendrycksTest-high_school_chemistry|5": 1,
        "harness|hendrycksTest-high_school_computer_science|5": 1,
        "harness|hendrycksTest-high_school_european_history|5": 1,
        "harness|hendrycksTest-high_school_geography|5": 1,
        "harness|hendrycksTest-high_school_government_and_politics|5": 1,
        "harness|hendrycksTest-high_school_macroeconomics|5": 1,
        "harness|hendrycksTest-high_school_mathematics|5": 1,
        "harness|hendrycksTest-high_school_microeconomics|5": 1,
        "harness|hendrycksTest-high_school_physics|5": 1,
        "harness|hendrycksTest-high_school_psychology|5": 1,
        "harness|hendrycksTest-high_school_statistics|5": 1,
        "harness|hendrycksTest-high_school_us_history|5": 1,
        "harness|hendrycksTest-high_school_world_history|5": 1,
        "harness|hendrycksTest-human_aging|5": 1,
        "harness|hendrycksTest-human_sexuality|5": 1,
        "harness|hendrycksTest-international_law|5": 1,
        "harness|hendrycksTest-jurisprudence|5": 1,
        "harness|hendrycksTest-logical_fallacies|5": 1,
        "harness|hendrycksTest-machine_learning|5": 1,
        "harness|hendrycksTest-management|5": 1,
        "harness|hendrycksTest-marketing|5": 1,
        "harness|hendrycksTest-medical_genetics|5": 1,
        "harness|hendrycksTest-miscellaneous|5": 1,
        "harness|hendrycksTest-moral_disputes|5": 1,
        "harness|hendrycksTest-moral_scenarios|5": 1,
        "harness|hendrycksTest-nutrition|5": 1,
        "harness|hendrycksTest-philosophy|5": 1,
        "harness|hendrycksTest-prehistory|5": 1,
        "harness|hendrycksTest-professional_accounting|5": 1,
        "harness|hendrycksTest-professional_law|5": 1,
        "harness|hendrycksTest-professional_medicine|5": 1,
        "harness|hendrycksTest-professional_psychology|5": 1,
        "harness|hendrycksTest-public_relations|5": 1,
        "harness|hendrycksTest-security_studies|5": 1,
        "harness|hendrycksTest-sociology|5": 1,
        "harness|hendrycksTest-us_foreign_policy|5": 1,
        "harness|hendrycksTest-virology|5": 1,
        "harness|hendrycksTest-world_religions|5": 1,
        "harness|truthfulqa:mc|0": 1,
        "all": 0
    },
    "config": {
        "model_name": "meta-llama/Llama-2-70b-hf",
        "model_sha": "ed7b07231238f836b99bf45701b9a0063576b194",
        "model_dtype": "torch.float16",
        "lighteval_sha": "d2e819bc028044e701a13b954d3326ceddb71b98",
        "num_few_shot_default": 0,
        "num_fewshot_seeds": 1,
        "override_batch_size": 1,
        "max_samples": null
    }
}
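
The results above are plain JSON, so they can be inspected with nothing but the standard library. Below is a minimal sketch (not part of the original evaluation output) that loads the file, prints each per-task accuracy, and recomputes an unweighted mean as a rough sanity check against the reported "all" aggregate; the local filename results.json is a hypothetical placeholder.

```python
import json
from statistics import mean

# Load the cleaned results dump (hypothetical local path).
with open("results.json") as f:
    data = json.load(f)

results = data["results"]

# Per-task accuracies, skipping the "all" aggregate and TruthfulQA,
# which reports mc1/mc2 instead of acc.
per_task_acc = {
    task: metrics["acc"]
    for task, metrics in results.items()
    if task != "all" and "acc" in metrics
}

for task, acc in sorted(per_task_acc.items()):
    print(f"{task}: {acc:.4f}")

# Unweighted mean over tasks; this should land close to the
# reported aggregate in results["all"]["acc"].
print("mean acc:", mean(per_task_acc.values()))
print("reported all.acc:", results["all"]["acc"])
```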