pminervini's picture
Upload folder using huggingface_hub
4d9b54f
raw
history blame
No virus
16.4 kB
{
"results": {
"harness|arc:challenge|25": {
"acc": 0.6262798634812287,
"acc_stderr": 0.014137708601759091,
"acc_norm": 0.6732081911262798,
"acc_norm_stderr": 0.013706665975587333
},
"harness|hellaswag|10": {
"acc": 0.6760605457080263,
"acc_stderr": 0.00467020812857923,
"acc_norm": 0.8733320055765784,
"acc_norm_stderr": 0.0033192094001351187
},
"harness|hendrycksTest-abstract_algebra|5": {
"acc": 0.33,
"acc_stderr": 0.04725815626252605,
"acc_norm": 0.33,
"acc_norm_stderr": 0.04725815626252605
},
"harness|hendrycksTest-anatomy|5": {
"acc": 0.6296296296296297,
"acc_stderr": 0.04171654161354544,
"acc_norm": 0.6296296296296297,
"acc_norm_stderr": 0.04171654161354544
},
"harness|hendrycksTest-astronomy|5": {
"acc": 0.8092105263157895,
"acc_stderr": 0.031975658210325,
"acc_norm": 0.8092105263157895,
"acc_norm_stderr": 0.031975658210325
},
"harness|hendrycksTest-business_ethics|5": {
"acc": 0.72,
"acc_stderr": 0.04512608598542127,
"acc_norm": 0.72,
"acc_norm_stderr": 0.04512608598542127
},
"harness|hendrycksTest-clinical_knowledge|5": {
"acc": 0.7169811320754716,
"acc_stderr": 0.027724236492700918,
"acc_norm": 0.7169811320754716,
"acc_norm_stderr": 0.027724236492700918
},
"harness|hendrycksTest-college_biology|5": {
"acc": 0.8472222222222222,
"acc_stderr": 0.030085743248565666,
"acc_norm": 0.8472222222222222,
"acc_norm_stderr": 0.030085743248565666
},
"harness|hendrycksTest-college_chemistry|5": {
"acc": 0.51,
"acc_stderr": 0.05024183937956912,
"acc_norm": 0.51,
"acc_norm_stderr": 0.05024183937956912
},
"harness|hendrycksTest-college_computer_science|5": {
"acc": 0.6,
"acc_stderr": 0.049236596391733084,
"acc_norm": 0.6,
"acc_norm_stderr": 0.049236596391733084
},
"harness|hendrycksTest-college_mathematics|5": {
"acc": 0.37,
"acc_stderr": 0.048523658709391,
"acc_norm": 0.37,
"acc_norm_stderr": 0.048523658709391
},
"harness|hendrycksTest-college_medicine|5": {
"acc": 0.6416184971098265,
"acc_stderr": 0.03656343653353159,
"acc_norm": 0.6416184971098265,
"acc_norm_stderr": 0.03656343653353159
},
"harness|hendrycksTest-college_physics|5": {
"acc": 0.37254901960784315,
"acc_stderr": 0.04810840148082635,
"acc_norm": 0.37254901960784315,
"acc_norm_stderr": 0.04810840148082635
},
"harness|hendrycksTest-computer_security|5": {
"acc": 0.77,
"acc_stderr": 0.04229525846816506,
"acc_norm": 0.77,
"acc_norm_stderr": 0.04229525846816506
},
"harness|hendrycksTest-conceptual_physics|5": {
"acc": 0.6638297872340425,
"acc_stderr": 0.030881618520676942,
"acc_norm": 0.6638297872340425,
"acc_norm_stderr": 0.030881618520676942
},
"harness|hendrycksTest-econometrics|5": {
"acc": 0.4473684210526316,
"acc_stderr": 0.04677473004491199,
"acc_norm": 0.4473684210526316,
"acc_norm_stderr": 0.04677473004491199
},
"harness|hendrycksTest-electrical_engineering|5": {
"acc": 0.6551724137931034,
"acc_stderr": 0.03960933549451207,
"acc_norm": 0.6551724137931034,
"acc_norm_stderr": 0.03960933549451207
},
"harness|hendrycksTest-elementary_mathematics|5": {
"acc": 0.43386243386243384,
"acc_stderr": 0.025525034382474894,
"acc_norm": 0.43386243386243384,
"acc_norm_stderr": 0.025525034382474894
},
"harness|hendrycksTest-formal_logic|5": {
"acc": 0.47619047619047616,
"acc_stderr": 0.04467062628403273,
"acc_norm": 0.47619047619047616,
"acc_norm_stderr": 0.04467062628403273
},
"harness|hendrycksTest-global_facts|5": {
"acc": 0.46,
"acc_stderr": 0.05009082659620332,
"acc_norm": 0.46,
"acc_norm_stderr": 0.05009082659620332
},
"harness|hendrycksTest-high_school_biology|5": {
"acc": 0.8193548387096774,
"acc_stderr": 0.02188617856717253,
"acc_norm": 0.8193548387096774,
"acc_norm_stderr": 0.02188617856717253
},
"harness|hendrycksTest-high_school_chemistry|5": {
"acc": 0.5123152709359606,
"acc_stderr": 0.035169204442208966,
"acc_norm": 0.5123152709359606,
"acc_norm_stderr": 0.035169204442208966
},
"harness|hendrycksTest-high_school_computer_science|5": {
"acc": 0.79,
"acc_stderr": 0.040936018074033256,
"acc_norm": 0.79,
"acc_norm_stderr": 0.040936018074033256
},
"harness|hendrycksTest-high_school_european_history|5": {
"acc": 0.8303030303030303,
"acc_stderr": 0.029311188674983134,
"acc_norm": 0.8303030303030303,
"acc_norm_stderr": 0.029311188674983134
},
"harness|hendrycksTest-high_school_geography|5": {
"acc": 0.8787878787878788,
"acc_stderr": 0.023253157951942084,
"acc_norm": 0.8787878787878788,
"acc_norm_stderr": 0.023253157951942084
},
"harness|hendrycksTest-high_school_government_and_politics|5": {
"acc": 0.9430051813471503,
"acc_stderr": 0.016731085293607555,
"acc_norm": 0.9430051813471503,
"acc_norm_stderr": 0.016731085293607555
},
"harness|hendrycksTest-high_school_macroeconomics|5": {
"acc": 0.7410256410256411,
"acc_stderr": 0.02221110681006167,
"acc_norm": 0.7410256410256411,
"acc_norm_stderr": 0.02221110681006167
},
"harness|hendrycksTest-high_school_mathematics|5": {
"acc": 0.35555555555555557,
"acc_stderr": 0.029185714949857403,
"acc_norm": 0.35555555555555557,
"acc_norm_stderr": 0.029185714949857403
},
"harness|hendrycksTest-high_school_microeconomics|5": {
"acc": 0.7647058823529411,
"acc_stderr": 0.02755361446786381,
"acc_norm": 0.7647058823529411,
"acc_norm_stderr": 0.02755361446786381
},
"harness|hendrycksTest-high_school_physics|5": {
"acc": 0.4304635761589404,
"acc_stderr": 0.04042809961395634,
"acc_norm": 0.4304635761589404,
"acc_norm_stderr": 0.04042809961395634
},
"harness|hendrycksTest-high_school_psychology|5": {
"acc": 0.8733944954128441,
"acc_stderr": 0.014257128686165169,
"acc_norm": 0.8733944954128441,
"acc_norm_stderr": 0.014257128686165169
},
"harness|hendrycksTest-high_school_statistics|5": {
"acc": 0.6342592592592593,
"acc_stderr": 0.032847388576472056,
"acc_norm": 0.6342592592592593,
"acc_norm_stderr": 0.032847388576472056
},
"harness|hendrycksTest-high_school_us_history|5": {
"acc": 0.8970588235294118,
"acc_stderr": 0.02132833757080437,
"acc_norm": 0.8970588235294118,
"acc_norm_stderr": 0.02132833757080437
},
"harness|hendrycksTest-high_school_world_history|5": {
"acc": 0.8776371308016878,
"acc_stderr": 0.021331741829746786,
"acc_norm": 0.8776371308016878,
"acc_norm_stderr": 0.021331741829746786
},
"harness|hendrycksTest-human_aging|5": {
"acc": 0.8026905829596412,
"acc_stderr": 0.02670985334496796,
"acc_norm": 0.8026905829596412,
"acc_norm_stderr": 0.02670985334496796
},
"harness|hendrycksTest-human_sexuality|5": {
"acc": 0.8778625954198473,
"acc_stderr": 0.028718776889342344,
"acc_norm": 0.8778625954198473,
"acc_norm_stderr": 0.028718776889342344
},
"harness|hendrycksTest-international_law|5": {
"acc": 0.8760330578512396,
"acc_stderr": 0.03008309871603521,
"acc_norm": 0.8760330578512396,
"acc_norm_stderr": 0.03008309871603521
},
"harness|hendrycksTest-jurisprudence|5": {
"acc": 0.8333333333333334,
"acc_stderr": 0.03602814176392645,
"acc_norm": 0.8333333333333334,
"acc_norm_stderr": 0.03602814176392645
},
"harness|hendrycksTest-logical_fallacies|5": {
"acc": 0.803680981595092,
"acc_stderr": 0.031207970394709218,
"acc_norm": 0.803680981595092,
"acc_norm_stderr": 0.031207970394709218
},
"harness|hendrycksTest-machine_learning|5": {
"acc": 0.5357142857142857,
"acc_stderr": 0.04733667890053756,
"acc_norm": 0.5357142857142857,
"acc_norm_stderr": 0.04733667890053756
},
"harness|hendrycksTest-management|5": {
"acc": 0.8349514563106796,
"acc_stderr": 0.03675668832233188,
"acc_norm": 0.8349514563106796,
"acc_norm_stderr": 0.03675668832233188
},
"harness|hendrycksTest-marketing|5": {
"acc": 0.905982905982906,
"acc_stderr": 0.01911989279892498,
"acc_norm": 0.905982905982906,
"acc_norm_stderr": 0.01911989279892498
},
"harness|hendrycksTest-medical_genetics|5": {
"acc": 0.74,
"acc_stderr": 0.04408440022768077,
"acc_norm": 0.74,
"acc_norm_stderr": 0.04408440022768077
},
"harness|hendrycksTest-miscellaneous|5": {
"acc": 0.8620689655172413,
"acc_stderr": 0.012331009307795656,
"acc_norm": 0.8620689655172413,
"acc_norm_stderr": 0.012331009307795656
},
"harness|hendrycksTest-moral_disputes|5": {
"acc": 0.7774566473988439,
"acc_stderr": 0.02239421566194282,
"acc_norm": 0.7774566473988439,
"acc_norm_stderr": 0.02239421566194282
},
"harness|hendrycksTest-moral_scenarios|5": {
"acc": 0.4547486033519553,
"acc_stderr": 0.016653875777524012,
"acc_norm": 0.4547486033519553,
"acc_norm_stderr": 0.016653875777524012
},
"harness|hendrycksTest-nutrition|5": {
"acc": 0.7810457516339869,
"acc_stderr": 0.02367908986180772,
"acc_norm": 0.7810457516339869,
"acc_norm_stderr": 0.02367908986180772
},
"harness|hendrycksTest-philosophy|5": {
"acc": 0.7877813504823151,
"acc_stderr": 0.023222756797435115,
"acc_norm": 0.7877813504823151,
"acc_norm_stderr": 0.023222756797435115
},
"harness|hendrycksTest-prehistory|5": {
"acc": 0.8364197530864198,
"acc_stderr": 0.020581466138257114,
"acc_norm": 0.8364197530864198,
"acc_norm_stderr": 0.020581466138257114
},
"harness|hendrycksTest-professional_accounting|5": {
"acc": 0.5673758865248227,
"acc_stderr": 0.02955545423677884,
"acc_norm": 0.5673758865248227,
"acc_norm_stderr": 0.02955545423677884
},
"harness|hendrycksTest-professional_law|5": {
"acc": 0.5319426336375489,
"acc_stderr": 0.012744149704869645,
"acc_norm": 0.5319426336375489,
"acc_norm_stderr": 0.012744149704869645
},
"harness|hendrycksTest-professional_medicine|5": {
"acc": 0.75,
"acc_stderr": 0.026303648393696036,
"acc_norm": 0.75,
"acc_norm_stderr": 0.026303648393696036
},
"harness|hendrycksTest-professional_psychology|5": {
"acc": 0.7565359477124183,
"acc_stderr": 0.01736247376214662,
"acc_norm": 0.7565359477124183,
"acc_norm_stderr": 0.01736247376214662
},
"harness|hendrycksTest-public_relations|5": {
"acc": 0.6909090909090909,
"acc_stderr": 0.044262946482000985,
"acc_norm": 0.6909090909090909,
"acc_norm_stderr": 0.044262946482000985
},
"harness|hendrycksTest-security_studies|5": {
"acc": 0.7918367346938775,
"acc_stderr": 0.0259911176728133,
"acc_norm": 0.7918367346938775,
"acc_norm_stderr": 0.0259911176728133
},
"harness|hendrycksTest-sociology|5": {
"acc": 0.900497512437811,
"acc_stderr": 0.021166216304659393,
"acc_norm": 0.900497512437811,
"acc_norm_stderr": 0.021166216304659393
},
"harness|hendrycksTest-us_foreign_policy|5": {
"acc": 0.92,
"acc_stderr": 0.0272659924344291,
"acc_norm": 0.92,
"acc_norm_stderr": 0.0272659924344291
},
"harness|hendrycksTest-virology|5": {
"acc": 0.5301204819277109,
"acc_stderr": 0.03885425420866767,
"acc_norm": 0.5301204819277109,
"acc_norm_stderr": 0.03885425420866767
},
"harness|hendrycksTest-world_religions|5": {
"acc": 0.8538011695906432,
"acc_stderr": 0.027097290118070806,
"acc_norm": 0.8538011695906432,
"acc_norm_stderr": 0.027097290118070806
},
"harness|truthfulqa:mc|0": {
"mc1": 0.3108935128518972,
"mc1_stderr": 0.016203316673559696,
"mc2": 0.44923493721887353,
"mc2_stderr": 0.01390226410719232
},
"all": {
"acc": 0.6967225637378714,
"acc_stderr": 0.030867069907791145,
"acc_norm": 0.7008615431872544,
"acc_norm_stderr": 0.030836865817034945,
"mc1": 0.3108935128518972,
"mc1_stderr": 0.016203316673559696,
"mc2": 0.44923493721887353,
"mc2_stderr": 0.01390226410719232
}
},
"versions": {
"harness|arc:challenge|25": 0,
"harness|hellaswag|10": 0,
"harness|hendrycksTest-abstract_algebra|5": 1,
"harness|hendrycksTest-anatomy|5": 1,
"harness|hendrycksTest-astronomy|5": 1,
"harness|hendrycksTest-business_ethics|5": 1,
"harness|hendrycksTest-clinical_knowledge|5": 1,
"harness|hendrycksTest-college_biology|5": 1,
"harness|hendrycksTest-college_chemistry|5": 1,
"harness|hendrycksTest-college_computer_science|5": 1,
"harness|hendrycksTest-college_mathematics|5": 1,
"harness|hendrycksTest-college_medicine|5": 1,
"harness|hendrycksTest-college_physics|5": 1,
"harness|hendrycksTest-computer_security|5": 1,
"harness|hendrycksTest-conceptual_physics|5": 1,
"harness|hendrycksTest-econometrics|5": 1,
"harness|hendrycksTest-electrical_engineering|5": 1,
"harness|hendrycksTest-elementary_mathematics|5": 1,
"harness|hendrycksTest-formal_logic|5": 1,
"harness|hendrycksTest-global_facts|5": 1,
"harness|hendrycksTest-high_school_biology|5": 1,
"harness|hendrycksTest-high_school_chemistry|5": 1,
"harness|hendrycksTest-high_school_computer_science|5": 1,
"harness|hendrycksTest-high_school_european_history|5": 1,
"harness|hendrycksTest-high_school_geography|5": 1,
"harness|hendrycksTest-high_school_government_and_politics|5": 1,
"harness|hendrycksTest-high_school_macroeconomics|5": 1,
"harness|hendrycksTest-high_school_mathematics|5": 1,
"harness|hendrycksTest-high_school_microeconomics|5": 1,
"harness|hendrycksTest-high_school_physics|5": 1,
"harness|hendrycksTest-high_school_psychology|5": 1,
"harness|hendrycksTest-high_school_statistics|5": 1,
"harness|hendrycksTest-high_school_us_history|5": 1,
"harness|hendrycksTest-high_school_world_history|5": 1,
"harness|hendrycksTest-human_aging|5": 1,
"harness|hendrycksTest-human_sexuality|5": 1,
"harness|hendrycksTest-international_law|5": 1,
"harness|hendrycksTest-jurisprudence|5": 1,
"harness|hendrycksTest-logical_fallacies|5": 1,
"harness|hendrycksTest-machine_learning|5": 1,
"harness|hendrycksTest-management|5": 1,
"harness|hendrycksTest-marketing|5": 1,
"harness|hendrycksTest-medical_genetics|5": 1,
"harness|hendrycksTest-miscellaneous|5": 1,
"harness|hendrycksTest-moral_disputes|5": 1,
"harness|hendrycksTest-moral_scenarios|5": 1,
"harness|hendrycksTest-nutrition|5": 1,
"harness|hendrycksTest-philosophy|5": 1,
"harness|hendrycksTest-prehistory|5": 1,
"harness|hendrycksTest-professional_accounting|5": 1,
"harness|hendrycksTest-professional_law|5": 1,
"harness|hendrycksTest-professional_medicine|5": 1,
"harness|hendrycksTest-professional_psychology|5": 1,
"harness|hendrycksTest-public_relations|5": 1,
"harness|hendrycksTest-security_studies|5": 1,
"harness|hendrycksTest-sociology|5": 1,
"harness|hendrycksTest-us_foreign_policy|5": 1,
"harness|hendrycksTest-virology|5": 1,
"harness|hendrycksTest-world_religions|5": 1,
"harness|truthfulqa:mc|0": 1,
"all": 0
},
"config": {
"model_name": "meta-llama/Llama-2-70b-hf",
"model_sha": "ed7b07231238f836b99bf45701b9a0063576b194",
"model_dtype": "torch.float16",
"lighteval_sha": "d2e819bc028044e701a13b954d3326ceddb71b98",
"num_few_shot_default": 0,
"num_fewshot_seeds": 1,
"override_batch_size": 1,
"max_samples": null
}
}