{ "arc_challenge": { "alias": "arc_challenge", "acc,none": 0.30631399317406144, "acc_stderr,none": 0.013470584417276513, "acc_norm,none": 0.33447098976109213, "acc_norm_stderr,none": 0.013787460322441382 }, "gpqa_diamond_cot_n_shot": { "alias": "gpqa_diamond_cot_n_shot", "exact_match,strict-match": 0.0, "exact_match_stderr,strict-match": 0.0, "exact_match,flexible-extract": 0.15151515151515152, "exact_match_stderr,flexible-extract": 0.0255456504266036 }, "gpqa_diamond_cot_zeroshot": { "alias": "gpqa_diamond_cot_zeroshot", "exact_match,strict-match": 0.0, "exact_match_stderr,strict-match": 0.0, "exact_match,flexible-extract": 0.12626262626262627, "exact_match_stderr,flexible-extract": 0.02366435940288024 }, "gpqa_diamond_generative_n_shot": { "alias": "gpqa_diamond_generative_n_shot", "exact_match,strict-match": 0.005050505050505051, "exact_match_stderr,strict-match": 0.0050505050505050535, "exact_match,flexible-extract": 0.18686868686868688, "exact_match_stderr,flexible-extract": 0.02777253333421898 }, "gpqa_diamond_n_shot": { "alias": "gpqa_diamond_n_shot", "acc,none": 0.23232323232323232, "acc_stderr,none": 0.030088629490217483, "acc_norm,none": 0.23232323232323232, "acc_norm_stderr,none": 0.030088629490217483 }, "gpqa_diamond_zeroshot": { "alias": "gpqa_diamond_zeroshot", "acc,none": 0.2474747474747475, "acc_stderr,none": 0.03074630074212451, "acc_norm,none": 0.2474747474747475, "acc_norm_stderr,none": 0.03074630074212451 }, "gpqa_extended_cot_n_shot": { "alias": "gpqa_extended_cot_n_shot", "exact_match,strict-match": 0.0, "exact_match_stderr,strict-match": 0.0, "exact_match,flexible-extract": 0.15934065934065933, "exact_match_stderr,flexible-extract": 0.015677437397173054 }, "gpqa_extended_cot_zeroshot": { "alias": "gpqa_extended_cot_zeroshot", "exact_match,strict-match": 0.0018315018315018315, "exact_match_stderr,strict-match": 0.0018315018315018447, "exact_match,flexible-extract": 0.1336996336996337, "exact_match_stderr,flexible-extract": 0.014578106095655245 }, "gpqa_extended_generative_n_shot": { "alias": "gpqa_extended_generative_n_shot", "exact_match,strict-match": 0.0, "exact_match_stderr,strict-match": 0.0, "exact_match,flexible-extract": 0.20146520146520147, "exact_match_stderr,flexible-extract": 0.017181010109243344 }, "gpqa_extended_n_shot": { "alias": "gpqa_extended_n_shot", "acc,none": 0.2600732600732601, "acc_stderr,none": 0.018790743352015988, "acc_norm,none": 0.2600732600732601, "acc_norm_stderr,none": 0.018790743352015988 }, "gpqa_extended_zeroshot": { "alias": "gpqa_extended_zeroshot", "acc,none": 0.26373626373626374, "acc_stderr,none": 0.018875713580372485, "acc_norm,none": 0.26373626373626374, "acc_norm_stderr,none": 0.018875713580372485 }, "gpqa_main_cot_n_shot": { "alias": "gpqa_main_cot_n_shot", "exact_match,strict-match": 0.0, "exact_match_stderr,strict-match": 0.0, "exact_match,flexible-extract": 0.13616071428571427, "exact_match_stderr,flexible-extract": 0.016221410863569787 }, "gpqa_main_cot_zeroshot": { "alias": "gpqa_main_cot_zeroshot", "exact_match,strict-match": 0.0, "exact_match_stderr,strict-match": 0.0, "exact_match,flexible-extract": 0.15625, "exact_match_stderr,flexible-extract": 0.017173671221421365 }, "gpqa_main_generative_n_shot": { "alias": "gpqa_main_generative_n_shot", "exact_match,strict-match": 0.0, "exact_match_stderr,strict-match": 0.0, "exact_match,flexible-extract": 0.19642857142857142, "exact_match_stderr,flexible-extract": 0.018791472419524228 }, "gpqa_main_n_shot": { "alias": "gpqa_main_n_shot", "acc,none": 0.2700892857142857, "acc_stderr,none": 0.021000749078822385, "acc_norm,none": 0.2700892857142857, "acc_norm_stderr,none": 0.021000749078822385 }, "gpqa_main_zeroshot": { "alias": "gpqa_main_zeroshot", "acc,none": 0.30580357142857145, "acc_stderr,none": 0.021792582688756983, "acc_norm,none": 0.30580357142857145, "acc_norm_stderr,none": 0.021792582688756983 }, "hellaswag": { "alias": "hellaswag", "acc,none": 0.4047002589125672, "acc_stderr,none": 0.004898308167211847, "acc_norm,none": 0.5237004580760805, "acc_norm_stderr,none": 0.004984172621822888 }, "ifeval": { "alias": "ifeval", "prompt_level_strict_acc,none": 0.20147874306839186, "prompt_level_strict_acc_stderr,none": 0.017260802262371488, "inst_level_strict_acc,none": 0.3405275779376499, "inst_level_strict_acc_stderr,none": "N/A", "prompt_level_loose_acc,none": 0.2365988909426987, "prompt_level_loose_acc_stderr,none": 0.018288827582625643, "inst_level_loose_acc,none": 0.3752997601918465, "inst_level_loose_acc_stderr,none": "N/A" }, "mmlu_pro": { "exact_match,custom-extract": 0.14037566489361702, "exact_match_stderr,custom-extract": 0.003117505733576593, "alias": "mmlu_pro" }, "mmlu_pro_biology": { "alias": " - biology", "exact_match,custom-extract": 0.25662482566248257, "exact_match_stderr,custom-extract": 0.016322882305354162 }, "mmlu_pro_business": { "alias": " - business", "exact_match,custom-extract": 0.1267427122940431, "exact_match_stderr,custom-extract": 0.011851395705593072 }, "mmlu_pro_chemistry": { "alias": " - chemistry", "exact_match,custom-extract": 0.06537102473498234, "exact_match_stderr,custom-extract": 0.00734989211563516 }, "mmlu_pro_computer_science": { "alias": " - computer_science", "exact_match,custom-extract": 0.14878048780487804, "exact_match_stderr,custom-extract": 0.017596736073033845 }, "mmlu_pro_economics": { "alias": " - economics", "exact_match,custom-extract": 0.245260663507109, "exact_match_stderr,custom-extract": 0.01481830928170158 }, "mmlu_pro_engineering": { "alias": " - engineering", "exact_match,custom-extract": 0.058823529411764705, "exact_match_stderr,custom-extract": 0.007562639370979075 }, "mmlu_pro_health": { "alias": " - health", "exact_match,custom-extract": 0.13447432762836187, "exact_match_stderr,custom-extract": 0.011935720476846831 }, "mmlu_pro_history": { "alias": " - history", "exact_match,custom-extract": 0.14698162729658792, "exact_match_stderr,custom-extract": 0.018164310621441037 }, "mmlu_pro_law": { "alias": " - law", "exact_match,custom-extract": 0.13533151680290645, "exact_match_stderr,custom-extract": 0.010314019468785332 }, "mmlu_pro_math": { "alias": " - math", "exact_match,custom-extract": 0.12731310140636565, "exact_match_stderr,custom-extract": 0.009071913335559057 }, "mmlu_pro_other": { "alias": " - other", "exact_match,custom-extract": 0.15151515151515152, "exact_match_stderr,custom-extract": 0.011801826546500667 }, "mmlu_pro_philosophy": { "alias": " - philosophy", "exact_match,custom-extract": 0.12625250501002003, "exact_match_stderr,custom-extract": 0.014883268009546964 }, "mmlu_pro_physics": { "alias": " - physics", "exact_match,custom-extract": 0.0869899923017706, "exact_match_stderr,custom-extract": 0.007822310824931376 }, "mmlu_pro_psychology": { "alias": " - psychology", "exact_match,custom-extract": 0.2543859649122807, "exact_match_stderr,custom-extract": 0.0154267502913602 } }