--- license: apache-2.0 library_name: transformers tags: - merge pipeline_tag: text-generation model-index: - name: TheTop-5x7B-Instruct-S5-v0.1 results: - task: type: text-generation name: Text Generation dataset: name: AI2 Reasoning Challenge (25-Shot) type: ai2_arc config: ARC-Challenge split: test args: num_few_shot: 25 metrics: - type: acc_norm value: 72.53 name: normalized accuracy source: url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=MaziyarPanahi/TheTop-5x7B-Instruct-S5-v0.1 name: Open LLM Leaderboard - task: type: text-generation name: Text Generation dataset: name: HellaSwag (10-Shot) type: hellaswag split: validation args: num_few_shot: 10 metrics: - type: acc_norm value: 88.71 name: normalized accuracy source: url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=MaziyarPanahi/TheTop-5x7B-Instruct-S5-v0.1 name: Open LLM Leaderboard - task: type: text-generation name: Text Generation dataset: name: MMLU (5-Shot) type: cais/mmlu config: all split: test args: num_few_shot: 5 metrics: - type: acc value: 65.01 name: accuracy source: url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=MaziyarPanahi/TheTop-5x7B-Instruct-S5-v0.1 name: Open LLM Leaderboard - task: type: text-generation name: Text Generation dataset: name: TruthfulQA (0-shot) type: truthful_qa config: multiple_choice split: validation args: num_few_shot: 0 metrics: - type: mc2 value: 67.58 source: url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=MaziyarPanahi/TheTop-5x7B-Instruct-S5-v0.1 name: Open LLM Leaderboard - task: type: text-generation name: Text Generation dataset: name: Winogrande (5-shot) type: winogrande config: winogrande_xl split: validation args: num_few_shot: 5 metrics: - type: acc value: 86.19 name: accuracy source: url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=MaziyarPanahi/TheTop-5x7B-Instruct-S5-v0.1 name: Open LLM Leaderboard - task: type: text-generation name: Text Generation dataset: name: GSM8k (5-shot) type: gsm8k config: main split: test args: num_few_shot: 5 metrics: - type: acc value: 70.81 name: accuracy source: url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=MaziyarPanahi/TheTop-5x7B-Instruct-S5-v0.1 name: Open LLM Leaderboard --- Merge of top 7B models and the SLERP of other 7B models > mergekit is a toolkit for merging pre-trained language models. mergekit uses an out-of-core approach to perform unreasonably elaborate merges in resource-constrained situations. Merges can be run entirely on CPU or accelerated with as little as 8 GB of VRAM. Many merging algorithms are supported, with more coming as they catch my attention. ## Eval ![image/png](https://cdn-uploads.huggingface.co/production/uploads/5fd5e18a90b6dc4633f6d292/15qK3CpBMEySfjb0CiT4e.png) ```python { "all": { "acc": 0.6564118716978186, "acc_stderr": 0.03200912848183244, "acc_norm": 0.6553902167958241, "acc_norm_stderr": 0.03268788255929441, "mc1": 0.5312117503059975, "mc1_stderr": 0.01746936487457752, "mc2": 0.6758096547963126, "mc2_stderr": 0.015381620483561457 }, "harness|arc:challenge|25": { "acc": 0.6919795221843004, "acc_stderr": 0.013491429517292038, "acc_norm": 0.7252559726962458, "acc_norm_stderr": 0.013044617212771227 }, "harness|hellaswag|10": { "acc": 0.7234614618601872, "acc_stderr": 0.004463721071319078, "acc_norm": 0.8870742879904402, "acc_norm_stderr": 0.0031585512705264054 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.33, "acc_stderr": 0.047258156262526045, "acc_norm": 0.33, "acc_norm_stderr": 0.047258156262526045 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6518518518518519, "acc_stderr": 0.041153246103369526, "acc_norm": 0.6518518518518519, "acc_norm_stderr": 0.041153246103369526 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.7039473684210527, "acc_stderr": 0.03715062154998904, "acc_norm": 0.7039473684210527, "acc_norm_stderr": 0.03715062154998904 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.65, "acc_stderr": 0.0479372485441102, "acc_norm": 0.65, "acc_norm_stderr": 0.0479372485441102 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6943396226415094, "acc_stderr": 0.028353298073322663, "acc_norm": 0.6943396226415094, "acc_norm_stderr": 0.028353298073322663 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7708333333333334, "acc_stderr": 0.03514697467862388, "acc_norm": 0.7708333333333334, "acc_norm_stderr": 0.03514697467862388 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.49, "acc_stderr": 0.05024183937956912, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.28, "acc_stderr": 0.04512608598542126, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542126 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6820809248554913, "acc_stderr": 0.0355068398916558, "acc_norm": 0.6820809248554913, "acc_norm_stderr": 0.0355068398916558 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.38235294117647056, "acc_stderr": 0.04835503696107224, "acc_norm": 0.38235294117647056, "acc_norm_stderr": 0.04835503696107224 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.77, "acc_stderr": 0.04229525846816506, "acc_norm": 0.77, "acc_norm_stderr": 0.04229525846816506 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5957446808510638, "acc_stderr": 0.03208115750788684, "acc_norm": 0.5957446808510638, "acc_norm_stderr": 0.03208115750788684 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.5087719298245614, "acc_stderr": 0.04702880432049615, "acc_norm": 0.5087719298245614, "acc_norm_stderr": 0.04702880432049615 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5724137931034483, "acc_stderr": 0.04122737111370332, "acc_norm": 0.5724137931034483, "acc_norm_stderr": 0.04122737111370332 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.4312169312169312, "acc_stderr": 0.025506481698138208, "acc_norm": 0.4312169312169312, "acc_norm_stderr": 0.025506481698138208 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.5, "acc_stderr": 0.04472135954999579, "acc_norm": 0.5, "acc_norm_stderr": 0.04472135954999579 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.37, "acc_stderr": 0.04852365870939099, "acc_norm": 0.37, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7903225806451613, "acc_stderr": 0.023157879349083525, "acc_norm": 0.7903225806451613, "acc_norm_stderr": 0.023157879349083525 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.4975369458128079, "acc_stderr": 0.03517945038691063, "acc_norm": 0.4975369458128079, "acc_norm_stderr": 0.03517945038691063 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.66, "acc_stderr": 0.04760952285695237, "acc_norm": 0.66, "acc_norm_stderr": 0.04760952285695237 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7696969696969697, "acc_stderr": 0.0328766675860349, "acc_norm": 0.7696969696969697, "acc_norm_stderr": 0.0328766675860349 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7878787878787878, "acc_stderr": 0.029126522834586818, "acc_norm": 0.7878787878787878, "acc_norm_stderr": 0.029126522834586818 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9067357512953368, "acc_stderr": 0.020986854593289733, "acc_norm": 0.9067357512953368, "acc_norm_stderr": 0.020986854593289733 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6641025641025641, "acc_stderr": 0.023946724741563976, "acc_norm": 0.6641025641025641, "acc_norm_stderr": 0.023946724741563976 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3592592592592593, "acc_stderr": 0.02925290592725197, "acc_norm": 0.3592592592592593, "acc_norm_stderr": 0.02925290592725197 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.6764705882352942, "acc_stderr": 0.03038835355188679, "acc_norm": 0.6764705882352942, "acc_norm_stderr": 0.03038835355188679 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.36423841059602646, "acc_stderr": 0.03929111781242742, "acc_norm": 0.36423841059602646, "acc_norm_stderr": 0.03929111781242742 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.8385321100917431, "acc_stderr": 0.015776239256163224, "acc_norm": 0.8385321100917431, "acc_norm_stderr": 0.015776239256163224 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.5138888888888888, "acc_stderr": 0.03408655867977749, "acc_norm": 0.5138888888888888, "acc_norm_stderr": 0.03408655867977749 }, "harness|hendrycksTest-high_school_us_history|5": { "acc": 0.8529411764705882, "acc_stderr": 0.024857478080250447, "acc_norm": 0.8529411764705882, "acc_norm_stderr": 0.024857478080250447 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.8143459915611815, "acc_stderr": 0.025310495376944856, "acc_norm": 0.8143459915611815, "acc_norm_stderr": 0.025310495376944856 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.6816143497757847, "acc_stderr": 0.03126580522513713, "acc_norm": 0.6816143497757847, "acc_norm_stderr": 0.03126580522513713 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.816793893129771, "acc_stderr": 0.03392770926494733, "acc_norm": 0.816793893129771, "acc_norm_stderr": 0.03392770926494733 }, "harness|hendrycksTest-international_law|5": { "acc": 0.7933884297520661, "acc_stderr": 0.03695980128098824, "acc_norm": 0.7933884297520661, "acc_norm_stderr": 0.03695980128098824 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.7870370370370371, "acc_stderr": 0.0395783547198098, "acc_norm": 0.7870370370370371, "acc_norm_stderr": 0.0395783547198098 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.7607361963190185, "acc_stderr": 0.0335195387952127, "acc_norm": 0.7607361963190185, "acc_norm_stderr": 0.0335195387952127 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.48214285714285715, "acc_stderr": 0.047427623612430116, "acc_norm": 0.48214285714285715, "acc_norm_stderr": 0.047427623612430116 }, "harness|hendrycksTest-management|5": { "acc": 0.7864077669902912, "acc_stderr": 0.040580420156460344, "acc_norm": 0.7864077669902912, "acc_norm_stderr": 0.040580420156460344 }, "harness|hendrycksTest-marketing|5": { "acc": 0.8760683760683761, "acc_stderr": 0.021586494001281365, "acc_norm": 0.8760683760683761, "acc_norm_stderr": 0.021586494001281365 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.72, "acc_stderr": 0.04512608598542128, "acc_norm": 0.72, "acc_norm_stderr": 0.04512608598542128 }, "harness|hendrycksTest-miscellaneous|5": { "acc": 0.8250319284802043, "acc_stderr": 0.013586619219903341, "acc_norm": 0.8250319284802043, "acc_norm_stderr": 0.013586619219903341 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.7456647398843931, "acc_stderr": 0.02344582627654554, "acc_norm": 0.7456647398843931, "acc_norm_stderr": 0.02344582627654554 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.45251396648044695, "acc_stderr": 0.016646914804438778, "acc_norm": 0.45251396648044695, "acc_norm_stderr": 0.016646914804438778 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.7254901960784313, "acc_stderr": 0.02555316999182652, "acc_norm": 0.7254901960784313, "acc_norm_stderr": 0.02555316999182652 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.707395498392283, "acc_stderr": 0.02583989833487798, "acc_norm": 0.707395498392283, "acc_norm_stderr": 0.02583989833487798 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.7561728395061729, "acc_stderr": 0.02389187954195961, "acc_norm": 0.7561728395061729, "acc_norm_stderr": 0.02389187954195961 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.4645390070921986, "acc_stderr": 0.029752389657427047, "acc_norm": 0.4645390070921986, "acc_norm_stderr": 0.029752389657427047 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.47327249022164275, "acc_stderr": 0.01275197796767601, "acc_norm": 0.47327249022164275, "acc_norm_stderr": 0.01275197796767601 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.6838235294117647, "acc_stderr": 0.02824568739146292, "acc_norm": 0.6838235294117647, "acc_norm_stderr": 0.02824568739146292 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.6715686274509803, "acc_stderr": 0.018999707383162673, "acc_norm": 0.6715686274509803, "acc_norm_stderr": 0.018999707383162673 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.6545454545454545, "acc_stderr": 0.04554619617541054, "acc_norm": 0.6545454545454545, "acc_norm_stderr": 0.04554619617541054 }, "harness|hendrycksTest-security_studies|5": { "acc": 0.7306122448979592, "acc_stderr": 0.02840125202902294, "acc_norm": 0.7306122448979592, "acc_norm_stderr": 0.02840125202902294 }, "harness|hendrycksTest-sociology|5": { "acc": 0.8208955223880597, "acc_stderr": 0.027113286753111837, "acc_norm": 0.8208955223880597, "acc_norm_stderr": 0.027113286753111837 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.85, "acc_stderr": 0.03588702812826371, "acc_norm": 0.85, "acc_norm_stderr": 0.03588702812826371 }, "harness|hendrycksTest-virology|5": { "acc": 0.5542168674698795, "acc_stderr": 0.038695433234721015, "acc_norm": 0.5542168674698795, "acc_norm_stderr": 0.038695433234721015 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.8362573099415205, "acc_stderr": 0.028380919596145866, "acc_norm": 0.8362573099415205, "acc_norm_stderr": 0.028380919596145866 }, "harness|truthfulqa:mc|0": { "mc1": 0.5312117503059975, "mc1_stderr": 0.01746936487457752, "mc2": 0.6758096547963126, "mc2_stderr": 0.015381620483561457 }, "harness|winogrande|5": { "acc": 0.861878453038674, "acc_stderr": 0.00969698839367458 }, "harness|gsm8k|5": { "acc": 0.7081122062168309, "acc_stderr": 0.012522795894420867 } } ``` # [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_MaziyarPanahi__TheTop-5x7B-Instruct-S5-v0.1) | Metric |Value| |---------------------------------|----:| |Avg. |75.14| |AI2 Reasoning Challenge (25-Shot)|72.53| |HellaSwag (10-Shot) |88.71| |MMLU (5-Shot) |65.01| |TruthfulQA (0-shot) |67.58| |Winogrande (5-shot) |86.19| |GSM8k (5-shot) |70.81|