model,scenario,score,aggragated_from,source
flan_flan-ul2,Holmes,72.2,[],holmes_240829.csv
flan_t5_xxl,Holmes,70.5,[],holmes_240829.csv
t5_xxl_lm_adapt,Holmes,70.2,[],holmes_240829.csv
vicuna_13b_v1_5,Holmes,68.6,[],holmes_240829.csv
llama_2_70b_chat,Holmes,66.3,[],holmes_240829.csv
labradorite_13b,Holmes,66.1,[],holmes_240829.csv
llama_2_13b,Holmes,65.0,[],holmes_240829.csv
llama_2_13b_chat,Holmes,64.1,[],holmes_240829.csv
pythia_12b_deduped,Holmes,63.1,[],holmes_240829.csv
bart_base,Holmes,63.0,[],holmes_240829.csv
orca_2_13b,Holmes,62.7,[],holmes_240829.csv
pythia_6_9b_deduped,Holmes,62.3,[],holmes_240829.csv
flan-ul2,Holmes,60.5,[],holmes_240829.csv
flan_t5_xl,Holmes,60.0,[],holmes_240829.csv
t5_xl_lm_adapt,Holmes,59.5,[],holmes_240829.csv
electra_base_discriminator,Holmes,58.3,[],holmes_240829.csv
dolly_v2_12b,Holmes,58.2,[],holmes_240829.csv
pythia_12b,Holmes,58.0,[],holmes_240829.csv
tulu_2_13b,Holmes,57.6,[],holmes_240829.csv
pythia_6_9b,Holmes,56.6,[],holmes_240829.csv
deberta_v3_base,Holmes,56.0,[],holmes_240829.csv
pythia_2_8b_deduped,Holmes,56.0,[],holmes_240829.csv
llama_2_70b,Holmes,55.9,[],holmes_240829.csv
tulu_2_dpo_13b,Holmes,55.5,[],holmes_240829.csv
wizardlm_13b_v1_2,Holmes,55.4,[],holmes_240829.csv
deberta_base,Holmes,55.3,[],holmes_240829.csv
pythia_1_4b,Holmes,54.2,[],holmes_240829.csv
pythia_2_8b,Holmes,54.0,[],holmes_240829.csv
tulu_2_70b,Holmes,53.5,[],holmes_240829.csv
mistral_7b_instruct_v0_1,Holmes,52.9,[],holmes_240829.csv
albert_base_v2,Holmes,52.3,[],holmes_240829.csv
tk_instruct_11b_def,Holmes,51.7,[],holmes_240829.csv
tulu_2_dpo_70b,Holmes,51.2,[],holmes_240829.csv
flan_t5_large,Holmes,50.9,[],holmes_240829.csv
t5_base_lm_adapt,Holmes,48.7,[],holmes_240829.csv
flan_t5_base,Holmes,48.7,[],holmes_240829.csv
pythia_1b_deduped,Holmes,47.5,[],holmes_240829.csv
llama_2_7b,Holmes,47.2,[],holmes_240829.csv
pythia_1_4b_deduped,Holmes,47.2,[],holmes_240829.csv
mixtral_8x7b_instruct_v0_1,Holmes,46.5,[],holmes_240829.csv
bert_base_uncased,Holmes,45.3,[],holmes_240829.csv
mistral_7b_v0_1,Holmes,45.2,[],holmes_240829.csv
llama_2_7b_chat,Holmes,45.0,[],holmes_240829.csv
merlinite_7b,Holmes,44.1,[],holmes_240829.csv
roberta_base,Holmes,43.2,[],holmes_240829.csv
t5_large_lm_adapt,Holmes,42.4,[],holmes_240829.csv
mixtral_8x7b_v0_1,Holmes,42.2,[],holmes_240829.csv
gpt2,Holmes,40.6,[],holmes_240829.csv
pythia_410m,Holmes,40.0,[],holmes_240829.csv
flan_t5_small,Holmes,38.9,[],holmes_240829.csv
t5_small_lm_adapt,Holmes,36.0,[],holmes_240829.csv
pythia_410m_deduped,Holmes,31.3,[],holmes_240829.csv
glove_840b,Holmes,26.6,[],holmes_240829.csv
pythia_160m_deduped,Holmes,17.2,[],holmes_240829.csv
pythia_160m,Holmes,16.3,[],holmes_240829.csv
pythia_70m,Holmes,15.6,[],holmes_240829.csv
pythia_70m_deduped,Holmes,14.4,[],holmes_240829.csv
mistral_large_2_2407,eureka_information_retrieval_fact_recall,36.3,[],eureka_241002.csv
llama3_70b,eureka_information_retrieval_fact_recall,37.4,[],eureka_241002.csv
llama3_1_70b,eureka_information_retrieval_fact_recall,44.2,[],eureka_241002.csv
llama3_1_405b,eureka_information_retrieval_fact_recall,54.9,[],eureka_241002.csv
gpt_4o_2024_05_13,eureka_information_retrieval_fact_recall,53.7,[],eureka_241002.csv
gpt_4_1106_preview,eureka_information_retrieval_fact_recall,47.0,[],eureka_241002.csv
gemini_1_5_pro,eureka_information_retrieval_fact_recall,41.3,[],eureka_241002.csv
claude_3_opus,eureka_information_retrieval_fact_recall,50.5,[],eureka_241002.csv
claude_3_5_sonnet,eureka_information_retrieval_fact_recall,55.3,[],eureka_241002.csv
mistral_large_2_2407,eureka_information_retrieval_fact_precision,17.6,[],eureka_241002.csv
llama3_70b,eureka_information_retrieval_fact_precision,15.0,[],eureka_241002.csv
llama3_1_70b,eureka_information_retrieval_fact_precision,16.0,[],eureka_241002.csv
llama3_1_405b,eureka_information_retrieval_fact_precision,16.8,[],eureka_241002.csv
gpt_4o_2024_05_13,eureka_information_retrieval_fact_precision,20.3,[],eureka_241002.csv
gpt_4_1106_preview,eureka_information_retrieval_fact_precision,23.3,[],eureka_241002.csv
gemini_1_5_pro,eureka_information_retrieval_fact_precision,9.8,[],eureka_241002.csv
claude_3_opus,eureka_information_retrieval_fact_precision,18.7,[],eureka_241002.csv
claude_3_5_sonnet,eureka_information_retrieval_fact_precision,20.6,[],eureka_241002.csv
mistral_large_2_2407,eureka_instruction_following,77.3,[],eureka_241002.csv
llama3_70b,eureka_instruction_following,77.3,[],eureka_241002.csv
llama3_1_70b,eureka_instruction_following,80.8,[],eureka_241002.csv
llama3_1_405b,eureka_instruction_following,83.5,[],eureka_241002.csv
gpt_4o_2024_05_13,eureka_instruction_following,81.3,[],eureka_241002.csv
gpt_4_1106_preview,eureka_instruction_following,75.2,[],eureka_241002.csv
gemini_1_5_pro,eureka_instruction_following,75.2,[],eureka_241002.csv
claude_3_opus,eureka_instruction_following,81.9,[],eureka_241002.csv
claude_3_5_sonnet,eureka_instruction_following,81.3,[],eureka_241002.csv
mistral_large_2_2407,eureka_long_context_qa_average,84.8,[],eureka_241002.csv
llama3_70b,eureka_long_context_qa_average,86.8,[],eureka_241002.csv
llama3_1_70b,eureka_long_context_qa_average,93.9,[],eureka_241002.csv
llama3_1_405b,eureka_long_context_qa_average,96.6,[],eureka_241002.csv
gpt_4o_2024_05_13,eureka_long_context_qa_average,95.5,[],eureka_241002.csv
gpt_4_1106_preview,eureka_long_context_qa_average,92.7,[],eureka_241002.csv
gemini_1_5_pro,eureka_long_context_qa_average,87.7,[],eureka_241002.csv
claude_3_opus,eureka_long_context_qa_average,82.0,[],eureka_241002.csv
claude_3_5_sonnet,eureka_long_context_qa_average,84.5,[],eureka_241002.csv
mistral_large_2_2407,eureka_long_context_qa_longest_context_3k,66.7,[],eureka_241002.csv
llama3_70b,eureka_long_context_qa_longest_context_3k,74.9,[],eureka_241002.csv
llama3_1_70b,eureka_long_context_qa_longest_context_3k,86.4,[],eureka_241002.csv
llama3_1_405b,eureka_long_context_qa_longest_context_3k,92.5,[],eureka_241002.csv
gpt_4o_2024_05_13,eureka_long_context_qa_longest_context_3k,90.8,[],eureka_241002.csv
gpt_4_1106_preview,eureka_long_context_qa_longest_context_3k,85.5,[],eureka_241002.csv
gemini_1_5_pro,eureka_long_context_qa_longest_context_3k,75.8,[],eureka_241002.csv
claude_3_opus,eureka_long_context_qa_longest_context_3k,73.0,[],eureka_241002.csv
claude_3_5_sonnet,eureka_long_context_qa_longest_context_3k,75.6,[],eureka_241002.csv
mistral_large_2_2407,eureka_toxicity_detection,84.1,[],eureka_241002.csv
llama3_70b,eureka_toxicity_detection,87.4,[],eureka_241002.csv
llama3_1_70b,eureka_toxicity_detection,86.0,[],eureka_241002.csv
llama3_1_405b,eureka_toxicity_detection,57.1,[],eureka_241002.csv
gpt_4o_2024_05_13,eureka_toxicity_detection,86.1,[],eureka_241002.csv
gpt_4_1106_preview,eureka_toxicity_detection,84.1,[],eureka_241002.csv
gemini_1_5_pro,eureka_toxicity_detection,42.6,[],eureka_241002.csv
claude_3_opus,eureka_toxicity_detection,53.2,[],eureka_241002.csv
claude_3_5_sonnet,eureka_toxicity_detection,67.6,[],eureka_241002.csv
gpt_4o_2024_05_13,Helm Lite,0.963,[],helm_lite_240829.csv
claude_3_5_sonnet_20240620,Helm Lite,0.915,[],helm_lite_240829.csv
gpt_4_0613,Helm Lite,0.915,[],helm_lite_240829.csv
gpt_4_turbo_2024_04_09,Helm Lite,0.908,[],helm_lite_240829.csv
llama3_1_instruct_turbo_405b,Helm Lite,0.896,[],helm_lite_240829.csv
llama3_1_instruct_turbo_70b,Helm Lite,0.858,[],helm_lite_240829.csv
llama3_70b,Helm Lite,0.838,[],helm_lite_240829.csv
qwen2_instruct_72b,Helm Lite,0.827,[],helm_lite_240829.csv
mistral_large_2_2407,Helm Lite,0.803,[],helm_lite_240829.csv
gemini_1_5_pro_001,Helm Lite,0.793,[],helm_lite_240829.csv
gpt_4o_mini_2024_07_18,Helm Lite,0.776,[],helm_lite_240829.csv
mixtral_8x22b,Helm Lite,0.767,[],helm_lite_240829.csv
gpt_4_turbo_1106_preview,Helm Lite,0.758,[],helm_lite_240829.csv
palmyra_x_v3_72b,Helm Lite,0.749,[],helm_lite_240829.csv
gemma_2_instruct_27b,Helm Lite,0.742,[],helm_lite_240829.csv
gemini_1_5_flash_001,Helm Lite,0.733,[],helm_lite_240829.csv
claude_3_opus_20240229,Helm Lite,0.722,[],helm_lite_240829.csv
palm_2_unicorn,Helm Lite,0.703,[],helm_lite_240829.csv
qwen1_5_72b,Helm Lite,0.68,[],helm_lite_240829.csv
palmyra_x_v2_33b,Helm Lite,0.659,[],helm_lite_240829.csv
gemma_2_instruct_9b,Helm Lite,0.639,[],helm_lite_240829.csv
yi_34b,Helm Lite,0.634,[],helm_lite_240829.csv
qwen1_5_chat_110b,Helm Lite,0.619,[],helm_lite_240829.csv
qwen1_5_32b,Helm Lite,0.615,[],helm_lite_240829.csv
claude_v1_3,Helm Lite,0.594,[],helm_lite_240829.csv
palm_2_bison,Helm Lite,0.584,[],helm_lite_240829.csv
mixtral_8x7b_32k_seqlen,Helm Lite,0.582,[],helm_lite_240829.csv
phi_3_14b,Helm Lite,0.579,[],helm_lite_240829.csv
claude_2_0,Helm Lite,0.56,[],helm_lite_240829.csv
deepseek_llm_chat_67b,Helm Lite,0.556,[],helm_lite_240829.csv
phi_3_7b,Helm Lite,0.545,[],helm_lite_240829.csv
llama_2_70b,Helm Lite,0.537,[],helm_lite_240829.csv
yi_large_preview,Helm Lite,0.53,[],helm_lite_240829.csv
command_r_plus,Helm Lite,0.509,[],helm_lite_240829.csv
gpt_3_5_text_davinci_003,Helm Lite,0.503,[],helm_lite_240829.csv
claude_2_1,Helm Lite,0.503,[],helm_lite_240829.csv
qwen1_5_14b,Helm Lite,0.491,[],helm_lite_240829.csv
gemini_1_0_pro_002,Helm Lite,0.484,[],helm_lite_240829.csv
claude_instant_1_2,Helm Lite,0.464,[],helm_lite_240829.csv
llama3_8b,Helm Lite,0.441,[],helm_lite_240829.csv
gpt_3_5_turbo_0613,Helm Lite,0.42,[],helm_lite_240829.csv
claude_3_sonnet_20240229,Helm Lite,0.42,[],helm_lite_240829.csv
mistral_nemo_2402,Helm Lite,0.401,[],helm_lite_240829.csv
arctic_instruct,Helm Lite,0.399,[],helm_lite_240829.csv
gemma_7b,Helm Lite,0.392,[],helm_lite_240829.csv
gpt_3_5_text_davinci_002,Helm Lite,0.392,[],helm_lite_240829.csv
llama_65b,Helm Lite,0.39,[],helm_lite_240829.csv
mistral_large_2402,Helm Lite,0.382,[],helm_lite_240829.csv
command,Helm Lite,0.365,[],helm_lite_240829.csv
command_r,Helm Lite,0.35,[],helm_lite_240829.csv
llama3_1_instruct_turbo_8b,Helm Lite,0.347,[],helm_lite_240829.csv
mistral_small_2402,Helm Lite,0.342,[],helm_lite_240829.csv
dbrx_instructruct,Helm Lite,0.341,[],helm_lite_240829.csv
jamba_instruct,Helm Lite,0.339,[],helm_lite_240829.csv
mistral_v0_1_7b,Helm Lite,0.338,[],helm_lite_240829.csv
mistral_medium_2312,Helm Lite,0.318,[],helm_lite_240829.csv
qwen1_5_7b,Helm Lite,0.317,[],helm_lite_240829.csv
claude_3_haiku_20240307,Helm Lite,0.309,[],helm_lite_240829.csv
yi_6b,Helm Lite,0.289,[],helm_lite_240829.csv
llama_2_13b,Helm Lite,0.273,[],helm_lite_240829.csv
jurassic_2_jumbo_178b,Helm Lite,0.254,[],helm_lite_240829.csv
falcon_40b,Helm Lite,0.249,[],helm_lite_240829.csv
mistral_instruct_v0_3_7b,Helm Lite,0.233,[],helm_lite_240829.csv
jurassic_2_grande_17b,Helm Lite,0.203,[],helm_lite_240829.csv
phi_2,Helm Lite,0.202,[],helm_lite_240829.csv
llama_2_7b,Helm Lite,0.18,[],helm_lite_240829.csv
luminous_supreme_70b,Helm Lite,0.172,[],helm_lite_240829.csv
command_light,Helm Lite,0.125,[],helm_lite_240829.csv
luminous_extended_30b,Helm Lite,0.093,[],helm_lite_240829.csv
falcon_7b,Helm Lite,0.078,[],helm_lite_240829.csv
olmo_7b,Helm Lite,0.063,[],helm_lite_240829.csv
luminous_base_13b,Helm Lite,0.052,[],helm_lite_240829.csv
gpt_4o_2024_05_13,Helm Lite NarrativeQA,0.804,[],helm_lite_240829.csv
claude_3_5_sonnet_20240620,Helm Lite NarrativeQA,0.746,[],helm_lite_240829.csv
gpt_4_0613,Helm Lite NarrativeQA,0.768,[],helm_lite_240829.csv
gpt_4_turbo_2024_04_09,Helm Lite NarrativeQA,0.761,[],helm_lite_240829.csv
llama3_1_instruct_turbo_405b,Helm Lite NarrativeQA,0.749,[],helm_lite_240829.csv
llama3_1_instruct_turbo_70b,Helm Lite NarrativeQA,0.772,[],helm_lite_240829.csv
llama3_70b,Helm Lite NarrativeQA,0.798,[],helm_lite_240829.csv
qwen2_instruct_72b,Helm Lite NarrativeQA,0.727,[],helm_lite_240829.csv
mistral_large_2_2407,Helm Lite NarrativeQA,0.779,[],helm_lite_240829.csv
gemini_1_5_pro_001,Helm Lite NarrativeQA,0.783,[],helm_lite_240829.csv
gpt_4o_mini_2024_07_18,Helm Lite NarrativeQA,0.768,[],helm_lite_240829.csv
mixtral_8x22b,Helm Lite NarrativeQA,0.779,[],helm_lite_240829.csv
gpt_4_turbo_1106_preview,Helm Lite NarrativeQA,0.727,[],helm_lite_240829.csv
palmyra_x_v3_72b,Helm Lite NarrativeQA,0.706,[],helm_lite_240829.csv
gemma_2_instruct_27b,Helm Lite NarrativeQA,0.79,[],helm_lite_240829.csv
gemini_1_5_flash_001,Helm Lite NarrativeQA,0.783,[],helm_lite_240829.csv
claude_3_opus_20240229,Helm Lite NarrativeQA,0.351,[],helm_lite_240829.csv
palm_2_unicorn,Helm Lite NarrativeQA,0.583,[],helm_lite_240829.csv
qwen1_5_72b,Helm Lite NarrativeQA,0.601,[],helm_lite_240829.csv
palmyra_x_v2_33b,Helm Lite NarrativeQA,0.752,[],helm_lite_240829.csv
gemma_2_instruct_9b,Helm Lite NarrativeQA,0.768,[],helm_lite_240829.csv
yi_34b,Helm Lite NarrativeQA,0.782,[],helm_lite_240829.csv
qwen1_5_chat_110b,Helm Lite NarrativeQA,0.721,[],helm_lite_240829.csv
qwen1_5_32b,Helm Lite NarrativeQA,0.589,[],helm_lite_240829.csv
claude_v1_3,Helm Lite NarrativeQA,0.723,[],helm_lite_240829.csv
palm_2_bison,Helm Lite NarrativeQA,0.718,[],helm_lite_240829.csv
mixtral_8x7b_32k_seqlen,Helm Lite NarrativeQA,0.767,[],helm_lite_240829.csv
phi_3_14b,Helm Lite NarrativeQA,0.724,[],helm_lite_240829.csv
claude_2_0,Helm Lite NarrativeQA,0.718,[],helm_lite_240829.csv
deepseek_llm_chat_67b,Helm Lite NarrativeQA,0.581,[],helm_lite_240829.csv
phi_3_7b,Helm Lite NarrativeQA,0.754,[],helm_lite_240829.csv
llama_2_70b,Helm Lite NarrativeQA,0.763,[],helm_lite_240829.csv
yi_large_preview,Helm Lite NarrativeQA,0.373,[],helm_lite_240829.csv
command_r_plus,Helm Lite NarrativeQA,0.735,[],helm_lite_240829.csv
gpt_3_5_text_davinci_003,Helm Lite NarrativeQA,0.731,[],helm_lite_240829.csv
claude_2_1,Helm Lite NarrativeQA,0.677,[],helm_lite_240829.csv
qwen1_5_14b,Helm Lite NarrativeQA,0.711,[],helm_lite_240829.csv
gemini_1_0_pro_002,Helm Lite NarrativeQA,0.751,[],helm_lite_240829.csv
claude_instant_1_2,Helm Lite NarrativeQA,0.616,[],helm_lite_240829.csv
llama3_8b,Helm Lite NarrativeQA,0.754,[],helm_lite_240829.csv
gpt_3_5_turbo_0613,Helm Lite NarrativeQA,0.655,[],helm_lite_240829.csv
claude_3_sonnet_20240229,Helm Lite NarrativeQA,0.111,[],helm_lite_240829.csv
mistral_nemo_2402,Helm Lite NarrativeQA,0.731,[],helm_lite_240829.csv
arctic_instruct,Helm Lite NarrativeQA,0.654,[],helm_lite_240829.csv
gemma_7b,Helm Lite NarrativeQA,0.752,[],helm_lite_240829.csv
gpt_3_5_text_davinci_002,Helm Lite NarrativeQA,0.719,[],helm_lite_240829.csv
llama_65b,Helm Lite NarrativeQA,0.755,[],helm_lite_240829.csv
mistral_large_2402,Helm Lite NarrativeQA,0.454,[],helm_lite_240829.csv
command,Helm Lite NarrativeQA,0.749,[],helm_lite_240829.csv
command_r,Helm Lite NarrativeQA,0.742,[],helm_lite_240829.csv
llama3_1_instruct_turbo_8b,Helm Lite NarrativeQA,0.756,[],helm_lite_240829.csv
mistral_small_2402,Helm Lite NarrativeQA,0.519,[],helm_lite_240829.csv
dbrx_instructruct,Helm Lite NarrativeQA,0.488,[],helm_lite_240829.csv
jamba_instruct,Helm Lite NarrativeQA,0.658,[],helm_lite_240829.csv
mistral_v0_1_7b,Helm Lite NarrativeQA,0.716,[],helm_lite_240829.csv
mistral_medium_2312,Helm Lite NarrativeQA,0.449,[],helm_lite_240829.csv
qwen1_5_7b,Helm Lite NarrativeQA,0.448,[],helm_lite_240829.csv
claude_3_haiku_20240307,Helm Lite NarrativeQA,0.244,[],helm_lite_240829.csv
yi_6b,Helm Lite NarrativeQA,0.702,[],helm_lite_240829.csv
llama_2_13b,Helm Lite NarrativeQA,0.741,[],helm_lite_240829.csv
jurassic_2_jumbo_178b,Helm Lite NarrativeQA,0.728,[],helm_lite_240829.csv
falcon_40b,Helm Lite NarrativeQA,0.671,[],helm_lite_240829.csv
mistral_instruct_v0_3_7b,Helm Lite NarrativeQA,0.716,[],helm_lite_240829.csv
jurassic_2_grande_17b,Helm Lite NarrativeQA,0.744,[],helm_lite_240829.csv
phi_2,Helm Lite NarrativeQA,0.703,[],helm_lite_240829.csv
llama_2_7b,Helm Lite NarrativeQA,0.686,[],helm_lite_240829.csv
luminous_supreme_70b,Helm Lite NarrativeQA,0.743,[],helm_lite_240829.csv
command_light,Helm Lite NarrativeQA,0.629,[],helm_lite_240829.csv
luminous_extended_30b,Helm Lite NarrativeQA,0.684,[],helm_lite_240829.csv
falcon_7b,Helm Lite NarrativeQA,0.621,[],helm_lite_240829.csv
olmo_7b,Helm Lite NarrativeQA,0.597,[],helm_lite_240829.csv
luminous_base_13b,Helm Lite NarrativeQA,0.633,[],helm_lite_240829.csv
gpt_4o_2024_05_13,Helm Lite NaturalQuestionsOpen,0.803,[],helm_lite_240829.csv
claude_3_5_sonnet_20240620,Helm Lite NaturalQuestionsOpen,0.749,[],helm_lite_240829.csv
gpt_4_0613,Helm Lite NaturalQuestionsOpen,0.79,[],helm_lite_240829.csv
gpt_4_turbo_2024_04_09,Helm Lite NaturalQuestionsOpen,0.795,[],helm_lite_240829.csv
llama3_1_instruct_turbo_405b,Helm Lite NaturalQuestionsOpen,0.756,[],helm_lite_240829.csv
llama3_1_instruct_turbo_70b,Helm Lite NaturalQuestionsOpen,0.738,[],helm_lite_240829.csv
llama3_70b,Helm Lite NaturalQuestionsOpen,0.743,[],helm_lite_240829.csv
qwen2_instruct_72b,Helm Lite NaturalQuestionsOpen,0.776,[],helm_lite_240829.csv
mistral_large_2_2407,Helm Lite NaturalQuestionsOpen,0.734,[],helm_lite_240829.csv
gemini_1_5_pro_001,Helm Lite NaturalQuestionsOpen,0.748,[],helm_lite_240829.csv
gpt_4o_mini_2024_07_18,Helm Lite NaturalQuestionsOpen,0.746,[],helm_lite_240829.csv
mixtral_8x22b,Helm Lite NaturalQuestionsOpen,0.726,[],helm_lite_240829.csv
gpt_4_turbo_1106_preview,Helm Lite NaturalQuestionsOpen,0.763,[],helm_lite_240829.csv
palmyra_x_v3_72b,Helm Lite NaturalQuestionsOpen,0.685,[],helm_lite_240829.csv
gemma_2_instruct_27b,Helm Lite NaturalQuestionsOpen,0.731,[],helm_lite_240829.csv
gemini_1_5_flash_001,Helm Lite NaturalQuestionsOpen,0.723,[],helm_lite_240829.csv
claude_3_opus_20240229,Helm Lite NaturalQuestionsOpen,0.264,[],helm_lite_240829.csv
palm_2_unicorn,Helm Lite NaturalQuestionsOpen,0.674,[],helm_lite_240829.csv
qwen1_5_72b,Helm Lite NaturalQuestionsOpen,0.758,[],helm_lite_240829.csv
palmyra_x_v2_33b,Helm Lite NaturalQuestionsOpen,0.752,[],helm_lite_240829.csv
gemma_2_instruct_9b,Helm Lite NaturalQuestionsOpen,0.738,[],helm_lite_240829.csv
yi_34b,Helm Lite NaturalQuestionsOpen,0.775,[],helm_lite_240829.csv
qwen1_5_chat_110b,Helm Lite NaturalQuestionsOpen,0.739,[],helm_lite_240829.csv
qwen1_5_32b,Helm Lite NaturalQuestionsOpen,0.777,[],helm_lite_240829.csv
claude_v1_3,Helm Lite NaturalQuestionsOpen,0.699,[],helm_lite_240829.csv
palm_2_bison,Helm Lite NaturalQuestionsOpen,0.813,[],helm_lite_240829.csv
mixtral_8x7b_32k_seqlen,Helm Lite NaturalQuestionsOpen,0.699,[],helm_lite_240829.csv
phi_3_14b,Helm Lite NaturalQuestionsOpen,0.729,[],helm_lite_240829.csv
claude_2_0,Helm Lite NaturalQuestionsOpen,0.67,[],helm_lite_240829.csv
deepseek_llm_chat_67b,Helm Lite NaturalQuestionsOpen,0.733,[],helm_lite_240829.csv
phi_3_7b,Helm Lite NaturalQuestionsOpen,0.675,[],helm_lite_240829.csv
llama_2_70b,Helm Lite NaturalQuestionsOpen,0.674,[],helm_lite_240829.csv
yi_large_preview,Helm Lite NaturalQuestionsOpen,0.586,[],helm_lite_240829.csv
command_r_plus,Helm Lite NaturalQuestionsOpen,0.711,[],helm_lite_240829.csv
gpt_3_5_text_davinci_003,Helm Lite NaturalQuestionsOpen,0.77,[],helm_lite_240829.csv
claude_2_1,Helm Lite NaturalQuestionsOpen,0.611,[],helm_lite_240829.csv
qwen1_5_14b,Helm Lite NaturalQuestionsOpen,0.772,[],helm_lite_240829.csv
gemini_1_0_pro_002,Helm Lite NaturalQuestionsOpen,0.714,[],helm_lite_240829.csv
claude_instant_1_2,Helm Lite NaturalQuestionsOpen,0.731,[],helm_lite_240829.csv
llama3_8b,Helm Lite NaturalQuestionsOpen,0.681,[],helm_lite_240829.csv
gpt_3_5_turbo_0613,Helm Lite NaturalQuestionsOpen,0.678,[],helm_lite_240829.csv
claude_3_sonnet_20240229,Helm Lite NaturalQuestionsOpen,0.072,[],helm_lite_240829.csv
mistral_nemo_2402,Helm Lite NaturalQuestionsOpen,0.65,[],helm_lite_240829.csv
arctic_instruct,Helm Lite NaturalQuestionsOpen,0.586,[],helm_lite_240829.csv
gemma_7b,Helm Lite NaturalQuestionsOpen,0.665,[],helm_lite_240829.csv
gpt_3_5_text_davinci_002,Helm Lite NaturalQuestionsOpen,0.71,[],helm_lite_240829.csv
llama_65b,Helm Lite NaturalQuestionsOpen,0.672,[],helm_lite_240829.csv
mistral_large_2402,Helm Lite NaturalQuestionsOpen,0.485,[],helm_lite_240829.csv
command,Helm Lite NaturalQuestionsOpen,0.777,[],helm_lite_240829.csv
command_r,Helm Lite NaturalQuestionsOpen,0.72,[],helm_lite_240829.csv
llama3_1_instruct_turbo_8b,Helm Lite NaturalQuestionsOpen,0.677,[],helm_lite_240829.csv
mistral_small_2402,Helm Lite NaturalQuestionsOpen,0.587,[],helm_lite_240829.csv
dbrx_instructruct,Helm Lite NaturalQuestionsOpen,0.55,[],helm_lite_240829.csv
jamba_instruct,Helm Lite NaturalQuestionsOpen,0.636,[],helm_lite_240829.csv
mistral_v0_1_7b,Helm Lite NaturalQuestionsOpen,0.687,[],helm_lite_240829.csv
mistral_medium_2312,Helm Lite NaturalQuestionsOpen,0.468,[],helm_lite_240829.csv
qwen1_5_7b,Helm Lite NaturalQuestionsOpen,0.749,[],helm_lite_240829.csv
claude_3_haiku_20240307,Helm Lite NaturalQuestionsOpen,0.252,[],helm_lite_240829.csv
yi_6b,Helm Lite NaturalQuestionsOpen,0.748,[],helm_lite_240829.csv
llama_2_13b,Helm Lite NaturalQuestionsOpen,0.64,[],helm_lite_240829.csv
jurassic_2_jumbo_178b,Helm Lite NaturalQuestionsOpen,0.65,[],helm_lite_240829.csv
falcon_40b,Helm Lite NaturalQuestionsOpen,0.676,[],helm_lite_240829.csv
mistral_instruct_v0_3_7b,Helm Lite NaturalQuestionsOpen,0.68,[],helm_lite_240829.csv
jurassic_2_grande_17b,Helm Lite NaturalQuestionsOpen,0.627,[],helm_lite_240829.csv
phi_2,Helm Lite NaturalQuestionsOpen,0.68,[],helm_lite_240829.csv
llama_2_7b,Helm Lite NaturalQuestionsOpen,0.612,[],helm_lite_240829.csv
luminous_supreme_70b,Helm Lite NaturalQuestionsOpen,0.656,[],helm_lite_240829.csv
command_light,Helm Lite NaturalQuestionsOpen,0.686,[],helm_lite_240829.csv
luminous_extended_30b,Helm Lite NaturalQuestionsOpen,0.611,[],helm_lite_240829.csv
falcon_7b,Helm Lite NaturalQuestionsOpen,0.58,[],helm_lite_240829.csv
olmo_7b,Helm Lite NaturalQuestionsOpen,0.603,[],helm_lite_240829.csv
luminous_base_13b,Helm Lite NaturalQuestionsOpen,0.577,[],helm_lite_240829.csv
gpt_4o_2024_05_13,Helm Lite NaturalQuestionsClosed,0.501,[],helm_lite_240829.csv
claude_3_5_sonnet_20240620,Helm Lite NaturalQuestionsClosed,0.502,[],helm_lite_240829.csv
gpt_4_0613,Helm Lite NaturalQuestionsClosed,0.457,[],helm_lite_240829.csv
gpt_4_turbo_2024_04_09,Helm Lite NaturalQuestionsClosed,0.482,[],helm_lite_240829.csv
llama3_1_instruct_turbo_405b,Helm Lite NaturalQuestionsClosed,0.456,[],helm_lite_240829.csv
llama3_1_instruct_turbo_70b,Helm Lite NaturalQuestionsClosed,0.452,[],helm_lite_240829.csv
llama3_70b,Helm Lite NaturalQuestionsClosed,0.475,[],helm_lite_240829.csv
qwen2_instruct_72b,Helm Lite NaturalQuestionsClosed,0.39,[],helm_lite_240829.csv
mistral_large_2_2407,Helm Lite NaturalQuestionsClosed,0.453,[],helm_lite_240829.csv
gemini_1_5_pro_001,Helm Lite NaturalQuestionsClosed,0.378,[],helm_lite_240829.csv
gpt_4o_mini_2024_07_18,Helm Lite NaturalQuestionsClosed,0.386,[],helm_lite_240829.csv
mixtral_8x22b,Helm Lite NaturalQuestionsClosed,0.478,[],helm_lite_240829.csv
gpt_4_turbo_1106_preview,Helm Lite NaturalQuestionsClosed,0.435,[],helm_lite_240829.csv
palmyra_x_v3_72b,Helm Lite NaturalQuestionsClosed,0.407,[],helm_lite_240829.csv
gemma_2_instruct_27b,Helm Lite NaturalQuestionsClosed,0.353,[],helm_lite_240829.csv
gemini_1_5_flash_001,Helm Lite NaturalQuestionsClosed,0.332,[],helm_lite_240829.csv
claude_3_opus_20240229,Helm Lite NaturalQuestionsClosed,0.441,[],helm_lite_240829.csv
palm_2_unicorn,Helm Lite NaturalQuestionsClosed,0.435,[],helm_lite_240829.csv
qwen1_5_72b,Helm Lite NaturalQuestionsClosed,0.417,[],helm_lite_240829.csv
palmyra_x_v2_33b,Helm Lite NaturalQuestionsClosed,0.428,[],helm_lite_240829.csv
gemma_2_instruct_9b,Helm Lite NaturalQuestionsClosed,0.328,[],helm_lite_240829.csv
yi_34b,Helm Lite NaturalQuestionsClosed,0.443,[],helm_lite_240829.csv
qwen1_5_chat_110b,Helm Lite NaturalQuestionsClosed,0.35,[],helm_lite_240829.csv
qwen1_5_32b,Helm Lite NaturalQuestionsClosed,0.353,[],helm_lite_240829.csv
claude_v1_3,Helm Lite NaturalQuestionsClosed,0.409,[],helm_lite_240829.csv
palm_2_bison,Helm Lite NaturalQuestionsClosed,0.39,[],helm_lite_240829.csv
mixtral_8x7b_32k_seqlen,Helm Lite NaturalQuestionsClosed,0.427,[],helm_lite_240829.csv
phi_3_14b,Helm Lite NaturalQuestionsClosed,0.278,[],helm_lite_240829.csv
claude_2_0,Helm Lite NaturalQuestionsClosed,0.428,[],helm_lite_240829.csv
deepseek_llm_chat_67b,Helm Lite NaturalQuestionsClosed,0.412,[],helm_lite_240829.csv
phi_3_7b,Helm Lite NaturalQuestionsClosed,0.324,[],helm_lite_240829.csv
llama_2_70b,Helm Lite NaturalQuestionsClosed,0.46,[],helm_lite_240829.csv
yi_large_preview,Helm Lite NaturalQuestionsClosed,0.428,[],helm_lite_240829.csv
command_r_plus,Helm Lite NaturalQuestionsClosed,0.343,[],helm_lite_240829.csv
gpt_3_5_text_davinci_003,Helm Lite NaturalQuestionsClosed,0.413,[],helm_lite_240829.csv
claude_2_1,Helm Lite NaturalQuestionsClosed,0.375,[],helm_lite_240829.csv
qwen1_5_14b,Helm Lite NaturalQuestionsClosed,0.3,[],helm_lite_240829.csv
gemini_1_0_pro_002,Helm Lite NaturalQuestionsClosed,0.391,[],helm_lite_240829.csv
claude_instant_1_2,Helm Lite NaturalQuestionsClosed,0.343,[],helm_lite_240829.csv
llama3_8b,Helm Lite NaturalQuestionsClosed,0.378,[],helm_lite_240829.csv
gpt_3_5_turbo_0613,Helm Lite NaturalQuestionsClosed,0.335,[],helm_lite_240829.csv
claude_3_sonnet_20240229,Helm Lite NaturalQuestionsClosed,0.028,[],helm_lite_240829.csv
mistral_nemo_2402,Helm Lite NaturalQuestionsClosed,0.265,[],helm_lite_240829.csv
arctic_instruct,Helm Lite NaturalQuestionsClosed,0.39,[],helm_lite_240829.csv
gemma_7b,Helm Lite NaturalQuestionsClosed,0.336,[],helm_lite_240829.csv
gpt_3_5_text_davinci_002,Helm Lite NaturalQuestionsClosed,0.394,[],helm_lite_240829.csv
llama_65b,Helm Lite NaturalQuestionsClosed,0.433,[],helm_lite_240829.csv
mistral_large_2402,Helm Lite NaturalQuestionsClosed,0.311,[],helm_lite_240829.csv
command,Helm Lite NaturalQuestionsClosed,0.391,[],helm_lite_240829.csv
command_r,Helm Lite NaturalQuestionsClosed,0.352,[],helm_lite_240829.csv
llama3_1_instruct_turbo_8b,Helm Lite NaturalQuestionsClosed,0.209,[],helm_lite_240829.csv
mistral_small_2402,Helm Lite NaturalQuestionsClosed,0.304,[],helm_lite_240829.csv
dbrx_instructruct,Helm Lite NaturalQuestionsClosed,0.284,[],helm_lite_240829.csv
jamba_instruct,Helm Lite NaturalQuestionsClosed,0.384,[],helm_lite_240829.csv
mistral_v0_1_7b,Helm Lite NaturalQuestionsClosed,0.367,[],helm_lite_240829.csv
mistral_medium_2312,Helm Lite NaturalQuestionsClosed,0.29,[],helm_lite_240829.csv
qwen1_5_7b,Helm Lite NaturalQuestionsClosed,0.27,[],helm_lite_240829.csv
claude_3_haiku_20240307,Helm Lite NaturalQuestionsClosed,0.144,[],helm_lite_240829.csv
yi_6b,Helm Lite NaturalQuestionsClosed,0.31,[],helm_lite_240829.csv
llama_2_13b,Helm Lite NaturalQuestionsClosed,0.371,[],helm_lite_240829.csv
jurassic_2_jumbo_178b,Helm Lite NaturalQuestionsClosed,0.385,[],helm_lite_240829.csv
falcon_40b,Helm Lite NaturalQuestionsClosed,0.392,[],helm_lite_240829.csv
mistral_instruct_v0_3_7b,Helm Lite NaturalQuestionsClosed,0.253,[],helm_lite_240829.csv
jurassic_2_grande_17b,Helm Lite NaturalQuestionsClosed,0.35,[],helm_lite_240829.csv
phi_2,Helm Lite NaturalQuestionsClosed,0.155,[],helm_lite_240829.csv
llama_2_7b,Helm Lite NaturalQuestionsClosed,0.333,[],helm_lite_240829.csv
luminous_supreme_70b,Helm Lite NaturalQuestionsClosed,0.299,[],helm_lite_240829.csv
command_light,Helm Lite NaturalQuestionsClosed,0.195,[],helm_lite_240829.csv
luminous_extended_30b,Helm Lite NaturalQuestionsClosed,0.253,[],helm_lite_240829.csv
falcon_7b,Helm Lite NaturalQuestionsClosed,0.285,[],helm_lite_240829.csv
olmo_7b,Helm Lite NaturalQuestionsClosed,0.259,[],helm_lite_240829.csv
luminous_base_13b,Helm Lite NaturalQuestionsClosed,0.197,[],helm_lite_240829.csv
gpt_4o_2024_05_13,Helm Lite OpenBookQA,0.966,[],helm_lite_240829.csv
claude_3_5_sonnet_20240620,Helm Lite OpenBookQA,0.972,[],helm_lite_240829.csv
gpt_4_0613,Helm Lite OpenBookQA,0.96,[],helm_lite_240829.csv
gpt_4_turbo_2024_04_09,Helm Lite OpenBookQA,0.97,[],helm_lite_240829.csv
llama3_1_instruct_turbo_405b,Helm Lite OpenBookQA,0.94,[],helm_lite_240829.csv
llama3_1_instruct_turbo_70b,Helm Lite OpenBookQA,0.938,[],helm_lite_240829.csv
llama3_70b,Helm Lite OpenBookQA,0.934,[],helm_lite_240829.csv
qwen2_instruct_72b,Helm Lite OpenBookQA,0.954,[],helm_lite_240829.csv
mistral_large_2_2407,Helm Lite OpenBookQA,0.932,[],helm_lite_240829.csv
gemini_1_5_pro_001,Helm Lite OpenBookQA,0.902,[],helm_lite_240829.csv
gpt_4o_mini_2024_07_18,Helm Lite OpenBookQA,0.92,[],helm_lite_240829.csv
mixtral_8x22b,Helm Lite OpenBookQA,0.882,[],helm_lite_240829.csv
gpt_4_turbo_1106_preview,Helm Lite OpenBookQA,0.95,[],helm_lite_240829.csv
palmyra_x_v3_72b,Helm Lite OpenBookQA,0.938,[],helm_lite_240829.csv
gemma_2_instruct_27b,Helm Lite OpenBookQA,0.918,[],helm_lite_240829.csv
gemini_1_5_flash_001,Helm Lite OpenBookQA,0.928,[],helm_lite_240829.csv
claude_3_opus_20240229,Helm Lite OpenBookQA,0.956,[],helm_lite_240829.csv
palm_2_unicorn,Helm Lite OpenBookQA,0.938,[],helm_lite_240829.csv
qwen1_5_72b,Helm Lite OpenBookQA,0.93,[],helm_lite_240829.csv
palmyra_x_v2_33b,Helm Lite OpenBookQA,0.878,[],helm_lite_240829.csv
gemma_2_instruct_9b,Helm Lite OpenBookQA,0.91,[],helm_lite_240829.csv
yi_34b,Helm Lite OpenBookQA,0.92,[],helm_lite_240829.csv
qwen1_5_chat_110b,Helm Lite OpenBookQA,0.922,[],helm_lite_240829.csv
qwen1_5_32b,Helm Lite OpenBookQA,0.932,[],helm_lite_240829.csv
claude_v1_3,Helm Lite OpenBookQA,0.908,[],helm_lite_240829.csv
palm_2_bison,Helm Lite OpenBookQA,0.878,[],helm_lite_240829.csv
mixtral_8x7b_32k_seqlen,Helm Lite OpenBookQA,0.868,[],helm_lite_240829.csv
phi_3_14b,Helm Lite OpenBookQA,0.916,[],helm_lite_240829.csv
claude_2_0,Helm Lite OpenBookQA,0.862,[],helm_lite_240829.csv
deepseek_llm_chat_67b,Helm Lite OpenBookQA,0.88,[],helm_lite_240829.csv
phi_3_7b,Helm Lite OpenBookQA,0.912,[],helm_lite_240829.csv
llama_2_70b,Helm Lite OpenBookQA,0.838,[],helm_lite_240829.csv
yi_large_preview,Helm Lite OpenBookQA,0.946,[],helm_lite_240829.csv
command_r_plus,Helm Lite OpenBookQA,0.828,[],helm_lite_240829.csv
gpt_3_5_text_davinci_003,Helm Lite OpenBookQA,0.828,[],helm_lite_240829.csv
claude_2_1,Helm Lite OpenBookQA,0.872,[],helm_lite_240829.csv
qwen1_5_14b,Helm Lite OpenBookQA,0.862,[],helm_lite_240829.csv
gemini_1_0_pro_002,Helm Lite OpenBookQA,0.788,[],helm_lite_240829.csv
claude_instant_1_2,Helm Lite OpenBookQA,0.844,[],helm_lite_240829.csv
llama3_8b,Helm Lite OpenBookQA,0.766,[],helm_lite_240829.csv
gpt_3_5_turbo_0613,Helm Lite OpenBookQA,0.838,[],helm_lite_240829.csv
claude_3_sonnet_20240229,Helm Lite OpenBookQA,0.918,[],helm_lite_240829.csv
mistral_nemo_2402,Helm Lite OpenBookQA,0.822,[],helm_lite_240829.csv
arctic_instruct,Helm Lite OpenBookQA,0.828,[],helm_lite_240829.csv
gemma_7b,Helm Lite OpenBookQA,0.808,[],helm_lite_240829.csv
gpt_3_5_text_davinci_002,Helm Lite OpenBookQA,0.796,[],helm_lite_240829.csv
llama_65b,Helm Lite OpenBookQA,0.754,[],helm_lite_240829.csv
mistral_large_2402,Helm Lite OpenBookQA,0.894,[],helm_lite_240829.csv
command,Helm Lite OpenBookQA,0.774,[],helm_lite_240829.csv
command_r,Helm Lite OpenBookQA,0.782,[],helm_lite_240829.csv
llama3_1_instruct_turbo_8b,Helm Lite OpenBookQA,0.74,[],helm_lite_240829.csv
mistral_small_2402,Helm Lite OpenBookQA,0.862,[],helm_lite_240829.csv
dbrx_instructruct,Helm Lite OpenBookQA,0.91,[],helm_lite_240829.csv
jamba_instruct,Helm Lite OpenBookQA,0.796,[],helm_lite_240829.csv
mistral_v0_1_7b,Helm Lite OpenBookQA,0.776,[],helm_lite_240829.csv
mistral_medium_2312,Helm Lite OpenBookQA,0.83,[],helm_lite_240829.csv
qwen1_5_7b,Helm Lite OpenBookQA,0.806,[],helm_lite_240829.csv
claude_3_haiku_20240307,Helm Lite OpenBookQA,0.838,[],helm_lite_240829.csv
yi_6b,Helm Lite OpenBookQA,0.8,[],helm_lite_240829.csv
llama_2_13b,Helm Lite OpenBookQA,0.634,[],helm_lite_240829.csv
jurassic_2_jumbo_178b,Helm Lite OpenBookQA,0.688,[],helm_lite_240829.csv
falcon_40b,Helm Lite OpenBookQA,0.662,[],helm_lite_240829.csv
mistral_instruct_v0_3_7b,Helm Lite OpenBookQA,0.79,[],helm_lite_240829.csv
jurassic_2_grande_17b,Helm Lite OpenBookQA,0.614,[],helm_lite_240829.csv
phi_2,Helm Lite OpenBookQA,0.798,[],helm_lite_240829.csv
llama_2_7b,Helm Lite OpenBookQA,0.544,[],helm_lite_240829.csv
luminous_supreme_70b,Helm Lite OpenBookQA,0.284,[],helm_lite_240829.csv
command_light,Helm Lite OpenBookQA,0.398,[],helm_lite_240829.csv
luminous_extended_30b,Helm Lite OpenBookQA,0.272,[],helm_lite_240829.csv
falcon_7b,Helm Lite OpenBookQA,0.26,[],helm_lite_240829.csv
olmo_7b,Helm Lite OpenBookQA,0.222,[],helm_lite_240829.csv
luminous_base_13b,Helm Lite OpenBookQA,0.286,[],helm_lite_240829.csv
gpt_4o_2024_05_13,Helm Lite MMLU,0.748,[],helm_lite_240829.csv
claude_3_5_sonnet_20240620,Helm Lite MMLU,0.799,[],helm_lite_240829.csv
gpt_4_0613,Helm Lite MMLU,0.735,[],helm_lite_240829.csv
gpt_4_turbo_2024_04_09,Helm Lite MMLU,0.711,[],helm_lite_240829.csv
llama3_1_instruct_turbo_405b,Helm Lite MMLU,0.759,[],helm_lite_240829.csv
llama3_1_instruct_turbo_70b,Helm Lite MMLU,0.709,[],helm_lite_240829.csv
llama3_70b,Helm Lite MMLU,0.695,[],helm_lite_240829.csv
qwen2_instruct_72b,Helm Lite MMLU,0.769,[],helm_lite_240829.csv
mistral_large_2_2407,Helm Lite MMLU,0.725,[],helm_lite_240829.csv
gemini_1_5_pro_001,Helm Lite MMLU,0.772,[],helm_lite_240829.csv
gpt_4o_mini_2024_07_18,Helm Lite MMLU,0.668,[],helm_lite_240829.csv
mixtral_8x22b,Helm Lite MMLU,0.701,[],helm_lite_240829.csv
gpt_4_turbo_1106_preview,Helm Lite MMLU,0.699,[],helm_lite_240829.csv
palmyra_x_v3_72b,Helm Lite MMLU,0.702,[],helm_lite_240829.csv
gemma_2_instruct_27b,Helm Lite MMLU,0.664,[],helm_lite_240829.csv
gemini_1_5_flash_001,Helm Lite MMLU,0.703,[],helm_lite_240829.csv
claude_3_opus_20240229,Helm Lite MMLU,0.768,[],helm_lite_240829.csv
palm_2_unicorn,Helm Lite MMLU,0.702,[],helm_lite_240829.csv
qwen1_5_72b,Helm Lite MMLU,0.647,[],helm_lite_240829.csv
palmyra_x_v2_33b,Helm Lite MMLU,0.621,[],helm_lite_240829.csv
gemma_2_instruct_9b,Helm Lite MMLU,0.645,[],helm_lite_240829.csv
yi_34b,Helm Lite MMLU,0.65,[],helm_lite_240829.csv
qwen1_5_chat_110b,Helm Lite MMLU,0.704,[],helm_lite_240829.csv
qwen1_5_32b,Helm Lite MMLU,0.628,[],helm_lite_240829.csv
claude_v1_3,Helm Lite MMLU,0.631,[],helm_lite_240829.csv
palm_2_bison,Helm Lite MMLU,0.608,[],helm_lite_240829.csv
mixtral_8x7b_32k_seqlen,Helm Lite MMLU,0.649,[],helm_lite_240829.csv
phi_3_14b,Helm Lite MMLU,0.675,[],helm_lite_240829.csv
claude_2_0,Helm Lite MMLU,0.639,[],helm_lite_240829.csv
deepseek_llm_chat_67b,Helm Lite MMLU,0.641,[],helm_lite_240829.csv
phi_3_7b,Helm Lite MMLU,0.659,[],helm_lite_240829.csv
llama_2_70b,Helm Lite MMLU,0.58,[],helm_lite_240829.csv
yi_large_preview,Helm Lite MMLU,0.712,[],helm_lite_240829.csv
command_r_plus,Helm Lite MMLU,0.59,[],helm_lite_240829.csv
gpt_3_5_text_davinci_003,Helm Lite MMLU,0.555,[],helm_lite_240829.csv
claude_2_1,Helm Lite MMLU,0.643,[],helm_lite_240829.csv
qwen1_5_14b,Helm Lite MMLU,0.626,[],helm_lite_240829.csv
gemini_1_0_pro_002,Helm Lite MMLU,0.534,[],helm_lite_240829.csv
claude_instant_1_2,Helm Lite MMLU,0.631,[],helm_lite_240829.csv
llama3_8b,Helm Lite MMLU,0.602,[],helm_lite_240829.csv
gpt_3_5_turbo_0613,Helm Lite MMLU,0.614,[],helm_lite_240829.csv
claude_3_sonnet_20240229,Helm Lite MMLU,0.652,[],helm_lite_240829.csv
mistral_nemo_2402,Helm Lite MMLU,0.604,[],helm_lite_240829.csv
arctic_instruct,Helm Lite MMLU,0.575,[],helm_lite_240829.csv
gemma_7b,Helm Lite MMLU,0.571,[],helm_lite_240829.csv
gpt_3_5_text_davinci_002,Helm Lite MMLU,0.568,[],helm_lite_240829.csv
llama_65b,Helm Lite MMLU,0.584,[],helm_lite_240829.csv
mistral_large_2402,Helm Lite MMLU,0.638,[],helm_lite_240829.csv
command,Helm Lite MMLU,0.525,[],helm_lite_240829.csv
command_r,Helm Lite MMLU,0.567,[],helm_lite_240829.csv
llama3_1_instruct_turbo_8b,Helm Lite MMLU,0.5,[],helm_lite_240829.csv
mistral_small_2402,Helm Lite MMLU,0.593,[],helm_lite_240829.csv
dbrx_instructruct,Helm Lite MMLU,0.643,[],helm_lite_240829.csv
jamba_instruct,Helm Lite MMLU,0.582,[],helm_lite_240829.csv
mistral_v0_1_7b,Helm Lite MMLU,0.584,[],helm_lite_240829.csv
mistral_medium_2312,Helm Lite MMLU,0.618,[],helm_lite_240829.csv
qwen1_5_7b,Helm Lite MMLU,0.569,[],helm_lite_240829.csv
claude_3_haiku_20240307,Helm Lite MMLU,0.662,[],helm_lite_240829.csv
yi_6b,Helm Lite MMLU,0.53,[],helm_lite_240829.csv
llama_2_13b,Helm Lite MMLU,0.505,[],helm_lite_240829.csv
jurassic_2_jumbo_178b,Helm Lite MMLU,0.483,[],helm_lite_240829.csv
falcon_40b,Helm Lite MMLU,0.507,[],helm_lite_240829.csv
mistral_instruct_v0_3_7b,Helm Lite MMLU,0.51,[],helm_lite_240829.csv
jurassic_2_grande_17b,Helm Lite MMLU,0.471,[],helm_lite_240829.csv
phi_2,Helm Lite MMLU,0.518,[],helm_lite_240829.csv
llama_2_7b,Helm Lite MMLU,0.425,[],helm_lite_240829.csv
luminous_supreme_70b,Helm Lite MMLU,0.316,[],helm_lite_240829.csv
command_light,Helm Lite MMLU,0.386,[],helm_lite_240829.csv
luminous_extended_30b,Helm Lite MMLU,0.248,[],helm_lite_240829.csv
falcon_7b,Helm Lite MMLU,0.288,[],helm_lite_240829.csv
olmo_7b,Helm Lite MMLU,0.305,[],helm_lite_240829.csv
luminous_base_13b,Helm Lite MMLU,0.243,[],helm_lite_240829.csv
gpt_4o_2024_05_13,Helm Lite MathEquivalentCOT,0.829,[],helm_lite_240829.csv
claude_3_5_sonnet_20240620,Helm Lite MathEquivalentCOT,0.813,[],helm_lite_240829.csv
gpt_4_0613,Helm Lite MathEquivalentCOT,0.802,[],helm_lite_240829.csv
gpt_4_turbo_2024_04_09,Helm Lite MathEquivalentCOT,0.833,[],helm_lite_240829.csv
llama3_1_instruct_turbo_405b,Helm Lite MathEquivalentCOT,0.827,[],helm_lite_240829.csv
llama3_1_instruct_turbo_70b,Helm Lite MathEquivalentCOT,0.783,[],helm_lite_240829.csv
llama3_70b,Helm Lite MathEquivalentCOT,0.663,[],helm_lite_240829.csv
qwen2_instruct_72b,Helm Lite MathEquivalentCOT,0.79,[],helm_lite_240829.csv
mistral_large_2_2407,Helm Lite MathEquivalentCOT,0.677,[],helm_lite_240829.csv
gemini_1_5_pro_001,Helm Lite MathEquivalentCOT,0.825,[],helm_lite_240829.csv
gpt_4o_mini_2024_07_18,Helm Lite MathEquivalentCOT,0.802,[],helm_lite_240829.csv
mixtral_8x22b,Helm Lite MathEquivalentCOT,0.656,[],helm_lite_240829.csv
gpt_4_turbo_1106_preview,Helm Lite MathEquivalentCOT,0.857,[],helm_lite_240829.csv
palmyra_x_v3_72b,Helm Lite MathEquivalentCOT,0.723,[],helm_lite_240829.csv
gemma_2_instruct_27b,Helm Lite MathEquivalentCOT,0.746,[],helm_lite_240829.csv
gemini_1_5_flash_001,Helm Lite MathEquivalentCOT,0.753,[],helm_lite_240829.csv
claude_3_opus_20240229,Helm Lite MathEquivalentCOT,0.76,[],helm_lite_240829.csv
palm_2_unicorn,Helm Lite MathEquivalentCOT,0.674,[],helm_lite_240829.csv
qwen1_5_72b,Helm Lite MathEquivalentCOT,0.683,[],helm_lite_240829.csv
palmyra_x_v2_33b,Helm Lite MathEquivalentCOT,0.58,[],helm_lite_240829.csv
gemma_2_instruct_9b,Helm Lite MathEquivalentCOT,0.724,[],helm_lite_240829.csv
yi_34b,Helm Lite MathEquivalentCOT,0.375,[],helm_lite_240829.csv
qwen1_5_chat_110b,Helm Lite MathEquivalentCOT,0.568,[],helm_lite_240829.csv
qwen1_5_32b,Helm Lite MathEquivalentCOT,0.733,[],helm_lite_240829.csv
claude_v1_3,Helm Lite MathEquivalentCOT,0.54,[],helm_lite_240829.csv
palm_2_bison,Helm Lite MathEquivalentCOT,0.421,[],helm_lite_240829.csv
mixtral_8x7b_32k_seqlen,Helm Lite MathEquivalentCOT,0.494,[],helm_lite_240829.csv
phi_3_14b,Helm Lite MathEquivalentCOT,0.611,[],helm_lite_240829.csv
claude_2_0,Helm Lite MathEquivalentCOT,0.603,[],helm_lite_240829.csv
deepseek_llm_chat_67b,Helm Lite MathEquivalentCOT,0.615,[],helm_lite_240829.csv
phi_3_7b,Helm Lite MathEquivalentCOT,0.703,[],helm_lite_240829.csv
llama_2_70b,Helm Lite MathEquivalentCOT,0.323,[],helm_lite_240829.csv
yi_large_preview,Helm Lite MathEquivalentCOT,0.712,[],helm_lite_240829.csv
command_r_plus,Helm Lite MathEquivalentCOT,0.403,[],helm_lite_240829.csv
gpt_3_5_text_davinci_003,Helm Lite MathEquivalentCOT,0.449,[],helm_lite_240829.csv
claude_2_1,Helm Lite MathEquivalentCOT,0.632,[],helm_lite_240829.csv
qwen1_5_14b,Helm Lite MathEquivalentCOT,0.686,[],helm_lite_240829.csv
gemini_1_0_pro_002,Helm Lite MathEquivalentCOT,0.665,[],helm_lite_240829.csv
claude_instant_1_2,Helm Lite MathEquivalentCOT,0.499,[],helm_lite_240829.csv
llama3_8b,Helm Lite MathEquivalentCOT,0.391,[],helm_lite_240829.csv
gpt_3_5_turbo_0613,Helm Lite MathEquivalentCOT,0.667,[],helm_lite_240829.csv
claude_3_sonnet_20240229,Helm Lite MathEquivalentCOT,0.084,[],helm_lite_240829.csv
mistral_nemo_2402,Helm Lite MathEquivalentCOT,0.668,[],helm_lite_240829.csv
arctic_instruct,Helm Lite MathEquivalentCOT,0.519,[],helm_lite_240829.csv
gemma_7b,Helm Lite MathEquivalentCOT,0.5,[],helm_lite_240829.csv
gpt_3_5_text_davinci_002,Helm Lite MathEquivalentCOT,0.428,[],helm_lite_240829.csv
llama_65b,Helm Lite MathEquivalentCOT,0.257,[],helm_lite_240829.csv
mistral_large_2402,Helm Lite MathEquivalentCOT,0.75,[],helm_lite_240829.csv
command,Helm Lite MathEquivalentCOT,0.236,[],helm_lite_240829.csv
command_r,Helm Lite MathEquivalentCOT,0.266,[],helm_lite_240829.csv
llama3_1_instruct_turbo_8b,Helm Lite MathEquivalentCOT,0.703,[],helm_lite_240829.csv
mistral_small_2402,Helm Lite MathEquivalentCOT,0.621,[],helm_lite_240829.csv
dbrx_instructruct,Helm Lite MathEquivalentCOT,0.358,[],helm_lite_240829.csv
jamba_instruct,Helm Lite MathEquivalentCOT,0.38,[],helm_lite_240829.csv
mistral_v0_1_7b,Helm Lite MathEquivalentCOT,0.297,[],helm_lite_240829.csv
mistral_medium_2312,Helm Lite MathEquivalentCOT,0.565,[],helm_lite_240829.csv
qwen1_5_7b,Helm Lite MathEquivalentCOT,0.561,[],helm_lite_240829.csv
claude_3_haiku_20240307,Helm Lite MathEquivalentCOT,0.131,[],helm_lite_240829.csv
yi_6b,Helm Lite MathEquivalentCOT,0.126,[],helm_lite_240829.csv
llama_2_13b,Helm Lite MathEquivalentCOT,0.102,[],helm_lite_240829.csv
jurassic_2_jumbo_178b,Helm Lite MathEquivalentCOT,0.103,[],helm_lite_240829.csv
falcon_40b,Helm Lite MathEquivalentCOT,0.128,[],helm_lite_240829.csv
mistral_instruct_v0_3_7b,Helm Lite MathEquivalentCOT,0.289,[],helm_lite_240829.csv
jurassic_2_grande_17b,Helm Lite MathEquivalentCOT,0.064,[],helm_lite_240829.csv
phi_2,Helm Lite MathEquivalentCOT,0.255,[],helm_lite_240829.csv
llama_2_7b,Helm Lite MathEquivalentCOT,0.097,[],helm_lite_240829.csv
luminous_supreme_70b,Helm Lite MathEquivalentCOT,0.078,[],helm_lite_240829.csv
command_light,Helm Lite MathEquivalentCOT,0.098,[],helm_lite_240829.csv
luminous_extended_30b,Helm Lite MathEquivalentCOT,0.04,[],helm_lite_240829.csv
falcon_7b,Helm Lite MathEquivalentCOT,0.044,[],helm_lite_240829.csv
olmo_7b,Helm Lite MathEquivalentCOT,0.029,[],helm_lite_240829.csv
luminous_base_13b,Helm Lite MathEquivalentCOT,0.026,[],helm_lite_240829.csv
gpt_4o_2024_05_13,Helm Lite GSM8K,0.905,[],helm_lite_240829.csv
claude_3_5_sonnet_20240620,Helm Lite GSM8K,0.949,[],helm_lite_240829.csv
gpt_4_0613,Helm Lite GSM8K,0.932,[],helm_lite_240829.csv
gpt_4_turbo_2024_04_09,Helm Lite GSM8K,0.824,[],helm_lite_240829.csv
llama3_1_instruct_turbo_405b,Helm Lite GSM8K,0.949,[],helm_lite_240829.csv
llama3_1_instruct_turbo_70b,Helm Lite GSM8K,0.938,[],helm_lite_240829.csv
llama3_70b,Helm Lite GSM8K,0.805,[],helm_lite_240829.csv
qwen2_instruct_72b,Helm Lite GSM8K,0.92,[],helm_lite_240829.csv
mistral_large_2_2407,Helm Lite GSM8K,0.912,[],helm_lite_240829.csv
gemini_1_5_pro_001,Helm Lite GSM8K,0.836,[],helm_lite_240829.csv
gpt_4o_mini_2024_07_18,Helm Lite GSM8K,0.843,[],helm_lite_240829.csv
mixtral_8x22b,Helm Lite GSM8K,0.8,[],helm_lite_240829.csv
gpt_4_turbo_1106_preview,Helm Lite GSM8K,0.668,[],helm_lite_240829.csv
palmyra_x_v3_72b,Helm Lite GSM8K,0.831,[],helm_lite_240829.csv
gemma_2_instruct_27b,Helm Lite GSM8K,0.812,[],helm_lite_240829.csv
gemini_1_5_flash_001,Helm Lite GSM8K,0.785,[],helm_lite_240829.csv
claude_3_opus_20240229,Helm Lite GSM8K,0.924,[],helm_lite_240829.csv
palm_2_unicorn,Helm Lite GSM8K,0.831,[],helm_lite_240829.csv
qwen1_5_72b,Helm Lite GSM8K,0.799,[],helm_lite_240829.csv
palmyra_x_v2_33b,Helm Lite GSM8K,0.735,[],helm_lite_240829.csv
gemma_2_instruct_9b,Helm Lite GSM8K,0.762,[],helm_lite_240829.csv
yi_34b,Helm Lite GSM8K,0.648,[],helm_lite_240829.csv
qwen1_5_chat_110b,Helm Lite GSM8K,0.815,[],helm_lite_240829.csv
qwen1_5_32b,Helm Lite GSM8K,0.773,[],helm_lite_240829.csv
claude_v1_3,Helm Lite GSM8K,0.784,[],helm_lite_240829.csv
palm_2_bison,Helm Lite GSM8K,0.61,[],helm_lite_240829.csv
mixtral_8x7b_32k_seqlen,Helm Lite GSM8K,0.622,[],helm_lite_240829.csv
phi_3_14b,Helm Lite GSM8K,0.878,[],helm_lite_240829.csv
claude_2_0,Helm Lite GSM8K,0.583,[],helm_lite_240829.csv
deepseek_llm_chat_67b,Helm Lite GSM8K,0.795,[],helm_lite_240829.csv
llama_2_70b,Helm Lite GSM8K,0.567,[],helm_lite_240829.csv
yi_large_preview,Helm Lite GSM8K,0.69,[],helm_lite_240829.csv
command_r_plus,Helm Lite GSM8K,0.738,[],helm_lite_240829.csv
gpt_3_5_text_davinci_003,Helm Lite GSM8K,0.615,[],helm_lite_240829.csv
claude_2_1,Helm Lite GSM8K,0.604,[],helm_lite_240829.csv
qwen1_5_14b,Helm Lite GSM8K,0.693,[],helm_lite_240829.csv
gemini_1_0_pro_002,Helm Lite GSM8K,0.816,[],helm_lite_240829.csv
claude_instant_1_2,Helm Lite GSM8K,0.721,[],helm_lite_240829.csv
llama3_8b,Helm Lite GSM8K,0.499,[],helm_lite_240829.csv
gpt_3_5_turbo_0613,Helm Lite GSM8K,0.501,[],helm_lite_240829.csv
claude_3_sonnet_20240229,Helm Lite GSM8K,0.907,[],helm_lite_240829.csv
mistral_nemo_2402,Helm Lite GSM8K,0.782,[],helm_lite_240829.csv
arctic_instruct,Helm Lite GSM8K,0.768,[],helm_lite_240829.csv
gemma_7b,Helm Lite GSM8K,0.559,[],helm_lite_240829.csv
gpt_3_5_text_davinci_002,Helm Lite GSM8K,0.479,[],helm_lite_240829.csv
llama_65b,Helm Lite GSM8K,0.489,[],helm_lite_240829.csv
mistral_large_2402,Helm Lite GSM8K,0.694,[],helm_lite_240829.csv
command,Helm Lite GSM8K,0.452,[],helm_lite_240829.csv
command_r,Helm Lite GSM8K,0.551,[],helm_lite_240829.csv
llama3_1_instruct_turbo_8b,Helm Lite GSM8K,0.798,[],helm_lite_240829.csv
mistral_small_2402,Helm Lite GSM8K,0.734,[],helm_lite_240829.csv
dbrx_instructruct,Helm Lite GSM8K,0.671,[],helm_lite_240829.csv
jamba_instruct,Helm Lite GSM8K,0.67,[],helm_lite_240829.csv
mistral_v0_1_7b,Helm Lite GSM8K,0.377,[],helm_lite_240829.csv
mistral_medium_2312,Helm Lite GSM8K,0.706,[],helm_lite_240829.csv
qwen1_5_7b,Helm Lite GSM8K,0.6,[],helm_lite_240829.csv
claude_3_haiku_20240307,Helm Lite GSM8K,0.699,[],helm_lite_240829.csv
yi_6b,Helm Lite GSM8K,0.375,[],helm_lite_240829.csv
llama_2_13b,Helm Lite GSM8K,0.266,[],helm_lite_240829.csv
jurassic_2_jumbo_178b,Helm Lite GSM8K,0.239,[],helm_lite_240829.csv
falcon_40b,Helm Lite GSM8K,0.267,[],helm_lite_240829.csv
mistral_instruct_v0_3_7b,Helm Lite GSM8K,0.538,[],helm_lite_240829.csv
jurassic_2_grande_17b,Helm Lite GSM8K,0.159,[],helm_lite_240829.csv
phi_2,Helm Lite GSM8K,0.581,[],helm_lite_240829.csv
llama_2_7b,Helm Lite GSM8K,0.154,[],helm_lite_240829.csv
luminous_supreme_70b,Helm Lite GSM8K,0.137,[],helm_lite_240829.csv
command_light,Helm Lite GSM8K,0.149,[],helm_lite_240829.csv
luminous_extended_30b,Helm Lite GSM8K,0.075,[],helm_lite_240829.csv
falcon_7b,Helm Lite GSM8K,0.055,[],helm_lite_240829.csv
olmo_7b,Helm Lite GSM8K,0.044,[],helm_lite_240829.csv
luminous_base_13b,Helm Lite GSM8K,0.028,[],helm_lite_240829.csv
gpt_4o_2024_05_13,Helm Lite LegalBench,0.733,[],helm_lite_240829.csv
claude_3_5_sonnet_20240620,Helm Lite LegalBench,0.707,[],helm_lite_240829.csv
gpt_4_0613,Helm Lite LegalBench,0.713,[],helm_lite_240829.csv
gpt_4_turbo_2024_04_09,Helm Lite LegalBench,0.727,[],helm_lite_240829.csv
llama3_1_instruct_turbo_405b,Helm Lite LegalBench,0.707,[],helm_lite_240829.csv
llama3_1_instruct_turbo_70b,Helm Lite LegalBench,0.687,[],helm_lite_240829.csv
llama3_70b,Helm Lite LegalBench,0.733,[],helm_lite_240829.csv
qwen2_instruct_72b,Helm Lite LegalBench,0.712,[],helm_lite_240829.csv
mistral_large_2_2407,Helm Lite LegalBench,0.646,[],helm_lite_240829.csv
gemini_1_5_pro_001,Helm Lite LegalBench,0.757,[],helm_lite_240829.csv
gpt_4o_mini_2024_07_18,Helm Lite LegalBench,0.653,[],helm_lite_240829.csv
mixtral_8x22b,Helm Lite LegalBench,0.708,[],helm_lite_240829.csv
gpt_4_turbo_1106_preview,Helm Lite LegalBench,0.626,[],helm_lite_240829.csv
palmyra_x_v3_72b,Helm Lite LegalBench,0.709,[],helm_lite_240829.csv
gemma_2_instruct_27b,Helm Lite LegalBench,0.7,[],helm_lite_240829.csv
gemini_1_5_flash_001,Helm Lite LegalBench,0.661,[],helm_lite_240829.csv
claude_3_opus_20240229,Helm Lite LegalBench,0.662,[],helm_lite_240829.csv
palm_2_unicorn,Helm Lite LegalBench,0.677,[],helm_lite_240829.csv
qwen1_5_72b,Helm Lite LegalBench,0.694,[],helm_lite_240829.csv
palmyra_x_v2_33b,Helm Lite LegalBench,0.644,[],helm_lite_240829.csv
gemma_2_instruct_9b,Helm Lite LegalBench,0.639,[],helm_lite_240829.csv
yi_34b,Helm Lite LegalBench,0.618,[],helm_lite_240829.csv
qwen1_5_chat_110b,Helm Lite LegalBench,0.624,[],helm_lite_240829.csv
qwen1_5_32b,Helm Lite LegalBench,0.636,[],helm_lite_240829.csv
claude_v1_3,Helm Lite LegalBench,0.629,[],helm_lite_240829.csv
palm_2_bison,Helm Lite LegalBench,0.645,[],helm_lite_240829.csv
mixtral_8x7b_32k_seqlen,Helm Lite LegalBench,0.63,[],helm_lite_240829.csv
phi_3_14b,Helm Lite LegalBench,0.593,[],helm_lite_240829.csv
claude_2_0,Helm Lite LegalBench,0.643,[],helm_lite_240829.csv
deepseek_llm_chat_67b,Helm Lite LegalBench,0.637,[],helm_lite_240829.csv
phi_3_7b,Helm Lite LegalBench,0.584,[],helm_lite_240829.csv
llama_2_70b,Helm Lite LegalBench,0.673,[],helm_lite_240829.csv
yi_large_preview,Helm Lite LegalBench,0.519,[],helm_lite_240829.csv
command_r_plus,Helm Lite LegalBench,0.672,[],helm_lite_240829.csv
gpt_3_5_text_davinci_003,Helm Lite LegalBench,0.622,[],helm_lite_240829.csv
claude_2_1,Helm Lite LegalBench,0.643,[],helm_lite_240829.csv
qwen1_5_14b,Helm Lite LegalBench,0.593,[],helm_lite_240829.csv
gemini_1_0_pro_002,Helm Lite LegalBench,0.475,[],helm_lite_240829.csv
claude_instant_1_2,Helm Lite LegalBench,0.586,[],helm_lite_240829.csv
llama3_8b,Helm Lite LegalBench,0.637,[],helm_lite_240829.csv
gpt_3_5_turbo_0613,Helm Lite LegalBench,0.528,[],helm_lite_240829.csv
claude_3_sonnet_20240229,Helm Lite LegalBench,0.49,[],helm_lite_240829.csv
mistral_nemo_2402,Helm Lite LegalBench,0.415,[],helm_lite_240829.csv
arctic_instruct,Helm Lite LegalBench,0.588,[],helm_lite_240829.csv
gemma_7b,Helm Lite LegalBench,0.581,[],helm_lite_240829.csv
gpt_3_5_text_davinci_002,Helm Lite LegalBench,0.58,[],helm_lite_240829.csv
llama_65b,Helm Lite LegalBench,0.48,[],helm_lite_240829.csv
mistral_large_2402,Helm Lite LegalBench,0.479,[],helm_lite_240829.csv
command,Helm Lite LegalBench,0.578,[],helm_lite_240829.csv
command_r,Helm Lite LegalBench,0.507,[],helm_lite_240829.csv
llama3_1_instruct_turbo_8b,Helm Lite LegalBench,0.342,[],helm_lite_240829.csv
mistral_small_2402,Helm Lite LegalBench,0.389,[],helm_lite_240829.csv
dbrx_instructruct,Helm Lite LegalBench,0.426,[],helm_lite_240829.csv
jamba_instruct,Helm Lite LegalBench,0.54,[],helm_lite_240829.csv
mistral_v0_1_7b,Helm Lite LegalBench,0.58,[],helm_lite_240829.csv
mistral_medium_2312,Helm Lite LegalBench,0.452,[],helm_lite_240829.csv
qwen1_5_7b,Helm Lite LegalBench,0.523,[],helm_lite_240829.csv
claude_3_haiku_20240307,Helm Lite LegalBench,0.46,[],helm_lite_240829.csv
yi_6b,Helm Lite LegalBench,0.519,[],helm_lite_240829.csv
llama_2_13b,Helm Lite LegalBench,0.591,[],helm_lite_240829.csv
jurassic_2_jumbo_178b,Helm Lite LegalBench,0.533,[],helm_lite_240829.csv
falcon_40b,Helm Lite LegalBench,0.442,[],helm_lite_240829.csv
mistral_instruct_v0_3_7b,Helm Lite LegalBench,0.331,[],helm_lite_240829.csv
jurassic_2_grande_17b,Helm Lite LegalBench,0.468,[],helm_lite_240829.csv
phi_2,Helm Lite LegalBench,0.334,[],helm_lite_240829.csv
llama_2_7b,Helm Lite LegalBench,0.502,[],helm_lite_240829.csv
luminous_supreme_70b,Helm Lite LegalBench,0.452,[],helm_lite_240829.csv
command_light,Helm Lite LegalBench,0.397,[],helm_lite_240829.csv
luminous_extended_30b,Helm Lite LegalBench,0.421,[],helm_lite_240829.csv
falcon_7b,Helm Lite LegalBench,0.346,[],helm_lite_240829.csv
olmo_7b,Helm Lite LegalBench,0.341,[],helm_lite_240829.csv
luminous_base_13b,Helm Lite LegalBench,0.332,[],helm_lite_240829.csv
gpt_4o_2024_05_13,Helm Lite MedQA,0.857,[],helm_lite_240829.csv
claude_3_5_sonnet_20240620,Helm Lite MedQA,0.825,[],helm_lite_240829.csv
gpt_4_0613,Helm Lite MedQA,0.815,[],helm_lite_240829.csv
gpt_4_turbo_2024_04_09,Helm Lite MedQA,0.783,[],helm_lite_240829.csv
llama3_1_instruct_turbo_405b,Helm Lite MedQA,0.805,[],helm_lite_240829.csv
llama3_1_instruct_turbo_70b,Helm Lite MedQA,0.769,[],helm_lite_240829.csv
llama3_70b,Helm Lite MedQA,0.777,[],helm_lite_240829.csv
qwen2_instruct_72b,Helm Lite MedQA,0.746,[],helm_lite_240829.csv
mistral_large_2_2407,Helm Lite MedQA,0.775,[],helm_lite_240829.csv
gemini_1_5_pro_001,Helm Lite MedQA,0.692,[],helm_lite_240829.csv
gpt_4o_mini_2024_07_18,Helm Lite MedQA,0.748,[],helm_lite_240829.csv
mixtral_8x22b,Helm Lite MedQA,0.704,[],helm_lite_240829.csv
gpt_4_turbo_1106_preview,Helm Lite MedQA,0.817,[],helm_lite_240829.csv
palmyra_x_v3_72b,Helm Lite MedQA,0.684,[],helm_lite_240829.csv
gemma_2_instruct_27b,Helm Lite MedQA,0.684,[],helm_lite_240829.csv
gemini_1_5_flash_001,Helm Lite MedQA,0.68,[],helm_lite_240829.csv
claude_3_opus_20240229,Helm Lite MedQA,0.775,[],helm_lite_240829.csv
palm_2_unicorn,Helm Lite MedQA,0.684,[],helm_lite_240829.csv
qwen1_5_72b,Helm Lite MedQA,0.67,[],helm_lite_240829.csv
palmyra_x_v2_33b,Helm Lite MedQA,0.598,[],helm_lite_240829.csv
gemma_2_instruct_9b,Helm Lite MedQA,0.63,[],helm_lite_240829.csv
yi_34b,Helm Lite MedQA,0.656,[],helm_lite_240829.csv
qwen1_5_chat_110b,Helm Lite MedQA,0.64,[],helm_lite_240829.csv
qwen1_5_32b,Helm Lite MedQA,0.656,[],helm_lite_240829.csv
claude_v1_3,Helm Lite MedQA,0.618,[],helm_lite_240829.csv
palm_2_bison,Helm Lite MedQA,0.547,[],helm_lite_240829.csv
mixtral_8x7b_32k_seqlen,Helm Lite MedQA,0.652,[],helm_lite_240829.csv
phi_3_14b,Helm Lite MedQA,0.696,[],helm_lite_240829.csv
claude_2_0,Helm Lite MedQA,0.652,[],helm_lite_240829.csv
deepseek_llm_chat_67b,Helm Lite MedQA,0.628,[],helm_lite_240829.csv
phi_3_7b,Helm Lite MedQA,0.672,[],helm_lite_240829.csv
llama_2_70b,Helm Lite MedQA,0.618,[],helm_lite_240829.csv
yi_large_preview,Helm Lite MedQA,0.66,[],helm_lite_240829.csv
command_r_plus,Helm Lite MedQA,0.567,[],helm_lite_240829.csv
gpt_3_5_text_davinci_003,Helm Lite MedQA,0.531,[],helm_lite_240829.csv
claude_2_1,Helm Lite MedQA,0.644,[],helm_lite_240829.csv
qwen1_5_14b,Helm Lite MedQA,0.515,[],helm_lite_240829.csv
gemini_1_0_pro_002,Helm Lite MedQA,0.483,[],helm_lite_240829.csv
claude_instant_1_2,Helm Lite MedQA,0.559,[],helm_lite_240829.csv
llama3_8b,Helm Lite MedQA,0.581,[],helm_lite_240829.csv
gpt_3_5_turbo_0613,Helm Lite MedQA,0.622,[],helm_lite_240829.csv
claude_3_sonnet_20240229,Helm Lite MedQA,0.684,[],helm_lite_240829.csv
mistral_nemo_2402,Helm Lite MedQA,0.59,[],helm_lite_240829.csv
arctic_instruct,Helm Lite MedQA,0.581,[],helm_lite_240829.csv
gemma_7b,Helm Lite MedQA,0.513,[],helm_lite_240829.csv
gpt_3_5_text_davinci_002,Helm Lite MedQA,0.525,[],helm_lite_240829.csv
llama_65b,Helm Lite MedQA,0.507,[],helm_lite_240829.csv
mistral_large_2402,Helm Lite MedQA,0.499,[],helm_lite_240829.csv
command,Helm Lite MedQA,0.445,[],helm_lite_240829.csv
command_r,Helm Lite MedQA,0.555,[],helm_lite_240829.csv
llama3_1_instruct_turbo_8b,Helm Lite MedQA,0.245,[],helm_lite_240829.csv
mistral_small_2402,Helm Lite MedQA,0.616,[],helm_lite_240829.csv
dbrx_instructruct,Helm Lite MedQA,0.694,[],helm_lite_240829.csv
jamba_instruct,Helm Lite MedQA,0.519,[],helm_lite_240829.csv
mistral_v0_1_7b,Helm Lite MedQA,0.525,[],helm_lite_240829.csv
mistral_medium_2312,Helm Lite MedQA,0.61,[],helm_lite_240829.csv
qwen1_5_7b,Helm Lite MedQA,0.479,[],helm_lite_240829.csv
claude_3_haiku_20240307,Helm Lite MedQA,0.702,[],helm_lite_240829.csv
yi_6b,Helm Lite MedQA,0.497,[],helm_lite_240829.csv
llama_2_13b,Helm Lite MedQA,0.392,[],helm_lite_240829.csv
jurassic_2_jumbo_178b,Helm Lite MedQA,0.431,[],helm_lite_240829.csv
falcon_40b,Helm Lite MedQA,0.419,[],helm_lite_240829.csv
mistral_instruct_v0_3_7b,Helm Lite MedQA,0.517,[],helm_lite_240829.csv
jurassic_2_grande_17b,Helm Lite MedQA,0.39,[],helm_lite_240829.csv
phi_2,Helm Lite MedQA,0.41,[],helm_lite_240829.csv
llama_2_7b,Helm Lite MedQA,0.392,[],helm_lite_240829.csv
luminous_supreme_70b,Helm Lite MedQA,0.276,[],helm_lite_240829.csv
command_light,Helm Lite MedQA,0.312,[],helm_lite_240829.csv
luminous_extended_30b,Helm Lite MedQA,0.276,[],helm_lite_240829.csv
falcon_7b,Helm Lite MedQA,0.254,[],helm_lite_240829.csv
olmo_7b,Helm Lite MedQA,0.229,[],helm_lite_240829.csv
luminous_base_13b,Helm Lite MedQA,0.26,[],helm_lite_240829.csv
gpt_4o_2024_05_13,Helm Lite WMT2014,0.231,[],helm_lite_240829.csv
claude_3_5_sonnet_20240620,Helm Lite WMT2014,0.229,[],helm_lite_240829.csv
gpt_4_0613,Helm Lite WMT2014,0.211,[],helm_lite_240829.csv
gpt_4_turbo_2024_04_09,Helm Lite WMT2014,0.218,[],helm_lite_240829.csv
llama3_1_instruct_turbo_405b,Helm Lite WMT2014,0.238,[],helm_lite_240829.csv
llama3_1_instruct_turbo_70b,Helm Lite WMT2014,0.223,[],helm_lite_240829.csv
llama3_70b,Helm Lite WMT2014,0.225,[],helm_lite_240829.csv
qwen2_instruct_72b,Helm Lite WMT2014,0.207,[],helm_lite_240829.csv
mistral_large_2_2407,Helm Lite WMT2014,0.192,[],helm_lite_240829.csv
gemini_1_5_pro_001,Helm Lite WMT2014,0.189,[],helm_lite_240829.csv
gpt_4o_mini_2024_07_18,Helm Lite WMT2014,0.206,[],helm_lite_240829.csv
mixtral_8x22b,Helm Lite WMT2014,0.209,[],helm_lite_240829.csv
gpt_4_turbo_1106_preview,Helm Lite WMT2014,0.205,[],helm_lite_240829.csv
palmyra_x_v3_72b,Helm Lite WMT2014,0.262,[],helm_lite_240829.csv
gemma_2_instruct_27b,Helm Lite WMT2014,0.214,[],helm_lite_240829.csv
gemini_1_5_flash_001,Helm Lite WMT2014,0.225,[],helm_lite_240829.csv
claude_3_opus_20240229,Helm Lite WMT2014,0.24,[],helm_lite_240829.csv
palm_2_unicorn,Helm Lite WMT2014,0.26,[],helm_lite_240829.csv
qwen1_5_72b,Helm Lite WMT2014,0.201,[],helm_lite_240829.csv
palmyra_x_v2_33b,Helm Lite WMT2014,0.239,[],helm_lite_240829.csv
gemma_2_instruct_9b,Helm Lite WMT2014,0.201,[],helm_lite_240829.csv
yi_34b,Helm Lite WMT2014,0.172,[],helm_lite_240829.csv
qwen1_5_chat_110b,Helm Lite WMT2014,0.192,[],helm_lite_240829.csv
qwen1_5_32b,Helm Lite WMT2014,0.193,[],helm_lite_240829.csv
claude_v1_3,Helm Lite WMT2014,0.219,[],helm_lite_240829.csv
palm_2_bison,Helm Lite WMT2014,0.241,[],helm_lite_240829.csv
mixtral_8x7b_32k_seqlen,Helm Lite WMT2014,0.19,[],helm_lite_240829.csv
phi_3_14b,Helm Lite WMT2014,0.17,[],helm_lite_240829.csv
claude_2_0,Helm Lite WMT2014,0.219,[],helm_lite_240829.csv
deepseek_llm_chat_67b,Helm Lite WMT2014,0.186,[],helm_lite_240829.csv
phi_3_7b,Helm Lite WMT2014,0.154,[],helm_lite_240829.csv
llama_2_70b,Helm Lite WMT2014,0.196,[],helm_lite_240829.csv
yi_large_preview,Helm Lite WMT2014,0.176,[],helm_lite_240829.csv
command_r_plus,Helm Lite WMT2014,0.203,[],helm_lite_240829.csv
gpt_3_5_text_davinci_003,Helm Lite WMT2014,0.191,[],helm_lite_240829.csv
claude_2_1,Helm Lite WMT2014,0.204,[],helm_lite_240829.csv
qwen1_5_14b,Helm Lite WMT2014,0.178,[],helm_lite_240829.csv
gemini_1_0_pro_002,Helm Lite WMT2014,0.194,[],helm_lite_240829.csv
claude_instant_1_2,Helm Lite WMT2014,0.194,[],helm_lite_240829.csv
llama3_8b,Helm Lite WMT2014,0.183,[],helm_lite_240829.csv
gpt_3_5_turbo_0613,Helm Lite WMT2014,0.187,[],helm_lite_240829.csv
claude_3_sonnet_20240229,Helm Lite WMT2014,0.218,[],helm_lite_240829.csv
mistral_nemo_2402,Helm Lite WMT2014,0.177,[],helm_lite_240829.csv
arctic_instruct,Helm Lite WMT2014,0.172,[],helm_lite_240829.csv
gemma_7b,Helm Lite WMT2014,0.187,[],helm_lite_240829.csv
gpt_3_5_text_davinci_002,Helm Lite WMT2014,0.174,[],helm_lite_240829.csv
llama_65b,Helm Lite WMT2014,0.189,[],helm_lite_240829.csv
mistral_large_2402,Helm Lite WMT2014,0.182,[],helm_lite_240829.csv
command,Helm Lite WMT2014,0.088,[],helm_lite_240829.csv
command_r,Helm Lite WMT2014,0.149,[],helm_lite_240829.csv
llama3_1_instruct_turbo_8b,Helm Lite WMT2014,0.181,[],helm_lite_240829.csv
mistral_small_2402,Helm Lite WMT2014,0.169,[],helm_lite_240829.csv
dbrx_instructruct,Helm Lite WMT2014,0.131,[],helm_lite_240829.csv
jamba_instruct,Helm Lite WMT2014,0.164,[],helm_lite_240829.csv
mistral_v0_1_7b,Helm Lite WMT2014,0.16,[],helm_lite_240829.csv
mistral_medium_2312,Helm Lite WMT2014,0.169,[],helm_lite_240829.csv
qwen1_5_7b,Helm Lite WMT2014,0.153,[],helm_lite_240829.csv
claude_3_haiku_20240307,Helm Lite WMT2014,0.148,[],helm_lite_240829.csv
yi_6b,Helm Lite WMT2014,0.117,[],helm_lite_240829.csv
llama_2_13b,Helm Lite WMT2014,0.167,[],helm_lite_240829.csv
jurassic_2_jumbo_178b,Helm Lite WMT2014,0.114,[],helm_lite_240829.csv
falcon_40b,Helm Lite WMT2014,0.162,[],helm_lite_240829.csv
mistral_instruct_v0_3_7b,Helm Lite WMT2014,0.142,[],helm_lite_240829.csv
jurassic_2_grande_17b,Helm Lite WMT2014,0.102,[],helm_lite_240829.csv
phi_2,Helm Lite WMT2014,0.038,[],helm_lite_240829.csv
WMT2014,0.038,[],helm_lite_240829.csv llama_2_7b,Helm Lite WMT2014,0.144,[],helm_lite_240829.csv luminous_supreme_70b,Helm Lite WMT2014,0.102,[],helm_lite_240829.csv command_light,Helm Lite WMT2014,0.023,[],helm_lite_240829.csv luminous_extended_30b,Helm Lite WMT2014,0.083,[],helm_lite_240829.csv falcon_7b,Helm Lite WMT2014,0.094,[],helm_lite_240829.csv olmo_7b,Helm Lite WMT2014,0.097,[],helm_lite_240829.csv luminous_base_13b,Helm Lite WMT2014,0.066,[],helm_lite_240829.csv chatglm_6b,LMSys Arena,855.910565721209,[],chatbot_arena_241104.csv koala_13b,LMSys Arena,901.4444159097708,[],chatbot_arena_241104.csv oasst_pythia_12b,LMSys Arena,812.3918514404036,[],chatbot_arena_241104.csv alpaca_13b,LMSys Arena,851.3113435573603,[],chatbot_arena_241104.csv vicuna_13b,LMSys Arena,874.2126379649785,[],chatbot_arena_241104.csv dolly_v2_12b,LMSys Arena,781.4370567093974,[],chatbot_arena_241104.csv stablelm_tuned_alpha_7b,LMSys Arena,829.7609284591157,[],chatbot_arena_241104.csv llama_13b,LMSys Arena,800.0,[],chatbot_arena_241104.csv fastchat_t5_3b,LMSys Arena,794.3748535699036,[],chatbot_arena_241104.csv gpt_3_5_turbo_0314,LMSys Arena,1051.024508411953,[],chatbot_arena_241104.csv gpt_4_0314,LMSys Arena,980.6906633214736,[],chatbot_arena_241104.csv rwkv_4_raven_14b,LMSys Arena,874.536173297737,[],chatbot_arena_241104.csv claude_1,LMSys Arena,1039.7803750141782,[],chatbot_arena_241104.csv mpt_7b_chat,LMSys Arena,869.0762171208861,[],chatbot_arena_241104.csv palm_2,LMSys Arena,922.5218005276812,[],chatbot_arena_241104.csv claude_instant_1,LMSys Arena,991.8056867962612,[],chatbot_arena_241104.csv vicuna_7b,LMSys Arena,910.6856107758756,[],chatbot_arena_241104.csv wizardlm_13b,LMSys Arena,971.8432912657483,[],chatbot_arena_241104.csv gpt4all_13b_snoozy,LMSys Arena,885.7452637089059,[],chatbot_arena_241104.csv guanaco_33b,LMSys Arena,974.3076720194276,[],chatbot_arena_241104.csv vicuna_33b,LMSys Arena,906.4317166108784,[],chatbot_arena_241104.csv mpt_30b_chat,LMSys Arena,971.1057122702124,[],chatbot_arena_241104.csv gpt_3_5_turbo_0613,LMSys Arena,999.7201069046866,[],chatbot_arena_241104.csv gpt_4_0613,LMSys Arena,960.3770824361336,[],chatbot_arena_241104.csv llama_2_7b_chat,LMSys Arena,895.4706517283653,[],chatbot_arena_241104.csv claude_2_0,LMSys Arena,1016.5801503367938,[],chatbot_arena_241104.csv llama_2_13b_chat,LMSys Arena,963.7146661400922,[],chatbot_arena_241104.csv chatglm2_6b,LMSys Arena,835.3074735731766,[],chatbot_arena_241104.csv llama_2_70b_chat,LMSys Arena,1007.6844327159828,[],chatbot_arena_241104.csv codellama34b_instruct,LMSys Arena,934.0457254208728,[],chatbot_arena_241104.csv wizardlm_70b,LMSys Arena,979.5605650746356,[],chatbot_arena_241104.csv falcon_180b_chat,LMSys Arena,923.054729229491,[],chatbot_arena_241104.csv mistral_7b_instruct,LMSys Arena,895.9405753947756,[],chatbot_arena_241104.csv qwen_14b_chat,LMSys Arena,921.4887868532272,[],chatbot_arena_241104.csv zephyr_7b_alpha,LMSys Arena,946.9339607858802,[],chatbot_arena_241104.csv zephyr_7b_beta,LMSys Arena,913.246312461937,[],chatbot_arena_241104.csv openchat_3_5,LMSys Arena,948.9893819327424,[],chatbot_arena_241104.csv gpt_4_1106_preview,LMSys Arena,1001.256303019811,[],chatbot_arena_241104.csv gpt_3_5_turbo_1106,LMSys Arena,937.6322384103784,[],chatbot_arena_241104.csv chatglm3_6b,LMSys Arena,814.5480014217649,[],chatbot_arena_241104.csv claude_2_1,LMSys Arena,979.863770513184,[],chatbot_arena_241104.csv tulu_2_dpo_70b,LMSys Arena,961.7298633389956,[],chatbot_arena_241104.csv yi_34b_chat,LMSys 
Arena,932.0283635154188,[],chatbot_arena_241104.csv starling_lm_7b_alpha,LMSys Arena,945.1430459412009,[],chatbot_arena_241104.csv openhermes_2_5_mistral_7b,LMSys Arena,935.5573447997912,[],chatbot_arena_241104.csv pplx_70b_online,LMSys Arena,931.0576338876376,[],chatbot_arena_241104.csv pplx_7b_online,LMSys Arena,948.7421850358356,[],chatbot_arena_241104.csv dolphin_2_2_1_mistral_7b,LMSys Arena,977.0069489193058,[],chatbot_arena_241104.csv mixtral_8x7b_instruct_v0_1,LMSys Arena,867.9036424292025,[],chatbot_arena_241104.csv gemini_pro,LMSys Arena,1006.251403062337,[],chatbot_arena_241104.csv solar_10_7b_instruct_v1_0,LMSys Arena,958.6549095565916,[],chatbot_arena_241104.csv mistral_medium,LMSys Arena,965.0537859905728,[],chatbot_arena_241104.csv llama2_70b_steerlm_chat,LMSys Arena,965.6376159085758,[],chatbot_arena_241104.csv gemini_pro_dev_api,LMSys Arena,1019.3566145491036,[],chatbot_arena_241104.csv stripedhyena_nous_7b,LMSys Arena,919.5708420570646,[],chatbot_arena_241104.csv bard_jan_24_gemini_pro,LMSys Arena,1041.261256012453,[],chatbot_arena_241104.csv deepseek_llm_67b_chat,LMSys Arena,958.7276958964316,[],chatbot_arena_241104.csv gpt_4_0125_preview,LMSys Arena,997.1712467949897,[],chatbot_arena_241104.csv gpt_3_5_turbo_0125,LMSys Arena,898.9675086846296,[],chatbot_arena_241104.csv nous_hermes_2_mixtral_8x7b_dpo,LMSys Arena,972.2639217501226,[],chatbot_arena_241104.csv mistral_7b_instruct_v0_2,LMSys Arena,892.8914241485261,[],chatbot_arena_241104.csv qwen1_5_72b_chat,LMSys Arena,947.9919390672214,[],chatbot_arena_241104.csv openchat_3_5_0106,LMSys Arena,956.5639851579056,[],chatbot_arena_241104.csv qwen1_5_4b_chat,LMSys Arena,857.8615305194531,[],chatbot_arena_241104.csv qwen1_5_7b_chat,LMSys Arena,937.5784150291832,[],chatbot_arena_241104.csv codellama_70b_instruct,LMSys Arena,873.7635218944325,[],chatbot_arena_241104.csv mistral_next,LMSys Arena,969.0249137331156,[],chatbot_arena_241104.csv gemma_2b_it,LMSys Arena,865.630898513726,[],chatbot_arena_241104.csv gemma_7b_it,LMSys Arena,913.3020846629596,[],chatbot_arena_241104.csv mistral_large_2402,LMSys Arena,939.5529442890696,[],chatbot_arena_241104.csv olmo_7b_instruct,LMSys Arena,875.880001693062,[],chatbot_arena_241104.csv claude_3_sonnet_20240229,LMSys Arena,970.6832692453124,[],chatbot_arena_241104.csv claude_3_opus_20240229,LMSys Arena,1021.9572137608476,[],chatbot_arena_241104.csv claude_3_haiku_20240307,LMSys Arena,946.756591266114,[],chatbot_arena_241104.csv starling_lm_7b_beta,LMSys Arena,967.1740802373936,[],chatbot_arena_241104.csv command_r,LMSys Arena,915.3923710382184,[],chatbot_arena_241104.csv dbrx_instructruct_preview,LMSys Arena,930.1149113654316,[],chatbot_arena_241104.csv qwen1_5_14b_chat,LMSys Arena,932.8461519507624,[],chatbot_arena_241104.csv qwen1_5_32b_chat,LMSys Arena,917.6067239158654,[],chatbot_arena_241104.csv command_r_plus,LMSys Arena,981.9316261444284,[],chatbot_arena_241104.csv gemma_1_1_7b_it,LMSys Arena,888.863535227059,[],chatbot_arena_241104.csv gpt_4_turbo_2024_04_09,LMSys Arena,1001.95083675947,[],chatbot_arena_241104.csv zephyr_orpo_141b_a35b_v0_1,LMSys Arena,992.2709969445071,[],chatbot_arena_241104.csv gemma_1_1_2b_it,LMSys Arena,839.3449619004468,[],chatbot_arena_241104.csv gemini_1_5_pro_api_0409_preview,LMSys Arena,1106.8697777575628,[],chatbot_arena_241104.csv reka_flash_21b_20240226_online,LMSys Arena,967.873277488609,[],chatbot_arena_241104.csv reka_flash_21b_20240226,LMSys Arena,939.8601363871352,[],chatbot_arena_241104.csv mixtral_8x22b_instruct_v0_1,LMSys 
Arena,911.463562145636,[],chatbot_arena_241104.csv llama3_8b_instruct,LMSys Arena,925.300077951389,[],chatbot_arena_241104.csv llama3_70b_instruct,LMSys Arena,987.92132812523,[],chatbot_arena_241104.csv phi_3_mini_128k_instruct,LMSys Arena,875.3830177408651,[],chatbot_arena_241104.csv snowflake_arctic_instruct,LMSys Arena,908.9578096804898,[],chatbot_arena_241104.csv reka_core_20240501,LMSys Arena,960.871641047353,[],chatbot_arena_241104.csv qwen1_5_110b_chat,LMSys Arena,970.825546150876,[],chatbot_arena_241104.csv qwen_max_0428,LMSys Arena,991.8829133949346,[],chatbot_arena_241104.csv gpt_4o_2024_05_13,LMSys Arena,1033.7736651812086,[],chatbot_arena_241104.csv yi_large_preview,LMSys Arena,1007.9055342457846,[],chatbot_arena_241104.csv glm_4_0116,LMSys Arena,996.2388680185244,[],chatbot_arena_241104.csv phi_3_mini_4k_instruct,LMSys Arena,875.486575120554,[],chatbot_arena_241104.csv gemini_advanced_0514,LMSys Arena,1034.5901919978594,[],chatbot_arena_241104.csv gemini_1_5_pro_api_0514,LMSys Arena,1006.938590226684,[],chatbot_arena_241104.csv gemini_1_5_flash_api_0514,LMSys Arena,988.426072144592,[],chatbot_arena_241104.csv yi_1_5_34b_chat,LMSys Arena,935.8573439301474,[],chatbot_arena_241104.csv phi_3_small_8k_instruct,LMSys Arena,877.7438151636035,[],chatbot_arena_241104.csv phi_3_medium_4k_instruct,LMSys Arena,866.7539620360035,[],chatbot_arena_241104.csv qwen2_72b_instruct,LMSys Arena,930.7722721046767,[],chatbot_arena_241104.csv yi_large,LMSys Arena,991.78684277118,[],chatbot_arena_241104.csv nemotron_4_340b_instruct,LMSys Arena,1011.0291063554424,[],chatbot_arena_241104.csv reka_flash_preview_20240611,LMSys Arena,937.4782906143832,[],chatbot_arena_241104.csv glm_4_0520,LMSys Arena,1012.3461462160476,[],chatbot_arena_241104.csv deepseek_coder_v2,LMSys Arena,968.7272337322494,[],chatbot_arena_241104.csv claude_3_5_sonnet_20240620,LMSys Arena,1026.059060767346,[],chatbot_arena_241104.csv gemma_2_9b_it,LMSys Arena,950.0755523266928,[],chatbot_arena_241104.csv gemma_2_27b_it,LMSys Arena,977.8470656596852,[],chatbot_arena_241104.csv phi_3_mini_4k_instruct_june_2024,LMSys Arena,860.4379813139254,[],chatbot_arena_241104.csv deepseek_v2_api_0628,LMSys Arena,989.5345921181048,[],chatbot_arena_241104.csv athene_70b_0725,LMSys Arena,1020.8101504540734,[],chatbot_arena_241104.csv gemini_1_5_pro_exp_0801,LMSys Arena,1074.9371768117894,[],chatbot_arena_241104.csv gpt_4o_mini_2024_07_18,LMSys Arena,1026.236414405759,[],chatbot_arena_241104.csv deepseek_coder_v2_0724,LMSys Arena,990.94288841608,[],chatbot_arena_241104.csv gemma_2_2b_it,LMSys Arena,906.320768087545,[],chatbot_arena_241104.csv llama3_1_8b_instruct,LMSys Arena,949.3125757952852,[],chatbot_arena_241104.csv llama3_1_405b_instruct,LMSys Arena,1005.4497444176718,[],chatbot_arena_241104.csv llama3_1_70b_instruct,LMSys Arena,1034.402372751568,[],chatbot_arena_241104.csv mistral_large_2407,LMSys Arena,1005.1771608005986,[],chatbot_arena_241104.csv reka_core_20240722,LMSys Arena,1006.982150804202,[],chatbot_arena_241104.csv reka_flash_20240722,LMSys Arena,950.554264764622,[],chatbot_arena_241104.csv chatgpt_4o_latest,LMSys Arena,1073.7429047571106,[],chatbot_arena_241104.csv gpt_4o_2024_08_06,LMSys Arena,1032.650635133711,[],chatbot_arena_241104.csv alphamonarch_7b,HF OpenLLM v2,17.59,,hf_open_llm_v2_240829.csv alphamonarch_7b,HFv2 BBH,23.95,,hf_open_llm_v2_240829.csv alphamonarch_7b,HFv2 GPQA,2.68,,hf_open_llm_v2_240829.csv alphamonarch_7b,HFv2 IFEval,49.39,,hf_open_llm_v2_240829.csv alphamonarch_7b,HFv2 MMLU Pro,16.36,,hf_open_llm_v2_240829.csv 
alphamonarch_7b,HFv2 Math Level 5,3.85,,hf_open_llm_v2_240829.csv alphamonarch_7b,HFv2 MuSR,9.32,,hf_open_llm_v2_240829.csv arcee_spark,HF OpenLLM v2,25.33,,hf_open_llm_v2_240829.csv arcee_spark,HFv2 BBH,36.92,,hf_open_llm_v2_240829.csv arcee_spark,HFv2 GPQA,7.49,,hf_open_llm_v2_240829.csv arcee_spark,HFv2 IFEval,57.18,,hf_open_llm_v2_240829.csv arcee_spark,HFv2 MMLU Pro,31.26,,hf_open_llm_v2_240829.csv arcee_spark,HFv2 Math Level 5,10.73,,hf_open_llm_v2_240829.csv arcee_spark,HFv2 MuSR,8.4,,hf_open_llm_v2_240829.csv autotrain_llama3_orpo_v2,HF OpenLLM v2,12.2,,hf_open_llm_v2_240829.csv autotrain_llama3_orpo_v2,HFv2 BBH,4.38,,hf_open_llm_v2_240829.csv autotrain_llama3_orpo_v2,HFv2 GPQA,2.24,,hf_open_llm_v2_240829.csv autotrain_llama3_orpo_v2,HFv2 IFEval,43.72,,hf_open_llm_v2_240829.csv autotrain_llama3_orpo_v2,HFv2 MMLU Pro,13.54,,hf_open_llm_v2_240829.csv autotrain_llama3_orpo_v2,HFv2 Math Level 5,4.23,,hf_open_llm_v2_240829.csv autotrain_llama3_orpo_v2,HFv2 MuSR,5.1,,hf_open_llm_v2_240829.csv aya_23_35b,HF OpenLLM v2,24.62,,hf_open_llm_v2_240829.csv aya_23_35b,HFv2 BBH,34.86,,hf_open_llm_v2_240829.csv aya_23_35b,HFv2 GPQA,5.93,,hf_open_llm_v2_240829.csv aya_23_35b,HFv2 IFEval,64.62,,hf_open_llm_v2_240829.csv aya_23_35b,HFv2 MMLU Pro,26.18,,hf_open_llm_v2_240829.csv aya_23_35b,HFv2 Math Level 5,2.64,,hf_open_llm_v2_240829.csv aya_23_35b,HFv2 MuSR,13.47,,hf_open_llm_v2_240829.csv aya_23_8b,HF OpenLLM v2,15.97,,hf_open_llm_v2_240829.csv aya_23_8b,HFv2 BBH,20.2,,hf_open_llm_v2_240829.csv aya_23_8b,HFv2 GPQA,4.59,,hf_open_llm_v2_240829.csv aya_23_8b,HFv2 IFEval,46.99,,hf_open_llm_v2_240829.csv aya_23_8b,HFv2 MMLU Pro,14.2,,hf_open_llm_v2_240829.csv aya_23_8b,HFv2 Math Level 5,1.44,,hf_open_llm_v2_240829.csv aya_23_8b,HFv2 MuSR,8.42,,hf_open_llm_v2_240829.csv bagelmisterytour_v2_8x7b,HF OpenLLM v2,24.55,,hf_open_llm_v2_240829.csv bagelmisterytour_v2_8x7b,HFv2 BBH,31.37,,hf_open_llm_v2_240829.csv bagelmisterytour_v2_8x7b,HFv2 GPQA,7.72,,hf_open_llm_v2_240829.csv bagelmisterytour_v2_8x7b,HFv2 IFEval,62.62,,hf_open_llm_v2_240829.csv bagelmisterytour_v2_8x7b,HFv2 MMLU Pro,27.56,,hf_open_llm_v2_240829.csv bagelmisterytour_v2_8x7b,HFv2 Math Level 5,7.7,,hf_open_llm_v2_240829.csv bagelmisterytour_v2_8x7b,HFv2 MuSR,10.32,,hf_open_llm_v2_240829.csv barcenas_14b_phi_3_medium_orpo,HF OpenLLM v2,31.42,,hf_open_llm_v2_240829.csv barcenas_14b_phi_3_medium_orpo,HFv2 BBH,51.03,,hf_open_llm_v2_240829.csv barcenas_14b_phi_3_medium_orpo,HFv2 GPQA,10.18,,hf_open_llm_v2_240829.csv barcenas_14b_phi_3_medium_orpo,HFv2 IFEval,47.99,,hf_open_llm_v2_240829.csv barcenas_14b_phi_3_medium_orpo,HFv2 MMLU Pro,41.37,,hf_open_llm_v2_240829.csv barcenas_14b_phi_3_medium_orpo,HFv2 Math Level 5,17.45,,hf_open_llm_v2_240829.csv barcenas_14b_phi_3_medium_orpo,HFv2 MuSR,20.53,,hf_open_llm_v2_240829.csv barcenas_llama3_8b_orpo,HF OpenLLM v2,26.38,,hf_open_llm_v2_240829.csv barcenas_llama3_8b_orpo,HFv2 BBH,28.6,,hf_open_llm_v2_240829.csv barcenas_llama3_8b_orpo,HFv2 GPQA,7.61,,hf_open_llm_v2_240829.csv barcenas_llama3_8b_orpo,HFv2 IFEval,73.72,,hf_open_llm_v2_240829.csv barcenas_llama3_8b_orpo,HFv2 MMLU Pro,31.44,,hf_open_llm_v2_240829.csv barcenas_llama3_8b_orpo,HFv2 Math Level 5,5.74,,hf_open_llm_v2_240829.csv barcenas_llama3_8b_orpo,HFv2 MuSR,11.17,,hf_open_llm_v2_240829.csv bloom_1b1,HF OpenLLM v2,3.96,,hf_open_llm_v2_240829.csv bloom_1b1,HFv2 BBH,4.04,,hf_open_llm_v2_240829.csv bloom_1b1,HFv2 GPQA,1.23,,hf_open_llm_v2_240829.csv bloom_1b1,HFv2 IFEval,13.73,,hf_open_llm_v2_240829.csv bloom_1b1,HFv2 MMLU 
Pro,1.2,,hf_open_llm_v2_240829.csv bloom_1b1,HFv2 Math Level 5,0.15,,hf_open_llm_v2_240829.csv bloom_1b1,HFv2 MuSR,3.42,,hf_open_llm_v2_240829.csv bloom_1b7,HF OpenLLM v2,3.97,,hf_open_llm_v2_240829.csv bloom_1b7,HFv2 BBH,4.4,,hf_open_llm_v2_240829.csv bloom_1b7,HFv2 GPQA,1.12,,hf_open_llm_v2_240829.csv bloom_1b7,HFv2 IFEval,10.44,,hf_open_llm_v2_240829.csv bloom_1b7,HFv2 MMLU Pro,0.96,,hf_open_llm_v2_240829.csv bloom_1b7,HFv2 Math Level 5,0.08,,hf_open_llm_v2_240829.csv bloom_1b7,HFv2 MuSR,6.84,,hf_open_llm_v2_240829.csv bloom_3b,HF OpenLLM v2,4.26,,hf_open_llm_v2_240829.csv bloom_3b,HFv2 BBH,3.42,,hf_open_llm_v2_240829.csv bloom_3b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv bloom_3b,HFv2 IFEval,12.71,,hf_open_llm_v2_240829.csv bloom_3b,HFv2 MMLU Pro,1.48,,hf_open_llm_v2_240829.csv bloom_3b,HFv2 Math Level 5,0.08,,hf_open_llm_v2_240829.csv bloom_3b,HFv2 MuSR,7.89,,hf_open_llm_v2_240829.csv bloom_560m,HF OpenLLM v2,3.46,,hf_open_llm_v2_240829.csv bloom_560m,HFv2 BBH,2.89,,hf_open_llm_v2_240829.csv bloom_560m,HFv2 GPQA,1.57,,hf_open_llm_v2_240829.csv bloom_560m,HFv2 IFEval,6.2,,hf_open_llm_v2_240829.csv bloom_560m,HFv2 MMLU Pro,1.83,,hf_open_llm_v2_240829.csv bloom_560m,HFv2 Math Level 5,0.08,,hf_open_llm_v2_240829.csv bloom_560m,HFv2 MuSR,8.19,,hf_open_llm_v2_240829.csv bloom_7b1,HF OpenLLM v2,3.71,,hf_open_llm_v2_240829.csv bloom_7b1,HFv2 BBH,4.04,,hf_open_llm_v2_240829.csv bloom_7b1,HFv2 GPQA,1.9,,hf_open_llm_v2_240829.csv bloom_7b1,HFv2 IFEval,13.22,,hf_open_llm_v2_240829.csv bloom_7b1,HFv2 MMLU Pro,1.16,,hf_open_llm_v2_240829.csv bloom_7b1,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv bloom_7b1,HFv2 MuSR,1.92,,hf_open_llm_v2_240829.csv blossom_v5_1_34b,HF OpenLLM v2,28.39,,hf_open_llm_v2_240829.csv blossom_v5_1_34b,HFv2 BBH,44.15,,hf_open_llm_v2_240829.csv blossom_v5_1_34b,HFv2 GPQA,7.94,,hf_open_llm_v2_240829.csv blossom_v5_1_34b,HFv2 IFEval,56.97,,hf_open_llm_v2_240829.csv blossom_v5_1_34b,HFv2 MMLU Pro,39.53,,hf_open_llm_v2_240829.csv blossom_v5_1_34b,HFv2 Math Level 5,14.43,,hf_open_llm_v2_240829.csv blossom_v5_1_34b,HFv2 MuSR,7.3,,hf_open_llm_v2_240829.csv blossom_v5_1_9b,HF OpenLLM v2,24.68,,hf_open_llm_v2_240829.csv blossom_v5_1_9b,HFv2 BBH,34.2,,hf_open_llm_v2_240829.csv blossom_v5_1_9b,HFv2 GPQA,11.41,,hf_open_llm_v2_240829.csv blossom_v5_1_9b,HFv2 IFEval,50.86,,hf_open_llm_v2_240829.csv blossom_v5_1_9b,HFv2 MMLU Pro,33.1,,hf_open_llm_v2_240829.csv blossom_v5_1_9b,HFv2 Math Level 5,10.5,,hf_open_llm_v2_240829.csv blossom_v5_1_9b,HFv2 MuSR,8.02,,hf_open_llm_v2_240829.csv btlm_7b_base_v0_2,HF OpenLLM v2,8.84,,hf_open_llm_v2_240829.csv btlm_7b_base_v0_2,HFv2 BBH,16.19,,hf_open_llm_v2_240829.csv btlm_7b_base_v0_2,HFv2 GPQA,0.45,,hf_open_llm_v2_240829.csv btlm_7b_base_v0_2,HFv2 IFEval,14.83,,hf_open_llm_v2_240829.csv btlm_7b_base_v0_2,HFv2 MMLU Pro,15.0,,hf_open_llm_v2_240829.csv btlm_7b_base_v0_2,HFv2 Math Level 5,1.06,,hf_open_llm_v2_240829.csv btlm_7b_base_v0_2,HFv2 MuSR,5.54,,hf_open_llm_v2_240829.csv c4ai_command_r_plus,HF OpenLLM v2,30.86,,hf_open_llm_v2_240829.csv c4ai_command_r_plus,HFv2 BBH,39.92,,hf_open_llm_v2_240829.csv c4ai_command_r_plus,HFv2 GPQA,7.38,,hf_open_llm_v2_240829.csv c4ai_command_r_plus,HFv2 IFEval,76.64,,hf_open_llm_v2_240829.csv c4ai_command_r_plus,HFv2 MMLU Pro,33.24,,hf_open_llm_v2_240829.csv c4ai_command_r_plus,HFv2 Math Level 5,7.55,,hf_open_llm_v2_240829.csv c4ai_command_r_plus,HFv2 MuSR,20.42,,hf_open_llm_v2_240829.csv c4ai_command_r_v0_1,HF OpenLLM v2,25.35,,hf_open_llm_v2_240829.csv c4ai_command_r_v0_1,HFv2 BBH,34.56,,hf_open_llm_v2_240829.csv 
c4ai_command_r_v0_1,HFv2 GPQA,7.61,,hf_open_llm_v2_240829.csv c4ai_command_r_v0_1,HFv2 IFEval,67.48,,hf_open_llm_v2_240829.csv c4ai_command_r_v0_1,HFv2 MMLU Pro,26.33,,hf_open_llm_v2_240829.csv c4ai_command_r_v0_1,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv c4ai_command_r_v0_1,HFv2 MuSR,16.13,,hf_open_llm_v2_240829.csv calm3_22b_chat,HF OpenLLM v2,21.27,,hf_open_llm_v2_240829.csv calm3_22b_chat,HFv2 BBH,29.52,,hf_open_llm_v2_240829.csv calm3_22b_chat,HFv2 GPQA,3.58,,hf_open_llm_v2_240829.csv calm3_22b_chat,HFv2 IFEval,50.91,,hf_open_llm_v2_240829.csv calm3_22b_chat,HFv2 MMLU Pro,21.66,,hf_open_llm_v2_240829.csv calm3_22b_chat,HFv2 Math Level 5,5.89,,hf_open_llm_v2_240829.csv calm3_22b_chat,HFv2 MuSR,16.08,,hf_open_llm_v2_240829.csv calme_2_1_phi3_4b,HF OpenLLM v2,24.51,,hf_open_llm_v2_240829.csv calme_2_1_phi3_4b,HFv2 BBH,38.12,,hf_open_llm_v2_240829.csv calme_2_1_phi3_4b,HFv2 GPQA,10.63,,hf_open_llm_v2_240829.csv calme_2_1_phi3_4b,HFv2 IFEval,55.25,,hf_open_llm_v2_240829.csv calme_2_1_phi3_4b,HFv2 MMLU Pro,30.51,,hf_open_llm_v2_240829.csv calme_2_1_phi3_4b,HFv2 Math Level 5,4.31,,hf_open_llm_v2_240829.csv calme_2_1_phi3_4b,HFv2 MuSR,8.26,,hf_open_llm_v2_240829.csv calme_2_1_qwen2_72b,HF OpenLLM v2,43.61,,hf_open_llm_v2_240829.csv calme_2_1_qwen2_72b,HFv2 BBH,57.33,,hf_open_llm_v2_240829.csv calme_2_1_qwen2_72b,HFv2 GPQA,17.45,,hf_open_llm_v2_240829.csv calme_2_1_qwen2_72b,HFv2 IFEval,81.63,,hf_open_llm_v2_240829.csv calme_2_1_qwen2_72b,HFv2 MMLU Pro,49.05,,hf_open_llm_v2_240829.csv calme_2_1_qwen2_72b,HFv2 Math Level 5,36.03,,hf_open_llm_v2_240829.csv calme_2_1_qwen2_72b,HFv2 MuSR,20.15,,hf_open_llm_v2_240829.csv calme_2_2_llama3_70b,HF OpenLLM v2,37.98,,hf_open_llm_v2_240829.csv calme_2_2_llama3_70b,HFv2 BBH,48.57,,hf_open_llm_v2_240829.csv calme_2_2_llama3_70b,HFv2 GPQA,12.19,,hf_open_llm_v2_240829.csv calme_2_2_llama3_70b,HFv2 IFEval,82.08,,hf_open_llm_v2_240829.csv calme_2_2_llama3_70b,HFv2 MMLU Pro,46.74,,hf_open_llm_v2_240829.csv calme_2_2_llama3_70b,HFv2 Math Level 5,22.96,,hf_open_llm_v2_240829.csv calme_2_2_llama3_70b,HFv2 MuSR,15.3,,hf_open_llm_v2_240829.csv calme_2_2_phi3_4b,HF OpenLLM v2,23.21,,hf_open_llm_v2_240829.csv calme_2_2_phi3_4b,HFv2 BBH,37.73,,hf_open_llm_v2_240829.csv calme_2_2_phi3_4b,HFv2 GPQA,9.51,,hf_open_llm_v2_240829.csv calme_2_2_phi3_4b,HFv2 IFEval,50.69,,hf_open_llm_v2_240829.csv calme_2_2_phi3_4b,HFv2 MMLU Pro,31.27,,hf_open_llm_v2_240829.csv calme_2_2_phi3_4b,HFv2 Math Level 5,2.34,,hf_open_llm_v2_240829.csv calme_2_2_phi3_4b,HFv2 MuSR,7.7,,hf_open_llm_v2_240829.csv calme_2_2_qwen2_72b,HF OpenLLM v2,43.4,,hf_open_llm_v2_240829.csv calme_2_2_qwen2_72b,HFv2 BBH,56.8,,hf_open_llm_v2_240829.csv calme_2_2_qwen2_72b,HFv2 GPQA,16.55,,hf_open_llm_v2_240829.csv calme_2_2_qwen2_72b,HFv2 IFEval,80.08,,hf_open_llm_v2_240829.csv calme_2_2_qwen2_72b,HFv2 MMLU Pro,49.27,,hf_open_llm_v2_240829.csv calme_2_2_qwen2_72b,HFv2 Math Level 5,41.16,,hf_open_llm_v2_240829.csv calme_2_2_qwen2_72b,HFv2 MuSR,16.52,,hf_open_llm_v2_240829.csv calme_2_3_phi3_4b,HF OpenLLM v2,23.02,,hf_open_llm_v2_240829.csv calme_2_3_phi3_4b,HFv2 BBH,37.66,,hf_open_llm_v2_240829.csv calme_2_3_phi3_4b,HFv2 GPQA,9.06,,hf_open_llm_v2_240829.csv calme_2_3_phi3_4b,HFv2 IFEval,49.26,,hf_open_llm_v2_240829.csv calme_2_3_phi3_4b,HFv2 MMLU Pro,31.42,,hf_open_llm_v2_240829.csv calme_2_3_phi3_4b,HFv2 Math Level 5,2.95,,hf_open_llm_v2_240829.csv calme_2_3_phi3_4b,HFv2 MuSR,7.75,,hf_open_llm_v2_240829.csv calme_2_4_llama3_70b,HF OpenLLM v2,32.18,,hf_open_llm_v2_240829.csv calme_2_4_llama3_70b,HFv2 
BBH,48.4,,hf_open_llm_v2_240829.csv calme_2_4_llama3_70b,HFv2 GPQA,11.97,,hf_open_llm_v2_240829.csv calme_2_4_llama3_70b,HFv2 IFEval,50.27,,hf_open_llm_v2_240829.csv calme_2_4_llama3_70b,HFv2 MMLU Pro,46.71,,hf_open_llm_v2_240829.csv calme_2_4_llama3_70b,HFv2 Math Level 5,22.66,,hf_open_llm_v2_240829.csv calme_2_4_llama3_70b,HFv2 MuSR,13.1,,hf_open_llm_v2_240829.csv carbonbeagle_11b,HF OpenLLM v2,22.36,,hf_open_llm_v2_240829.csv carbonbeagle_11b,HFv2 BBH,33.06,,hf_open_llm_v2_240829.csv carbonbeagle_11b,HFv2 GPQA,6.94,,hf_open_llm_v2_240829.csv carbonbeagle_11b,HFv2 IFEval,54.15,,hf_open_llm_v2_240829.csv carbonbeagle_11b,HFv2 MMLU Pro,25.29,,hf_open_llm_v2_240829.csv carbonbeagle_11b,HFv2 Math Level 5,5.51,,hf_open_llm_v2_240829.csv carbonbeagle_11b,HFv2 MuSR,9.19,,hf_open_llm_v2_240829.csv carbonbeagle_11b_truthy,HF OpenLLM v2,21.29,,hf_open_llm_v2_240829.csv carbonbeagle_11b_truthy,HFv2 BBH,33.99,,hf_open_llm_v2_240829.csv carbonbeagle_11b_truthy,HFv2 GPQA,6.6,,hf_open_llm_v2_240829.csv carbonbeagle_11b_truthy,HFv2 IFEval,52.12,,hf_open_llm_v2_240829.csv carbonbeagle_11b_truthy,HFv2 MMLU Pro,26.19,,hf_open_llm_v2_240829.csv carbonbeagle_11b_truthy,HFv2 Math Level 5,4.76,,hf_open_llm_v2_240829.csv carbonbeagle_11b_truthy,HFv2 MuSR,4.11,,hf_open_llm_v2_240829.csv chocolatine_3b_instruct_dpo_revised,HF OpenLLM v2,27.63,,hf_open_llm_v2_240829.csv chocolatine_3b_instruct_dpo_revised,HFv2 BBH,37.16,,hf_open_llm_v2_240829.csv chocolatine_3b_instruct_dpo_revised,HFv2 GPQA,9.62,,hf_open_llm_v2_240829.csv chocolatine_3b_instruct_dpo_revised,HFv2 IFEval,56.23,,hf_open_llm_v2_240829.csv chocolatine_3b_instruct_dpo_revised,HFv2 MMLU Pro,33.21,,hf_open_llm_v2_240829.csv chocolatine_3b_instruct_dpo_revised,HFv2 Math Level 5,14.5,,hf_open_llm_v2_240829.csv chocolatine_3b_instruct_dpo_revised,HFv2 MuSR,15.1,,hf_open_llm_v2_240829.csv chocolatine_8b_instruct_dpo_v1_0,HF OpenLLM v2,22.03,,hf_open_llm_v2_240829.csv chocolatine_8b_instruct_dpo_v1_0,HFv2 BBH,29.96,,hf_open_llm_v2_240829.csv chocolatine_8b_instruct_dpo_v1_0,HFv2 GPQA,6.71,,hf_open_llm_v2_240829.csv chocolatine_8b_instruct_dpo_v1_0,HFv2 IFEval,47.33,,hf_open_llm_v2_240829.csv chocolatine_8b_instruct_dpo_v1_0,HFv2 MMLU Pro,31.88,,hf_open_llm_v2_240829.csv chocolatine_8b_instruct_dpo_v1_0,HFv2 Math Level 5,7.55,,hf_open_llm_v2_240829.csv chocolatine_8b_instruct_dpo_v1_0,HFv2 MuSR,8.74,,hf_open_llm_v2_240829.csv codegemma_1_1_2b,HF OpenLLM v2,7.02,,hf_open_llm_v2_240829.csv codegemma_1_1_2b,HFv2 BBH,7.55,,hf_open_llm_v2_240829.csv codegemma_1_1_2b,HFv2 GPQA,2.01,,hf_open_llm_v2_240829.csv codegemma_1_1_2b,HFv2 IFEval,22.94,,hf_open_llm_v2_240829.csv codegemma_1_1_2b,HFv2 MMLU Pro,3.09,,hf_open_llm_v2_240829.csv codegemma_1_1_2b,HFv2 Math Level 5,0.6,,hf_open_llm_v2_240829.csv codegemma_1_1_2b,HFv2 MuSR,5.93,,hf_open_llm_v2_240829.csv configurable_llama3_1_8b_instruct,HF OpenLLM v2,27.77,,hf_open_llm_v2_240829.csv configurable_llama3_1_8b_instruct,HFv2 BBH,29.66,,hf_open_llm_v2_240829.csv configurable_llama3_1_8b_instruct,HFv2 GPQA,3.24,,hf_open_llm_v2_240829.csv configurable_llama3_1_8b_instruct,HFv2 IFEval,83.12,,hf_open_llm_v2_240829.csv configurable_llama3_1_8b_instruct,HFv2 MMLU Pro,28.8,,hf_open_llm_v2_240829.csv configurable_llama3_1_8b_instruct,HFv2 Math Level 5,15.86,,hf_open_llm_v2_240829.csv configurable_llama3_1_8b_instruct,HFv2 MuSR,5.93,,hf_open_llm_v2_240829.csv configurable_yi_1_5_9b_chat,HF OpenLLM v2,23.77,,hf_open_llm_v2_240829.csv configurable_yi_1_5_9b_chat,HFv2 BBH,35.33,,hf_open_llm_v2_240829.csv 
configurable_yi_1_5_9b_chat,HFv2 GPQA,12.42,,hf_open_llm_v2_240829.csv configurable_yi_1_5_9b_chat,HFv2 IFEval,43.23,,hf_open_llm_v2_240829.csv configurable_yi_1_5_9b_chat,HFv2 MMLU Pro,33.5,,hf_open_llm_v2_240829.csv configurable_yi_1_5_9b_chat,HFv2 Math Level 5,6.12,,hf_open_llm_v2_240829.csv configurable_yi_1_5_9b_chat,HFv2 MuSR,12.02,,hf_open_llm_v2_240829.csv configurablebeagle_11b,HF OpenLLM v2,22.52,,hf_open_llm_v2_240829.csv configurablebeagle_11b,HFv2 BBH,32.39,,hf_open_llm_v2_240829.csv configurablebeagle_11b,HFv2 GPQA,6.94,,hf_open_llm_v2_240829.csv configurablebeagle_11b,HFv2 IFEval,58.34,,hf_open_llm_v2_240829.csv configurablebeagle_11b,HFv2 MMLU Pro,26.38,,hf_open_llm_v2_240829.csv configurablebeagle_11b,HFv2 Math Level 5,3.7,,hf_open_llm_v2_240829.csv configurablebeagle_11b,HFv2 MuSR,7.38,,hf_open_llm_v2_240829.csv configurablehermes_7b,HF OpenLLM v2,19.46,,hf_open_llm_v2_240829.csv configurablehermes_7b,HFv2 BBH,23.16,,hf_open_llm_v2_240829.csv configurablehermes_7b,HFv2 GPQA,3.58,,hf_open_llm_v2_240829.csv configurablehermes_7b,HFv2 IFEval,54.11,,hf_open_llm_v2_240829.csv configurablehermes_7b,HFv2 MMLU Pro,22.5,,hf_open_llm_v2_240829.csv configurablehermes_7b,HFv2 Math Level 5,4.31,,hf_open_llm_v2_240829.csv configurablehermes_7b,HFv2 MuSR,9.11,,hf_open_llm_v2_240829.csv configurablesolar_10_7b,HF OpenLLM v2,19.05,,hf_open_llm_v2_240829.csv configurablesolar_10_7b,HFv2 BBH,27.45,,hf_open_llm_v2_240829.csv configurablesolar_10_7b,HFv2 GPQA,6.49,,hf_open_llm_v2_240829.csv configurablesolar_10_7b,HFv2 IFEval,51.0,,hf_open_llm_v2_240829.csv configurablesolar_10_7b,HFv2 MMLU Pro,24.15,,hf_open_llm_v2_240829.csv configurablesolar_10_7b,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv configurablesolar_10_7b,HFv2 MuSR,5.19,,hf_open_llm_v2_240829.csv dbrx_instructruct,HF OpenLLM v2,25.2,,hf_open_llm_v2_240829.csv dbrx_instructruct,HFv2 BBH,35.96,,hf_open_llm_v2_240829.csv dbrx_instructruct,HFv2 GPQA,12.19,,hf_open_llm_v2_240829.csv dbrx_instructruct,HFv2 IFEval,54.16,,hf_open_llm_v2_240829.csv dbrx_instructruct,HFv2 MMLU Pro,29.81,,hf_open_llm_v2_240829.csv dbrx_instructruct,HFv2 Math Level 5,6.87,,hf_open_llm_v2_240829.csv dbrx_instructruct,HFv2 MuSR,12.2,,hf_open_llm_v2_240829.csv dclm_7b,HF OpenLLM v2,13.99,,hf_open_llm_v2_240829.csv dclm_7b,HFv2 BBH,19.76,,hf_open_llm_v2_240829.csv dclm_7b,HFv2 GPQA,8.72,,hf_open_llm_v2_240829.csv dclm_7b,HFv2 IFEval,21.73,,hf_open_llm_v2_240829.csv dclm_7b,HFv2 MMLU Pro,23.45,,hf_open_llm_v2_240829.csv dclm_7b,HFv2 Math Level 5,2.95,,hf_open_llm_v2_240829.csv dclm_7b,HFv2 MuSR,7.31,,hf_open_llm_v2_240829.csv decilm_7b,HF OpenLLM v2,14.95,,hf_open_llm_v2_240829.csv decilm_7b,HFv2 BBH,21.25,,hf_open_llm_v2_240829.csv decilm_7b,HFv2 GPQA,6.04,,hf_open_llm_v2_240829.csv decilm_7b,HFv2 IFEval,28.13,,hf_open_llm_v2_240829.csv decilm_7b,HFv2 MMLU Pro,18.8,,hf_open_llm_v2_240829.csv decilm_7b,HFv2 Math Level 5,2.42,,hf_open_llm_v2_240829.csv decilm_7b,HFv2 MuSR,13.05,,hf_open_llm_v2_240829.csv decilm_7b_instruct,HF OpenLLM v2,17.43,,hf_open_llm_v2_240829.csv decilm_7b_instruct,HFv2 BBH,23.89,,hf_open_llm_v2_240829.csv decilm_7b_instruct,HFv2 GPQA,5.26,,hf_open_llm_v2_240829.csv decilm_7b_instruct,HFv2 IFEval,48.8,,hf_open_llm_v2_240829.csv decilm_7b_instruct,HFv2 MMLU Pro,17.87,,hf_open_llm_v2_240829.csv decilm_7b_instruct,HFv2 Math Level 5,2.79,,hf_open_llm_v2_240829.csv decilm_7b_instruct,HFv2 MuSR,5.99,,hf_open_llm_v2_240829.csv deepseek_llm_67b_chat,HF OpenLLM v2,26.87,,hf_open_llm_v2_240829.csv deepseek_llm_67b_chat,HFv2 
BBH,33.23,,hf_open_llm_v2_240829.csv deepseek_llm_67b_chat,HFv2 GPQA,8.84,,hf_open_llm_v2_240829.csv deepseek_llm_67b_chat,HFv2 IFEval,55.87,,hf_open_llm_v2_240829.csv deepseek_llm_67b_chat,HFv2 MMLU Pro,32.71,,hf_open_llm_v2_240829.csv deepseek_llm_67b_chat,HFv2 Math Level 5,6.65,,hf_open_llm_v2_240829.csv deepseek_llm_67b_chat,HFv2 MuSR,23.93,,hf_open_llm_v2_240829.csv deepseek_llm_7b_base,HF OpenLLM v2,8.1,,hf_open_llm_v2_240829.csv deepseek_llm_7b_base,HFv2 BBH,9.77,,hf_open_llm_v2_240829.csv deepseek_llm_7b_base,HFv2 GPQA,3.13,,hf_open_llm_v2_240829.csv deepseek_llm_7b_base,HFv2 IFEval,21.79,,hf_open_llm_v2_240829.csv deepseek_llm_7b_base,HFv2 MMLU Pro,8.96,,hf_open_llm_v2_240829.csv deepseek_llm_7b_base,HFv2 Math Level 5,1.21,,hf_open_llm_v2_240829.csv deepseek_llm_7b_base,HFv2 MuSR,3.76,,hf_open_llm_v2_240829.csv deepseek_llm_7b_chat,HF OpenLLM v2,14.77,,hf_open_llm_v2_240829.csv deepseek_llm_7b_chat,HFv2 BBH,11.26,,hf_open_llm_v2_240829.csv deepseek_llm_7b_chat,HFv2 GPQA,2.13,,hf_open_llm_v2_240829.csv deepseek_llm_7b_chat,HFv2 IFEval,41.71,,hf_open_llm_v2_240829.csv deepseek_llm_7b_chat,HFv2 MMLU Pro,12.59,,hf_open_llm_v2_240829.csv deepseek_llm_7b_chat,HFv2 Math Level 5,1.74,,hf_open_llm_v2_240829.csv deepseek_llm_7b_chat,HFv2 MuSR,19.21,,hf_open_llm_v2_240829.csv deepseek_moe_16b_base,HF OpenLLM v2,7.37,,hf_open_llm_v2_240829.csv deepseek_moe_16b_base,HFv2 BBH,8.36,,hf_open_llm_v2_240829.csv deepseek_moe_16b_base,HFv2 GPQA,0.56,,hf_open_llm_v2_240829.csv deepseek_moe_16b_base,HFv2 IFEval,24.5,,hf_open_llm_v2_240829.csv deepseek_moe_16b_base,HFv2 MMLU Pro,5.61,,hf_open_llm_v2_240829.csv deepseek_moe_16b_base,HFv2 Math Level 5,1.81,,hf_open_llm_v2_240829.csv deepseek_moe_16b_base,HFv2 MuSR,3.36,,hf_open_llm_v2_240829.csv deepseek_moe_16b_chat,HF OpenLLM v2,10.14,,hf_open_llm_v2_240829.csv deepseek_moe_16b_chat,HFv2 BBH,6.57,,hf_open_llm_v2_240829.csv deepseek_moe_16b_chat,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv deepseek_moe_16b_chat,HFv2 IFEval,36.63,,hf_open_llm_v2_240829.csv deepseek_moe_16b_chat,HFv2 MMLU Pro,10.71,,hf_open_llm_v2_240829.csv deepseek_moe_16b_chat,HFv2 Math Level 5,1.66,,hf_open_llm_v2_240829.csv deepseek_moe_16b_chat,HFv2 MuSR,5.26,,hf_open_llm_v2_240829.csv dialogpt_medium,HF OpenLLM v2,5.25,,hf_open_llm_v2_240829.csv dialogpt_medium,HFv2 BBH,2.56,,hf_open_llm_v2_240829.csv dialogpt_medium,HFv2 GPQA,0.56,,hf_open_llm_v2_240829.csv dialogpt_medium,HFv2 IFEval,14.79,,hf_open_llm_v2_240829.csv dialogpt_medium,HFv2 MMLU Pro,1.32,,hf_open_llm_v2_240829.csv dialogpt_medium,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv dialogpt_medium,HFv2 MuSR,12.28,,hf_open_llm_v2_240829.csv dictalm2_0_instruct,HF OpenLLM v2,16.58,,hf_open_llm_v2_240829.csv dictalm2_0_instruct,HFv2 BBH,19.69,,hf_open_llm_v2_240829.csv dictalm2_0_instruct,HFv2 GPQA,7.05,,hf_open_llm_v2_240829.csv dictalm2_0_instruct,HFv2 IFEval,44.12,,hf_open_llm_v2_240829.csv dictalm2_0_instruct,HFv2 MMLU Pro,17.83,,hf_open_llm_v2_240829.csv dictalm2_0_instruct,HFv2 Math Level 5,1.06,,hf_open_llm_v2_240829.csv dictalm2_0_instruct,HFv2 MuSR,9.72,,hf_open_llm_v2_240829.csv distilgpt2,HF OpenLLM v2,3.9,,hf_open_llm_v2_240829.csv distilgpt2,HFv2 BBH,2.84,,hf_open_llm_v2_240829.csv distilgpt2,HFv2 GPQA,1.23,,hf_open_llm_v2_240829.csv distilgpt2,HFv2 IFEval,6.11,,hf_open_llm_v2_240829.csv distilgpt2,HFv2 MMLU Pro,2.08,,hf_open_llm_v2_240829.csv distilgpt2,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv distilgpt2,HFv2 MuSR,11.16,,hf_open_llm_v2_240829.csv dolly_v1_6b,HF OpenLLM v2,6.89,,hf_open_llm_v2_240829.csv 
dolly_v1_6b,HFv2 BBH,4.78,,hf_open_llm_v2_240829.csv dolly_v1_6b,HFv2 GPQA,1.9,,hf_open_llm_v2_240829.csv dolly_v1_6b,HFv2 IFEval,22.24,,hf_open_llm_v2_240829.csv dolly_v1_6b,HFv2 MMLU Pro,2.95,,hf_open_llm_v2_240829.csv dolly_v1_6b,HFv2 Math Level 5,1.36,,hf_open_llm_v2_240829.csv dolly_v1_6b,HFv2 MuSR,8.12,,hf_open_llm_v2_240829.csv dolphin_2_9_2_phi_3_medium,HF OpenLLM v2,25.66,,hf_open_llm_v2_240829.csv dolphin_2_9_2_phi_3_medium,HFv2 BBH,49.72,,hf_open_llm_v2_240829.csv dolphin_2_9_2_phi_3_medium,HFv2 GPQA,10.29,,hf_open_llm_v2_240829.csv dolphin_2_9_2_phi_3_medium,HFv2 IFEval,42.48,,hf_open_llm_v2_240829.csv dolphin_2_9_2_phi_3_medium,HFv2 MMLU Pro,39.5,,hf_open_llm_v2_240829.csv dolphin_2_9_2_phi_3_medium,HFv2 Math Level 5,0.53,,hf_open_llm_v2_240829.csv dolphin_2_9_2_phi_3_medium,HFv2 MuSR,11.41,,hf_open_llm_v2_240829.csv dolphin_2_9_2_qwen2_72b,HF OpenLLM v2,32.0,,hf_open_llm_v2_240829.csv dolphin_2_9_2_qwen2_72b,HFv2 BBH,47.7,,hf_open_llm_v2_240829.csv dolphin_2_9_2_qwen2_72b,HFv2 GPQA,16.0,,hf_open_llm_v2_240829.csv dolphin_2_9_2_qwen2_72b,HFv2 IFEval,40.38,,hf_open_llm_v2_240829.csv dolphin_2_9_2_qwen2_72b,HFv2 MMLU Pro,49.52,,hf_open_llm_v2_240829.csv dolphin_2_9_2_qwen2_72b,HFv2 Math Level 5,21.37,,hf_open_llm_v2_240829.csv dolphin_2_9_2_qwen2_72b,HFv2 MuSR,17.04,,hf_open_llm_v2_240829.csv dolphin_2_9_2_qwen2_7b,HF OpenLLM v2,20.96,,hf_open_llm_v2_240829.csv dolphin_2_9_2_qwen2_7b,HFv2 BBH,27.91,,hf_open_llm_v2_240829.csv dolphin_2_9_2_qwen2_7b,HFv2 GPQA,5.37,,hf_open_llm_v2_240829.csv dolphin_2_9_2_qwen2_7b,HFv2 IFEval,35.35,,hf_open_llm_v2_240829.csv dolphin_2_9_2_qwen2_7b,HFv2 MMLU Pro,33.9,,hf_open_llm_v2_240829.csv dolphin_2_9_2_qwen2_7b,HFv2 Math Level 5,11.56,,hf_open_llm_v2_240829.csv dolphin_2_9_2_qwen2_7b,HFv2 MuSR,11.66,,hf_open_llm_v2_240829.csv dolphin_2_9_3_mistral_7b_32k,HF OpenLLM v2,19.31,,hf_open_llm_v2_240829.csv dolphin_2_9_3_mistral_7b_32k,HFv2 BBH,26.91,,hf_open_llm_v2_240829.csv dolphin_2_9_3_mistral_7b_32k,HFv2 GPQA,4.7,,hf_open_llm_v2_240829.csv dolphin_2_9_3_mistral_7b_32k,HFv2 IFEval,41.26,,hf_open_llm_v2_240829.csv dolphin_2_9_3_mistral_7b_32k,HFv2 MMLU Pro,20.23,,hf_open_llm_v2_240829.csv dolphin_2_9_3_mistral_7b_32k,HFv2 Math Level 5,4.83,,hf_open_llm_v2_240829.csv dolphin_2_9_3_mistral_7b_32k,HFv2 MuSR,17.93,,hf_open_llm_v2_240829.csv dolphin_2_9_llama3_8b,HF OpenLLM v2,18.3,,hf_open_llm_v2_240829.csv dolphin_2_9_llama3_8b,HFv2 BBH,27.86,,hf_open_llm_v2_240829.csv dolphin_2_9_llama3_8b,HFv2 GPQA,4.92,,hf_open_llm_v2_240829.csv dolphin_2_9_llama3_8b,HFv2 IFEval,38.5,,hf_open_llm_v2_240829.csv dolphin_2_9_llama3_8b,HFv2 MMLU Pro,19.68,,hf_open_llm_v2_240829.csv dolphin_2_9_llama3_8b,HFv2 Math Level 5,5.06,,hf_open_llm_v2_240829.csv dolphin_2_9_llama3_8b,HFv2 MuSR,13.79,,hf_open_llm_v2_240829.csv einstein_v4_7b,HF OpenLLM v2,16.73,,hf_open_llm_v2_240829.csv einstein_v4_7b,HFv2 BBH,14.3,,hf_open_llm_v2_240829.csv einstein_v4_7b,HFv2 GPQA,4.25,,hf_open_llm_v2_240829.csv einstein_v4_7b,HFv2 IFEval,47.08,,hf_open_llm_v2_240829.csv einstein_v4_7b,HFv2 MMLU Pro,13.99,,hf_open_llm_v2_240829.csv einstein_v4_7b,HFv2 Math Level 5,1.74,,hf_open_llm_v2_240829.csv einstein_v4_7b,HFv2 MuSR,19.02,,hf_open_llm_v2_240829.csv einstein_v6_1_developed_by_weyaxi_llama3_8b,HF OpenLLM v2,19.05,,hf_open_llm_v2_240829.csv einstein_v6_1_developed_by_weyaxi_llama3_8b,HFv2 BBH,29.69,,hf_open_llm_v2_240829.csv einstein_v6_1_developed_by_weyaxi_llama3_8b,HFv2 GPQA,3.13,,hf_open_llm_v2_240829.csv einstein_v6_1_developed_by_weyaxi_llama3_8b,HFv2 
IFEval,39.27,,hf_open_llm_v2_240829.csv einstein_v6_1_developed_by_weyaxi_llama3_8b,HFv2 MMLU Pro,23.25,,hf_open_llm_v2_240829.csv einstein_v6_1_developed_by_weyaxi_llama3_8b,HFv2 Math Level 5,5.59,,hf_open_llm_v2_240829.csv einstein_v6_1_developed_by_weyaxi_llama3_8b,HFv2 MuSR,13.39,,hf_open_llm_v2_240829.csv einstein_v6_1_llama3_8b,HF OpenLLM v2,19.99,,hf_open_llm_v2_240829.csv einstein_v6_1_llama3_8b,HFv2 BBH,29.38,,hf_open_llm_v2_240829.csv einstein_v6_1_llama3_8b,HFv2 GPQA,4.25,,hf_open_llm_v2_240829.csv einstein_v6_1_llama3_8b,HFv2 IFEval,45.68,,hf_open_llm_v2_240829.csv einstein_v6_1_llama3_8b,HFv2 MMLU Pro,23.68,,hf_open_llm_v2_240829.csv einstein_v6_1_llama3_8b,HFv2 Math Level 5,5.74,,hf_open_llm_v2_240829.csv einstein_v6_1_llama3_8b,HFv2 MuSR,11.23,,hf_open_llm_v2_240829.csv ende_chat_0_0_7,HF OpenLLM v2,13.08,,hf_open_llm_v2_240829.csv ende_chat_0_0_7,HFv2 BBH,13.58,,hf_open_llm_v2_240829.csv ende_chat_0_0_7,HFv2 GPQA,4.14,,hf_open_llm_v2_240829.csv ende_chat_0_0_7,HFv2 IFEval,44.01,,hf_open_llm_v2_240829.csv ende_chat_0_0_7,HFv2 MMLU Pro,10.74,,hf_open_llm_v2_240829.csv ende_chat_0_0_7,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv ende_chat_0_0_7,HFv2 MuSR,6.03,,hf_open_llm_v2_240829.csv ennoai_pro_french_llama3_8b_v0_4,HF OpenLLM v2,15.18,,hf_open_llm_v2_240829.csv ennoai_pro_french_llama3_8b_v0_4,HFv2 BBH,16.88,,hf_open_llm_v2_240829.csv ennoai_pro_french_llama3_8b_v0_4,HFv2 GPQA,2.8,,hf_open_llm_v2_240829.csv ennoai_pro_french_llama3_8b_v0_4,HFv2 IFEval,41.89,,hf_open_llm_v2_240829.csv ennoai_pro_french_llama3_8b_v0_4,HFv2 MMLU Pro,18.16,,hf_open_llm_v2_240829.csv ennoai_pro_french_llama3_8b_v0_4,HFv2 Math Level 5,0.6,,hf_open_llm_v2_240829.csv ennoai_pro_french_llama3_8b_v0_4,HFv2 MuSR,10.76,,hf_open_llm_v2_240829.csv ennoai_pro_llama3_8b,HF OpenLLM v2,12.17,,hf_open_llm_v2_240829.csv ennoai_pro_llama3_8b,HFv2 BBH,17.51,,hf_open_llm_v2_240829.csv ennoai_pro_llama3_8b,HFv2 GPQA,1.57,,hf_open_llm_v2_240829.csv ennoai_pro_llama3_8b,HFv2 IFEval,31.95,,hf_open_llm_v2_240829.csv ennoai_pro_llama3_8b,HFv2 MMLU Pro,12.79,,hf_open_llm_v2_240829.csv ennoai_pro_llama3_8b,HFv2 Math Level 5,0.15,,hf_open_llm_v2_240829.csv ennoai_pro_llama3_8b,HFv2 MuSR,9.08,,hf_open_llm_v2_240829.csv exaone_3_0_7_8b_instruct,HF OpenLLM v2,21.4,,hf_open_llm_v2_240829.csv exaone_3_0_7_8b_instruct,HFv2 BBH,17.98,,hf_open_llm_v2_240829.csv exaone_3_0_7_8b_instruct,HFv2 GPQA,2.13,,hf_open_llm_v2_240829.csv exaone_3_0_7_8b_instruct,HFv2 IFEval,71.93,,hf_open_llm_v2_240829.csv exaone_3_0_7_8b_instruct,HFv2 MMLU Pro,28.63,,hf_open_llm_v2_240829.csv exaone_3_0_7_8b_instruct,HFv2 Math Level 5,4.46,,hf_open_llm_v2_240829.csv exaone_3_0_7_8b_instruct,HFv2 MuSR,3.3,,hf_open_llm_v2_240829.csv falcon_11b,HF OpenLLM v2,13.78,,hf_open_llm_v2_240829.csv falcon_11b,HFv2 BBH,21.94,,hf_open_llm_v2_240829.csv falcon_11b,HFv2 GPQA,2.8,,hf_open_llm_v2_240829.csv falcon_11b,HFv2 IFEval,32.61,,hf_open_llm_v2_240829.csv falcon_11b,HFv2 MMLU Pro,15.44,,hf_open_llm_v2_240829.csv falcon_11b,HFv2 Math Level 5,2.34,,hf_open_llm_v2_240829.csv falcon_11b,HFv2 MuSR,7.53,,hf_open_llm_v2_240829.csv falcon_40b,HF OpenLLM v2,11.33,,hf_open_llm_v2_240829.csv falcon_40b,HFv2 BBH,16.58,,hf_open_llm_v2_240829.csv falcon_40b,HFv2 GPQA,3.13,,hf_open_llm_v2_240829.csv falcon_40b,HFv2 IFEval,24.96,,hf_open_llm_v2_240829.csv falcon_40b,HFv2 MMLU Pro,16.72,,hf_open_llm_v2_240829.csv falcon_40b,HFv2 Math Level 5,1.36,,hf_open_llm_v2_240829.csv falcon_40b,HFv2 MuSR,5.19,,hf_open_llm_v2_240829.csv falcon_40b_instruct,HF OpenLLM 
v2,10.41,,hf_open_llm_v2_240829.csv falcon_40b_instruct,HFv2 BBH,17.22,,hf_open_llm_v2_240829.csv falcon_40b_instruct,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv falcon_40b_instruct,HFv2 IFEval,24.54,,hf_open_llm_v2_240829.csv falcon_40b_instruct,HFv2 MMLU Pro,14.02,,hf_open_llm_v2_240829.csv falcon_40b_instruct,HFv2 Math Level 5,1.51,,hf_open_llm_v2_240829.csv falcon_40b_instruct,HFv2 MuSR,5.16,,hf_open_llm_v2_240829.csv falcon_7b,HF OpenLLM v2,5.1,,hf_open_llm_v2_240829.csv falcon_7b,HFv2 BBH,5.96,,hf_open_llm_v2_240829.csv falcon_7b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv falcon_7b,HFv2 IFEval,18.21,,hf_open_llm_v2_240829.csv falcon_7b,HFv2 MMLU Pro,1.39,,hf_open_llm_v2_240829.csv falcon_7b,HFv2 Math Level 5,0.53,,hf_open_llm_v2_240829.csv falcon_7b,HFv2 MuSR,4.5,,hf_open_llm_v2_240829.csv falcon_7b_instruct,HF OpenLLM v2,5.02,,hf_open_llm_v2_240829.csv falcon_7b_instruct,HFv2 BBH,4.82,,hf_open_llm_v2_240829.csv falcon_7b_instruct,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv falcon_7b_instruct,HFv2 IFEval,19.69,,hf_open_llm_v2_240829.csv falcon_7b_instruct,HFv2 MMLU Pro,1.73,,hf_open_llm_v2_240829.csv falcon_7b_instruct,HFv2 Math Level 5,0.6,,hf_open_llm_v2_240829.csv falcon_7b_instruct,HFv2 MuSR,3.25,,hf_open_llm_v2_240829.csv falcon_mamba_7b,HF OpenLLM v2,15.04,,hf_open_llm_v2_240829.csv falcon_mamba_7b,HFv2 BBH,19.88,,hf_open_llm_v2_240829.csv falcon_mamba_7b,HFv2 GPQA,8.05,,hf_open_llm_v2_240829.csv falcon_mamba_7b,HFv2 IFEval,33.36,,hf_open_llm_v2_240829.csv falcon_mamba_7b,HFv2 MMLU Pro,14.47,,hf_open_llm_v2_240829.csv falcon_mamba_7b,HFv2 Math Level 5,3.63,,hf_open_llm_v2_240829.csv falcon_mamba_7b,HFv2 MuSR,10.86,,hf_open_llm_v2_240829.csv flan_flan-ul2,HF OpenLLM v2,13.55,,hf_open_llm_v2_240829.csv flan_flan-ul2,HFv2 BBH,30.02,,hf_open_llm_v2_240829.csv flan_flan-ul2,HFv2 GPQA,5.03,,hf_open_llm_v2_240829.csv flan_flan-ul2,HFv2 IFEval,23.93,,hf_open_llm_v2_240829.csv flan_flan-ul2,HFv2 MMLU Pro,16.59,,hf_open_llm_v2_240829.csv flan_flan-ul2,HFv2 Math Level 5,0.15,,hf_open_llm_v2_240829.csv flan_flan-ul2,HFv2 MuSR,5.58,,hf_open_llm_v2_240829.csv flan_t5_base,HF OpenLLM v2,6.24,,hf_open_llm_v2_240829.csv flan_t5_base,HFv2 BBH,11.34,,hf_open_llm_v2_240829.csv flan_t5_base,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv flan_t5_base,HFv2 IFEval,18.91,,hf_open_llm_v2_240829.csv flan_t5_base,HFv2 MMLU Pro,3.97,,hf_open_llm_v2_240829.csv flan_t5_base,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv flan_t5_base,HFv2 MuSR,3.22,,hf_open_llm_v2_240829.csv flan_t5_large,HF OpenLLM v2,9.42,,hf_open_llm_v2_240829.csv flan_t5_large,HFv2 BBH,17.51,,hf_open_llm_v2_240829.csv flan_t5_large,HFv2 GPQA,0.11,,hf_open_llm_v2_240829.csv flan_t5_large,HFv2 IFEval,22.01,,hf_open_llm_v2_240829.csv flan_t5_large,HFv2 MMLU Pro,7.88,,hf_open_llm_v2_240829.csv flan_t5_large,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv flan_t5_large,HFv2 MuSR,9.01,,hf_open_llm_v2_240829.csv flan_t5_small,HF OpenLLM v2,6.0,,hf_open_llm_v2_240829.csv flan_t5_small,HFv2 BBH,6.36,,hf_open_llm_v2_240829.csv flan_t5_small,HFv2 GPQA,1.45,,hf_open_llm_v2_240829.csv flan_t5_small,HFv2 IFEval,15.24,,hf_open_llm_v2_240829.csv flan_t5_small,HFv2 MMLU Pro,2.59,,hf_open_llm_v2_240829.csv flan_t5_small,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv flan_t5_small,HFv2 MuSR,10.37,,hf_open_llm_v2_240829.csv flan_t5_xl,HF OpenLLM v2,11.59,,hf_open_llm_v2_240829.csv flan_t5_xl,HFv2 BBH,22.84,,hf_open_llm_v2_240829.csv flan_t5_xl,HFv2 GPQA,0.34,,hf_open_llm_v2_240829.csv flan_t5_xl,HFv2 IFEval,22.37,,hf_open_llm_v2_240829.csv flan_t5_xl,HFv2 MMLU 
Pro,12.74,,hf_open_llm_v2_240829.csv flan_t5_xl,HFv2 Math Level 5,0.08,,hf_open_llm_v2_240829.csv flan_t5_xl,HFv2 MuSR,11.85,,hf_open_llm_v2_240829.csv flan_t5_xxl,HF OpenLLM v2,13.49,,hf_open_llm_v2_240829.csv flan_t5_xxl,HFv2 BBH,30.12,,hf_open_llm_v2_240829.csv flan_t5_xxl,HFv2 GPQA,2.68,,hf_open_llm_v2_240829.csv flan_t5_xxl,HFv2 IFEval,22.0,,hf_open_llm_v2_240829.csv flan_t5_xxl,HFv2 MMLU Pro,14.92,,hf_open_llm_v2_240829.csv flan_t5_xxl,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv flan_t5_xxl,HFv2 MuSR,11.19,,hf_open_llm_v2_240829.csv fox_1_1_6b,HF OpenLLM v2,7.69,,hf_open_llm_v2_240829.csv fox_1_1_6b,HFv2 BBH,7.4,,hf_open_llm_v2_240829.csv fox_1_1_6b,HFv2 GPQA,1.79,,hf_open_llm_v2_240829.csv fox_1_1_6b,HFv2 IFEval,27.66,,hf_open_llm_v2_240829.csv fox_1_1_6b,HFv2 MMLU Pro,4.13,,hf_open_llm_v2_240829.csv fox_1_1_6b,HFv2 Math Level 5,1.28,,hf_open_llm_v2_240829.csv fox_1_1_6b,HFv2 MuSR,3.87,,hf_open_llm_v2_240829.csv gemma_1_1_2b_it,HF OpenLLM v2,7.78,,hf_open_llm_v2_240829.csv gemma_1_1_2b_it,HFv2 BBH,5.86,,hf_open_llm_v2_240829.csv gemma_1_1_2b_it,HFv2 GPQA,2.57,,hf_open_llm_v2_240829.csv gemma_1_1_2b_it,HFv2 IFEval,30.67,,hf_open_llm_v2_240829.csv gemma_1_1_2b_it,HFv2 MMLU Pro,5.37,,hf_open_llm_v2_240829.csv gemma_1_1_2b_it,HFv2 Math Level 5,0.15,,hf_open_llm_v2_240829.csv gemma_1_1_2b_it,HFv2 MuSR,2.02,,hf_open_llm_v2_240829.csv gemma_1_1_7b_it,HF OpenLLM v2,17.4,,hf_open_llm_v2_240829.csv gemma_1_1_7b_it,HFv2 BBH,15.93,,hf_open_llm_v2_240829.csv gemma_1_1_7b_it,HFv2 GPQA,5.82,,hf_open_llm_v2_240829.csv gemma_1_1_7b_it,HFv2 IFEval,50.39,,hf_open_llm_v2_240829.csv gemma_1_1_7b_it,HFv2 MMLU Pro,17.6,,hf_open_llm_v2_240829.csv gemma_1_1_7b_it,HFv2 Math Level 5,3.17,,hf_open_llm_v2_240829.csv gemma_1_1_7b_it,HFv2 MuSR,11.51,,hf_open_llm_v2_240829.csv gemma_2_27b,HF OpenLLM v2,23.64,,hf_open_llm_v2_240829.csv gemma_2_27b,HFv2 BBH,37.39,,hf_open_llm_v2_240829.csv gemma_2_27b,HFv2 GPQA,13.42,,hf_open_llm_v2_240829.csv gemma_2_27b,HFv2 IFEval,24.75,,hf_open_llm_v2_240829.csv gemma_2_27b,HFv2 MMLU Pro,37.45,,hf_open_llm_v2_240829.csv gemma_2_27b,HFv2 Math Level 5,14.88,,hf_open_llm_v2_240829.csv gemma_2_27b,HFv2 MuSR,13.92,,hf_open_llm_v2_240829.csv gemma_2_27b_it,HF OpenLLM v2,32.31,,hf_open_llm_v2_240829.csv gemma_2_27b_it,HFv2 BBH,49.27,,hf_open_llm_v2_240829.csv gemma_2_27b_it,HFv2 GPQA,16.67,,hf_open_llm_v2_240829.csv gemma_2_27b_it,HFv2 IFEval,79.78,,hf_open_llm_v2_240829.csv gemma_2_27b_it,HFv2 MMLU Pro,38.35,,hf_open_llm_v2_240829.csv gemma_2_27b_it,HFv2 Math Level 5,0.68,,hf_open_llm_v2_240829.csv gemma_2_27b_it,HFv2 MuSR,9.11,,hf_open_llm_v2_240829.csv gemma_2_2b,HF OpenLLM v2,10.13,,hf_open_llm_v2_240829.csv gemma_2_2b,HFv2 BBH,11.76,,hf_open_llm_v2_240829.csv gemma_2_2b,HFv2 GPQA,1.68,,hf_open_llm_v2_240829.csv gemma_2_2b,HFv2 IFEval,19.93,,hf_open_llm_v2_240829.csv gemma_2_2b,HFv2 MMLU Pro,13.11,,hf_open_llm_v2_240829.csv gemma_2_2b,HFv2 Math Level 5,2.87,,hf_open_llm_v2_240829.csv gemma_2_2b,HFv2 MuSR,11.43,,hf_open_llm_v2_240829.csv gemma_2_9b,HF OpenLLM v2,20.93,,hf_open_llm_v2_240829.csv gemma_2_9b,HFv2 BBH,34.1,,hf_open_llm_v2_240829.csv gemma_2_9b,HFv2 GPQA,10.51,,hf_open_llm_v2_240829.csv gemma_2_9b,HFv2 IFEval,20.4,,hf_open_llm_v2_240829.csv gemma_2_9b,HFv2 MMLU Pro,34.48,,hf_open_llm_v2_240829.csv gemma_2_9b,HFv2 Math Level 5,11.78,,hf_open_llm_v2_240829.csv gemma_2_9b,HFv2 MuSR,14.3,,hf_open_llm_v2_240829.csv gemma_2_9b_it,HF OpenLLM v2,28.86,,hf_open_llm_v2_240829.csv gemma_2_9b_it,HFv2 BBH,42.14,,hf_open_llm_v2_240829.csv gemma_2_9b_it,HFv2 
GPQA,14.77,,hf_open_llm_v2_240829.csv gemma_2_9b_it,HFv2 IFEval,74.36,,hf_open_llm_v2_240829.csv gemma_2_9b_it,HFv2 MMLU Pro,31.95,,hf_open_llm_v2_240829.csv gemma_2_9b_it,HFv2 Math Level 5,0.23,,hf_open_llm_v2_240829.csv gemma_2_9b_it,HFv2 MuSR,9.74,,hf_open_llm_v2_240829.csv gemma_2_9b_it_simpo,HF OpenLLM v2,21.16,,hf_open_llm_v2_240829.csv gemma_2_9b_it_simpo,HFv2 BBH,40.09,,hf_open_llm_v2_240829.csv gemma_2_9b_it_simpo,HFv2 GPQA,11.41,,hf_open_llm_v2_240829.csv gemma_2_9b_it_simpo,HFv2 IFEval,32.07,,hf_open_llm_v2_240829.csv gemma_2_9b_it_simpo,HFv2 MMLU Pro,33.06,,hf_open_llm_v2_240829.csv gemma_2_9b_it_simpo,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv gemma_2_9b_it_simpo,HFv2 MuSR,10.34,,hf_open_llm_v2_240829.csv gemma_2_9b_it_sppo_iter2,HF OpenLLM v2,21.22,,hf_open_llm_v2_240829.csv gemma_2_9b_it_sppo_iter2,HFv2 BBH,42.17,,hf_open_llm_v2_240829.csv gemma_2_9b_it_sppo_iter2,HFv2 GPQA,11.3,,hf_open_llm_v2_240829.csv gemma_2_9b_it_sppo_iter2,HFv2 IFEval,31.0,,hf_open_llm_v2_240829.csv gemma_2_9b_it_sppo_iter2,HFv2 MMLU Pro,31.89,,hf_open_llm_v2_240829.csv gemma_2_9b_it_sppo_iter2,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv gemma_2_9b_it_sppo_iter2,HFv2 MuSR,10.94,,hf_open_llm_v2_240829.csv gemma_2_9b_it_sppo_iter_1,HF OpenLLM v2,20.55,,hf_open_llm_v2_240829.csv gemma_2_9b_it_sppo_iter_1,HFv2 BBH,41.68,,hf_open_llm_v2_240829.csv gemma_2_9b_it_sppo_iter_1,HFv2 GPQA,12.64,,hf_open_llm_v2_240829.csv gemma_2_9b_it_sppo_iter_1,HFv2 IFEval,30.15,,hf_open_llm_v2_240829.csv gemma_2_9b_it_sppo_iter_1,HFv2 MMLU Pro,31.71,,hf_open_llm_v2_240829.csv gemma_2_9b_it_sppo_iter_1,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv gemma_2_9b_it_sppo_iter_1,HFv2 MuSR,7.15,,hf_open_llm_v2_240829.csv gemma_2_9b_it_sppo_iter_1_evol_1,HF OpenLLM v2,20.1,,hf_open_llm_v2_240829.csv gemma_2_9b_it_sppo_iter_1_evol_1,HFv2 BBH,41.1,,hf_open_llm_v2_240829.csv gemma_2_9b_it_sppo_iter_1_evol_1,HFv2 GPQA,12.08,,hf_open_llm_v2_240829.csv gemma_2_9b_it_sppo_iter_1_evol_1,HFv2 IFEval,29.42,,hf_open_llm_v2_240829.csv gemma_2_9b_it_sppo_iter_1_evol_1,HFv2 MMLU Pro,31.11,,hf_open_llm_v2_240829.csv gemma_2_9b_it_sppo_iter_1_evol_1,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv gemma_2_9b_it_sppo_iter_1_evol_1,HFv2 MuSR,6.9,,hf_open_llm_v2_240829.csv gemma_2b,HF OpenLLM v2,7.31,,hf_open_llm_v2_240829.csv gemma_2b,HFv2 BBH,8.47,,hf_open_llm_v2_240829.csv gemma_2b,HFv2 GPQA,0.67,,hf_open_llm_v2_240829.csv gemma_2b,HFv2 IFEval,20.38,,hf_open_llm_v2_240829.csv gemma_2b,HFv2 MMLU Pro,4.06,,hf_open_llm_v2_240829.csv gemma_2b,HFv2 Math Level 5,2.72,,hf_open_llm_v2_240829.csv gemma_2b,HFv2 MuSR,7.56,,hf_open_llm_v2_240829.csv gemma_2b_it,HF OpenLLM v2,7.22,,hf_open_llm_v2_240829.csv gemma_2b_it,HFv2 BBH,5.21,,hf_open_llm_v2_240829.csv gemma_2b_it,HFv2 GPQA,3.8,,hf_open_llm_v2_240829.csv gemma_2b_it,HFv2 IFEval,26.9,,hf_open_llm_v2_240829.csv gemma_2b_it,HFv2 MMLU Pro,3.92,,hf_open_llm_v2_240829.csv gemma_2b_it,HFv2 Math Level 5,0.45,,hf_open_llm_v2_240829.csv gemma_2b_it,HFv2 MuSR,3.03,,hf_open_llm_v2_240829.csv gemma_2b_orpo,HF OpenLLM v2,7.17,,hf_open_llm_v2_240829.csv gemma_2b_orpo,HFv2 BBH,7.95,,hf_open_llm_v2_240829.csv gemma_2b_orpo,HFv2 GPQA,1.57,,hf_open_llm_v2_240829.csv gemma_2b_orpo,HFv2 IFEval,24.78,,hf_open_llm_v2_240829.csv gemma_2b_orpo,HFv2 MMLU Pro,3.4,,hf_open_llm_v2_240829.csv gemma_2b_orpo,HFv2 Math Level 5,1.21,,hf_open_llm_v2_240829.csv gemma_2b_orpo,HFv2 MuSR,4.13,,hf_open_llm_v2_240829.csv gemma_7b,HF OpenLLM v2,15.28,,hf_open_llm_v2_240829.csv gemma_7b,HFv2 BBH,21.12,,hf_open_llm_v2_240829.csv 
gemma_7b,HFv2 GPQA,4.92,,hf_open_llm_v2_240829.csv gemma_7b,HFv2 IFEval,26.59,,hf_open_llm_v2_240829.csv gemma_7b,HFv2 MMLU Pro,21.64,,hf_open_llm_v2_240829.csv gemma_7b,HFv2 Math Level 5,6.42,,hf_open_llm_v2_240829.csv gemma_7b,HFv2 MuSR,10.98,,hf_open_llm_v2_240829.csv gemma_7b_it,HF OpenLLM v2,12.83,,hf_open_llm_v2_240829.csv gemma_7b_it,HFv2 BBH,11.88,,hf_open_llm_v2_240829.csv gemma_7b_it,HFv2 GPQA,4.59,,hf_open_llm_v2_240829.csv gemma_7b_it,HFv2 IFEval,38.68,,hf_open_llm_v2_240829.csv gemma_7b_it,HFv2 MMLU Pro,7.72,,hf_open_llm_v2_240829.csv gemma_7b_it,HFv2 Math Level 5,1.59,,hf_open_llm_v2_240829.csv gemma_7b_it,HFv2 MuSR,12.53,,hf_open_llm_v2_240829.csv glm_4_9b,HF OpenLLM v2,18.01,,hf_open_llm_v2_240829.csv glm_4_9b,HFv2 BBH,35.81,,hf_open_llm_v2_240829.csv glm_4_9b,HFv2 GPQA,8.84,,hf_open_llm_v2_240829.csv glm_4_9b,HFv2 IFEval,14.26,,hf_open_llm_v2_240829.csv glm_4_9b,HFv2 MMLU Pro,34.94,,hf_open_llm_v2_240829.csv glm_4_9b,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv glm_4_9b,HFv2 MuSR,14.19,,hf_open_llm_v2_240829.csv glm_4_9b_chat,HF OpenLLM v2,10.97,,hf_open_llm_v2_240829.csv glm_4_9b_chat,HFv2 BBH,25.21,,hf_open_llm_v2_240829.csv glm_4_9b_chat,HFv2 GPQA,8.5,,hf_open_llm_v2_240829.csv glm_4_9b_chat,HFv2 IFEval,0.0,,hf_open_llm_v2_240829.csv glm_4_9b_chat,HFv2 MMLU Pro,24.07,,hf_open_llm_v2_240829.csv glm_4_9b_chat,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv glm_4_9b_chat,HFv2 MuSR,8.06,,hf_open_llm_v2_240829.csv go_bruins_v2,HF OpenLLM v2,15.27,,hf_open_llm_v2_240829.csv go_bruins_v2,HFv2 BBH,12.69,,hf_open_llm_v2_240829.csv go_bruins_v2,HFv2 GPQA,1.68,,hf_open_llm_v2_240829.csv go_bruins_v2,HFv2 IFEval,40.96,,hf_open_llm_v2_240829.csv go_bruins_v2,HFv2 MMLU Pro,19.57,,hf_open_llm_v2_240829.csv go_bruins_v2,HFv2 Math Level 5,5.74,,hf_open_llm_v2_240829.csv go_bruins_v2,HFv2 MuSR,10.99,,hf_open_llm_v2_240829.csv gpt2,HF OpenLLM v2,6.54,,hf_open_llm_v2_240829.csv gpt2,HFv2 BBH,9.2,,hf_open_llm_v2_240829.csv gpt2,HFv2 GPQA,1.12,,hf_open_llm_v2_240829.csv gpt2,HFv2 IFEval,18.08,,hf_open_llm_v2_240829.csv gpt2,HFv2 MMLU Pro,1.84,,hf_open_llm_v2_240829.csv gpt2,HFv2 Math Level 5,0.3,,hf_open_llm_v2_240829.csv gpt2,HFv2 MuSR,18.33,,hf_open_llm_v2_240829.csv gpt2_large,HF OpenLLM v2,5.48,,hf_open_llm_v2_240829.csv gpt2_large,HFv2 BBH,3.25,,hf_open_llm_v2_240829.csv gpt2_large,HFv2 GPQA,1.23,,hf_open_llm_v2_240829.csv gpt2_large,HFv2 IFEval,20.48,,hf_open_llm_v2_240829.csv gpt2_large,HFv2 MMLU Pro,1.58,,hf_open_llm_v2_240829.csv gpt2_large,HFv2 Math Level 5,0.68,,hf_open_llm_v2_240829.csv gpt2_large,HFv2 MuSR,5.66,,hf_open_llm_v2_240829.csv gpt2_medium,HF OpenLLM v2,5.81,,hf_open_llm_v2_240829.csv gpt2_medium,HFv2 BBH,2.72,,hf_open_llm_v2_240829.csv gpt2_medium,HFv2 GPQA,1.68,,hf_open_llm_v2_240829.csv gpt2_medium,HFv2 IFEval,22.08,,hf_open_llm_v2_240829.csv gpt2_medium,HFv2 MMLU Pro,2.02,,hf_open_llm_v2_240829.csv gpt2_medium,HFv2 Math Level 5,0.23,,hf_open_llm_v2_240829.csv gpt2_medium,HFv2 MuSR,6.16,,hf_open_llm_v2_240829.csv gpt2_xl,HF OpenLLM v2,4.98,,hf_open_llm_v2_240829.csv gpt2_xl,HFv2 BBH,2.58,,hf_open_llm_v2_240829.csv gpt2_xl,HFv2 GPQA,1.12,,hf_open_llm_v2_240829.csv gpt2_xl,HFv2 IFEval,20.39,,hf_open_llm_v2_240829.csv gpt2_xl,HFv2 MMLU Pro,1.46,,hf_open_llm_v2_240829.csv gpt2_xl,HFv2 Math Level 5,0.3,,hf_open_llm_v2_240829.csv gpt2_xl,HFv2 MuSR,4.04,,hf_open_llm_v2_240829.csv gpt_j_6b,HF OpenLLM v2,6.55,,hf_open_llm_v2_240829.csv gpt_j_6b,HFv2 BBH,4.91,,hf_open_llm_v2_240829.csv gpt_j_6b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv gpt_j_6b,HFv2 
IFEval,25.22,,hf_open_llm_v2_240829.csv gpt_j_6b,HFv2 MMLU Pro,2.68,,hf_open_llm_v2_240829.csv gpt_j_6b,HFv2 Math Level 5,1.21,,hf_open_llm_v2_240829.csv gpt_j_6b,HFv2 MuSR,5.25,,hf_open_llm_v2_240829.csv gpt_neo_125m,HF OpenLLM v2,4.38,,hf_open_llm_v2_240829.csv gpt_neo_125m,HFv2 BBH,3.44,,hf_open_llm_v2_240829.csv gpt_neo_125m,HFv2 GPQA,0.45,,hf_open_llm_v2_240829.csv gpt_neo_125m,HFv2 IFEval,19.05,,hf_open_llm_v2_240829.csv gpt_neo_125m,HFv2 MMLU Pro,0.28,,hf_open_llm_v2_240829.csv gpt_neo_125m,HFv2 Math Level 5,0.45,,hf_open_llm_v2_240829.csv gpt_neo_125m,HFv2 MuSR,2.62,,hf_open_llm_v2_240829.csv gpt_neo_1_3b,HF OpenLLM v2,5.33,,hf_open_llm_v2_240829.csv gpt_neo_1_3b,HFv2 BBH,3.02,,hf_open_llm_v2_240829.csv gpt_neo_1_3b,HFv2 GPQA,0.78,,hf_open_llm_v2_240829.csv gpt_neo_1_3b,HFv2 IFEval,20.79,,hf_open_llm_v2_240829.csv gpt_neo_1_3b,HFv2 MMLU Pro,1.82,,hf_open_llm_v2_240829.csv gpt_neo_1_3b,HFv2 Math Level 5,0.68,,hf_open_llm_v2_240829.csv gpt_neo_1_3b,HFv2 MuSR,4.87,,hf_open_llm_v2_240829.csv gpt_neo_2_7b,HF OpenLLM v2,6.34,,hf_open_llm_v2_240829.csv gpt_neo_2_7b,HFv2 BBH,4.18,,hf_open_llm_v2_240829.csv gpt_neo_2_7b,HFv2 GPQA,2.13,,hf_open_llm_v2_240829.csv gpt_neo_2_7b,HFv2 IFEval,25.9,,hf_open_llm_v2_240829.csv gpt_neo_2_7b,HFv2 MMLU Pro,1.81,,hf_open_llm_v2_240829.csv gpt_neo_2_7b,HFv2 Math Level 5,0.53,,hf_open_llm_v2_240829.csv gpt_neo_2_7b,HFv2 MuSR,3.52,,hf_open_llm_v2_240829.csv gpt_neox_20b,HF OpenLLM v2,5.99,,hf_open_llm_v2_240829.csv gpt_neox_20b,HFv2 BBH,4.93,,hf_open_llm_v2_240829.csv gpt_neox_20b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv gpt_neox_20b,HFv2 IFEval,25.87,,hf_open_llm_v2_240829.csv gpt_neox_20b,HFv2 MMLU Pro,1.73,,hf_open_llm_v2_240829.csv gpt_neox_20b,HFv2 Math Level 5,0.6,,hf_open_llm_v2_240829.csv gpt_neox_20b,HFv2 MuSR,2.82,,hf_open_llm_v2_240829.csv gpt_sw3_40b,HF OpenLLM v2,4.68,,hf_open_llm_v2_240829.csv gpt_sw3_40b,HFv2 BBH,6.89,,hf_open_llm_v2_240829.csv gpt_sw3_40b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv gpt_sw3_40b,HFv2 IFEval,14.7,,hf_open_llm_v2_240829.csv gpt_sw3_40b,HFv2 MMLU Pro,3.06,,hf_open_llm_v2_240829.csv gpt_sw3_40b,HFv2 Math Level 5,0.6,,hf_open_llm_v2_240829.csv gpt_sw3_40b,HFv2 MuSR,2.84,,hf_open_llm_v2_240829.csv granite_7b_base,HF OpenLLM v2,7.75,,hf_open_llm_v2_240829.csv granite_7b_base,HFv2 BBH,9.05,,hf_open_llm_v2_240829.csv granite_7b_base,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv granite_7b_base,HFv2 IFEval,24.14,,hf_open_llm_v2_240829.csv granite_7b_base,HFv2 MMLU Pro,9.27,,hf_open_llm_v2_240829.csv granite_7b_base,HFv2 Math Level 5,0.6,,hf_open_llm_v2_240829.csv granite_7b_base,HFv2 MuSR,3.4,,hf_open_llm_v2_240829.csv gritlm_7b_kto,HF OpenLLM v2,19.15,,hf_open_llm_v2_240829.csv gritlm_7b_kto,HFv2 BBH,27.9,,hf_open_llm_v2_240829.csv gritlm_7b_kto,HFv2 GPQA,6.38,,hf_open_llm_v2_240829.csv gritlm_7b_kto,HFv2 IFEval,53.1,,hf_open_llm_v2_240829.csv gritlm_7b_kto,HFv2 MMLU Pro,18.67,,hf_open_llm_v2_240829.csv gritlm_7b_kto,HFv2 Math Level 5,2.19,,hf_open_llm_v2_240829.csv gritlm_7b_kto,HFv2 MuSR,6.64,,hf_open_llm_v2_240829.csv gritlm_8x7b_kto,HF OpenLLM v2,25.62,,hf_open_llm_v2_240829.csv gritlm_8x7b_kto,HFv2 BBH,40.83,,hf_open_llm_v2_240829.csv gritlm_8x7b_kto,HFv2 GPQA,6.15,,hf_open_llm_v2_240829.csv gritlm_8x7b_kto,HFv2 IFEval,57.14,,hf_open_llm_v2_240829.csv gritlm_8x7b_kto,HFv2 MMLU Pro,29.42,,hf_open_llm_v2_240829.csv gritlm_8x7b_kto,HFv2 Math Level 5,8.53,,hf_open_llm_v2_240829.csv gritlm_8x7b_kto,HFv2 MuSR,11.67,,hf_open_llm_v2_240829.csv h2o_danube3_4b_chat,HF OpenLLM v2,11.36,,hf_open_llm_v2_240829.csv 
h2o_danube3_4b_chat,HFv2 BBH,8.84,,hf_open_llm_v2_240829.csv h2o_danube3_4b_chat,HFv2 GPQA,1.34,,hf_open_llm_v2_240829.csv h2o_danube3_4b_chat,HFv2 IFEval,36.29,,hf_open_llm_v2_240829.csv h2o_danube3_4b_chat,HFv2 MMLU Pro,13.65,,hf_open_llm_v2_240829.csv h2o_danube3_4b_chat,HFv2 Math Level 5,2.79,,hf_open_llm_v2_240829.csv h2o_danube3_4b_chat,HFv2 MuSR,5.23,,hf_open_llm_v2_240829.csv hare1_0_beta,HF OpenLLM v2,12.38,,hf_open_llm_v2_240829.csv hare1_0_beta,HFv2 BBH,14.09,,hf_open_llm_v2_240829.csv hare1_0_beta,HFv2 GPQA,0.67,,hf_open_llm_v2_240829.csv hare1_0_beta,HFv2 IFEval,34.71,,hf_open_llm_v2_240829.csv hare1_0_beta,HFv2 MMLU Pro,8.35,,hf_open_llm_v2_240829.csv hare1_0_beta,HFv2 Math Level 5,0.76,,hf_open_llm_v2_240829.csv hare1_0_beta,HFv2 MuSR,15.72,,hf_open_llm_v2_240829.csv hare_1_1b_base,HF OpenLLM v2,1.95,,hf_open_llm_v2_240829.csv hare_1_1b_base,HFv2 BBH,1.72,,hf_open_llm_v2_240829.csv hare_1_1b_base,HFv2 GPQA,1.23,,hf_open_llm_v2_240829.csv hare_1_1b_base,HFv2 IFEval,0.12,,hf_open_llm_v2_240829.csv hare_1_1b_base,HFv2 MMLU Pro,1.04,,hf_open_llm_v2_240829.csv hare_1_1b_base,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv hare_1_1b_base,HFv2 MuSR,7.58,,hf_open_llm_v2_240829.csv hare_1_1b_base_0_5v,HF OpenLLM v2,10.78,,hf_open_llm_v2_240829.csv hare_1_1b_base_0_5v,HFv2 BBH,5.14,,hf_open_llm_v2_240829.csv hare_1_1b_base_0_5v,HFv2 GPQA,2.01,,hf_open_llm_v2_240829.csv hare_1_1b_base_0_5v,HFv2 IFEval,36.33,,hf_open_llm_v2_240829.csv hare_1_1b_base_0_5v,HFv2 MMLU Pro,8.08,,hf_open_llm_v2_240829.csv hare_1_1b_base_0_5v,HFv2 Math Level 5,0.68,,hf_open_llm_v2_240829.csv hare_1_1b_base_0_5v,HFv2 MuSR,12.44,,hf_open_llm_v2_240829.csv hebrew_gemma_11b_instruct,HF OpenLLM v2,13.81,,hf_open_llm_v2_240829.csv hebrew_gemma_11b_instruct,HFv2 BBH,16.86,,hf_open_llm_v2_240829.csv hebrew_gemma_11b_instruct,HFv2 GPQA,3.47,,hf_open_llm_v2_240829.csv hebrew_gemma_11b_instruct,HFv2 IFEval,30.21,,hf_open_llm_v2_240829.csv hebrew_gemma_11b_instruct,HFv2 MMLU Pro,17.27,,hf_open_llm_v2_240829.csv hebrew_gemma_11b_instruct,HFv2 Math Level 5,5.06,,hf_open_llm_v2_240829.csv hebrew_gemma_11b_instruct,HFv2 MuSR,9.97,,hf_open_llm_v2_240829.csv helpingai_15b,HF OpenLLM v2,4.52,,hf_open_llm_v2_240829.csv helpingai_15b,HFv2 BBH,1.82,,hf_open_llm_v2_240829.csv helpingai_15b,HFv2 GPQA,1.01,,hf_open_llm_v2_240829.csv helpingai_15b,HFv2 IFEval,20.3,,hf_open_llm_v2_240829.csv helpingai_15b,HFv2 MMLU Pro,1.24,,hf_open_llm_v2_240829.csv helpingai_15b,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv helpingai_15b,HFv2 MuSR,2.73,,hf_open_llm_v2_240829.csv hermes_2_pro_llama3_8b,HF OpenLLM v2,21.63,,hf_open_llm_v2_240829.csv hermes_2_pro_llama3_8b,HFv2 BBH,30.67,,hf_open_llm_v2_240829.csv hermes_2_pro_llama3_8b,HFv2 GPQA,5.7,,hf_open_llm_v2_240829.csv hermes_2_pro_llama3_8b,HFv2 IFEval,53.62,,hf_open_llm_v2_240829.csv hermes_2_pro_llama3_8b,HFv2 MMLU Pro,22.8,,hf_open_llm_v2_240829.csv hermes_2_pro_llama3_8b,HFv2 Math Level 5,5.74,,hf_open_llm_v2_240829.csv hermes_2_pro_llama3_8b,HFv2 MuSR,11.25,,hf_open_llm_v2_240829.csv hermes_2_pro_mistral_7b,HF OpenLLM v2,21.64,,hf_open_llm_v2_240829.csv hermes_2_pro_mistral_7b,HFv2 BBH,29.43,,hf_open_llm_v2_240829.csv hermes_2_pro_mistral_7b,HFv2 GPQA,3.13,,hf_open_llm_v2_240829.csv hermes_2_pro_mistral_7b,HFv2 IFEval,56.68,,hf_open_llm_v2_240829.csv hermes_2_pro_mistral_7b,HFv2 MMLU Pro,21.63,,hf_open_llm_v2_240829.csv hermes_2_pro_mistral_7b,HFv2 Math Level 5,4.83,,hf_open_llm_v2_240829.csv hermes_2_pro_mistral_7b,HFv2 MuSR,14.13,,hf_open_llm_v2_240829.csv hermes_3_llama3_1_70b,HF 
OpenLLM v2,37.31,,hf_open_llm_v2_240829.csv hermes_3_llama3_1_70b,HFv2 BBH,53.77,,hf_open_llm_v2_240829.csv hermes_3_llama3_1_70b,HFv2 GPQA,14.88,,hf_open_llm_v2_240829.csv hermes_3_llama3_1_70b,HFv2 IFEval,76.61,,hf_open_llm_v2_240829.csv hermes_3_llama3_1_70b,HFv2 MMLU Pro,41.41,,hf_open_llm_v2_240829.csv hermes_3_llama3_1_70b,HFv2 Math Level 5,13.75,,hf_open_llm_v2_240829.csv hermes_3_llama3_1_70b,HFv2 MuSR,23.43,,hf_open_llm_v2_240829.csv hermes_3_llama3_1_8b,HF OpenLLM v2,23.49,,hf_open_llm_v2_240829.csv hermes_3_llama3_1_8b,HFv2 BBH,30.72,,hf_open_llm_v2_240829.csv hermes_3_llama3_1_8b,HFv2 GPQA,6.38,,hf_open_llm_v2_240829.csv hermes_3_llama3_1_8b,HFv2 IFEval,61.7,,hf_open_llm_v2_240829.csv hermes_3_llama3_1_8b,HFv2 MMLU Pro,23.77,,hf_open_llm_v2_240829.csv hermes_3_llama3_1_8b,HFv2 Math Level 5,4.76,,hf_open_llm_v2_240829.csv hermes_3_llama3_1_8b,HFv2 MuSR,13.62,,hf_open_llm_v2_240829.csv humanish_rp_llama3_1_8b,HF OpenLLM v2,25.17,,hf_open_llm_v2_240829.csv humanish_rp_llama3_1_8b,HFv2 BBH,29.96,,hf_open_llm_v2_240829.csv humanish_rp_llama3_1_8b,HFv2 GPQA,4.92,,hf_open_llm_v2_240829.csv humanish_rp_llama3_1_8b,HFv2 IFEval,66.69,,hf_open_llm_v2_240829.csv humanish_rp_llama3_1_8b,HFv2 MMLU Pro,27.52,,hf_open_llm_v2_240829.csv humanish_rp_llama3_1_8b,HFv2 Math Level 5,13.67,,hf_open_llm_v2_240829.csv humanish_rp_llama3_1_8b,HFv2 MuSR,8.27,,hf_open_llm_v2_240829.csv infinity_instruct_3m_0625_llama3_8b,HF OpenLLM v2,21.47,,hf_open_llm_v2_240829.csv infinity_instruct_3m_0625_llama3_8b,HFv2 BBH,28.99,,hf_open_llm_v2_240829.csv infinity_instruct_3m_0625_llama3_8b,HFv2 GPQA,3.36,,hf_open_llm_v2_240829.csv infinity_instruct_3m_0625_llama3_8b,HFv2 IFEval,60.5,,hf_open_llm_v2_240829.csv infinity_instruct_3m_0625_llama3_8b,HFv2 MMLU Pro,25.02,,hf_open_llm_v2_240829.csv infinity_instruct_3m_0625_llama3_8b,HFv2 Math Level 5,5.29,,hf_open_llm_v2_240829.csv infinity_instruct_3m_0625_llama3_8b,HFv2 MuSR,5.67,,hf_open_llm_v2_240829.csv instructlm_500m,HF OpenLLM v2,2.85,,hf_open_llm_v2_240829.csv instructlm_500m,HFv2 BBH,2.32,,hf_open_llm_v2_240829.csv instructlm_500m,HFv2 GPQA,0.89,,hf_open_llm_v2_240829.csv instructlm_500m,HFv2 IFEval,10.28,,hf_open_llm_v2_240829.csv instructlm_500m,HFv2 MMLU Pro,1.57,,hf_open_llm_v2_240829.csv instructlm_500m,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv instructlm_500m,HFv2 MuSR,2.07,,hf_open_llm_v2_240829.csv internlm2_1_8b,HF OpenLLM v2,8.58,,hf_open_llm_v2_240829.csv internlm2_1_8b,HFv2 BBH,13.63,,hf_open_llm_v2_240829.csv internlm2_1_8b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv internlm2_1_8b,HFv2 IFEval,21.98,,hf_open_llm_v2_240829.csv internlm2_1_8b,HFv2 MMLU Pro,6.54,,hf_open_llm_v2_240829.csv internlm2_1_8b,HFv2 Math Level 5,1.13,,hf_open_llm_v2_240829.csv internlm2_1_8b,HFv2 MuSR,8.23,,hf_open_llm_v2_240829.csv internlm2_5_1_8b_chat,HF OpenLLM v2,12.11,,hf_open_llm_v2_240829.csv internlm2_5_1_8b_chat,HFv2 BBH,21.03,,hf_open_llm_v2_240829.csv internlm2_5_1_8b_chat,HFv2 GPQA,5.37,,hf_open_llm_v2_240829.csv internlm2_5_1_8b_chat,HFv2 IFEval,38.49,,hf_open_llm_v2_240829.csv internlm2_5_1_8b_chat,HFv2 MMLU Pro,3.32,,hf_open_llm_v2_240829.csv internlm2_5_1_8b_chat,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv internlm2_5_1_8b_chat,HFv2 MuSR,4.42,,hf_open_llm_v2_240829.csv internlm2_5_20b_chat,HF OpenLLM v2,32.08,,hf_open_llm_v2_240829.csv internlm2_5_20b_chat,HFv2 BBH,62.83,,hf_open_llm_v2_240829.csv internlm2_5_20b_chat,HFv2 GPQA,9.51,,hf_open_llm_v2_240829.csv internlm2_5_20b_chat,HFv2 IFEval,70.1,,hf_open_llm_v2_240829.csv internlm2_5_20b_chat,HFv2 
MMLU Pro,33.31,,hf_open_llm_v2_240829.csv internlm2_5_20b_chat,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv internlm2_5_20b_chat,HFv2 MuSR,16.74,,hf_open_llm_v2_240829.csv internlm2_5_7b_chat,HF OpenLLM v2,30.46,,hf_open_llm_v2_240829.csv internlm2_5_7b_chat,HFv2 BBH,57.67,,hf_open_llm_v2_240829.csv internlm2_5_7b_chat,HFv2 GPQA,10.63,,hf_open_llm_v2_240829.csv internlm2_5_7b_chat,HFv2 IFEval,61.4,,hf_open_llm_v2_240829.csv internlm2_5_7b_chat,HFv2 MMLU Pro,30.42,,hf_open_llm_v2_240829.csv internlm2_5_7b_chat,HFv2 Math Level 5,8.31,,hf_open_llm_v2_240829.csv internlm2_5_7b_chat,HFv2 MuSR,14.35,,hf_open_llm_v2_240829.csv internlm2_chat_1_8b,HF OpenLLM v2,10.5,,hf_open_llm_v2_240829.csv internlm2_chat_1_8b,HFv2 BBH,20.67,,hf_open_llm_v2_240829.csv internlm2_chat_1_8b,HFv2 GPQA,2.13,,hf_open_llm_v2_240829.csv internlm2_chat_1_8b,HFv2 IFEval,23.87,,hf_open_llm_v2_240829.csv internlm2_chat_1_8b,HFv2 MMLU Pro,9.33,,hf_open_llm_v2_240829.csv internlm2_chat_1_8b,HFv2 Math Level 5,2.42,,hf_open_llm_v2_240829.csv internlm2_chat_1_8b,HFv2 MuSR,4.61,,hf_open_llm_v2_240829.csv jamba_v0_1,HF OpenLLM v2,9.1,,hf_open_llm_v2_240829.csv jamba_v0_1,HFv2 BBH,10.72,,hf_open_llm_v2_240829.csv jamba_v0_1,HFv2 GPQA,2.46,,hf_open_llm_v2_240829.csv jamba_v0_1,HFv2 IFEval,20.26,,hf_open_llm_v2_240829.csv jamba_v0_1,HFv2 MMLU Pro,16.45,,hf_open_llm_v2_240829.csv jamba_v0_1,HFv2 Math Level 5,0.98,,hf_open_llm_v2_240829.csv jamba_v0_1,HFv2 MuSR,3.71,,hf_open_llm_v2_240829.csv josiev4o_8b_stage1_v4,HF OpenLLM v2,15.57,,hf_open_llm_v2_240829.csv josiev4o_8b_stage1_v4,HFv2 BBH,25.79,,hf_open_llm_v2_240829.csv josiev4o_8b_stage1_v4,HFv2 GPQA,5.59,,hf_open_llm_v2_240829.csv josiev4o_8b_stage1_v4,HFv2 IFEval,25.53,,hf_open_llm_v2_240829.csv josiev4o_8b_stage1_v4,HFv2 MMLU Pro,25.74,,hf_open_llm_v2_240829.csv josiev4o_8b_stage1_v4,HFv2 Math Level 5,4.68,,hf_open_llm_v2_240829.csv josiev4o_8b_stage1_v4,HFv2 MuSR,6.08,,hf_open_llm_v2_240829.csv k2,HF OpenLLM v2,14.53,,hf_open_llm_v2_240829.csv k2,HFv2 BBH,28.22,,hf_open_llm_v2_240829.csv k2,HFv2 GPQA,3.58,,hf_open_llm_v2_240829.csv k2,HFv2 IFEval,22.52,,hf_open_llm_v2_240829.csv k2,HFv2 MMLU Pro,22.27,,hf_open_llm_v2_240829.csv k2,HFv2 Math Level 5,2.04,,hf_open_llm_v2_240829.csv k2,HFv2 MuSR,8.55,,hf_open_llm_v2_240829.csv k2_chat,HF OpenLLM v2,22.93,,hf_open_llm_v2_240829.csv k2_chat,HFv2 BBH,33.79,,hf_open_llm_v2_240829.csv k2_chat,HFv2 GPQA,7.49,,hf_open_llm_v2_240829.csv k2_chat,HFv2 IFEval,51.52,,hf_open_llm_v2_240829.csv k2_chat,HFv2 MMLU Pro,26.34,,hf_open_llm_v2_240829.csv k2_chat,HFv2 Math Level 5,1.59,,hf_open_llm_v2_240829.csv k2_chat,HFv2 MuSR,16.82,,hf_open_llm_v2_240829.csv lion_gemma_2b_dpo_v1_0,HF OpenLLM v2,11.48,,hf_open_llm_v2_240829.csv lion_gemma_2b_dpo_v1_0,HFv2 BBH,14.59,,hf_open_llm_v2_240829.csv lion_gemma_2b_dpo_v1_0,HFv2 GPQA,0.45,,hf_open_llm_v2_240829.csv lion_gemma_2b_dpo_v1_0,HFv2 IFEval,32.78,,hf_open_llm_v2_240829.csv lion_gemma_2b_dpo_v1_0,HFv2 MMLU Pro,7.4,,hf_open_llm_v2_240829.csv lion_gemma_2b_dpo_v1_0,HFv2 Math Level 5,4.31,,hf_open_llm_v2_240829.csv lion_gemma_2b_dpo_v1_0,HFv2 MuSR,9.83,,hf_open_llm_v2_240829.csv lion_gemma_2b_odpo_v1_0,HF OpenLLM v2,11.36,,hf_open_llm_v2_240829.csv lion_gemma_2b_odpo_v1_0,HFv2 BBH,14.02,,hf_open_llm_v2_240829.csv lion_gemma_2b_odpo_v1_0,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv lion_gemma_2b_odpo_v1_0,HFv2 IFEval,30.66,,hf_open_llm_v2_240829.csv lion_gemma_2b_odpo_v1_0,HFv2 MMLU Pro,7.69,,hf_open_llm_v2_240829.csv lion_gemma_2b_odpo_v1_0,HFv2 Math Level 5,3.7,,hf_open_llm_v2_240829.csv 
lion_gemma_2b_odpo_v1_0,HFv2 MuSR,12.06,,hf_open_llm_v2_240829.csv lion_gemma_2b_sft_v1_0,HF OpenLLM v2,12.33,,hf_open_llm_v2_240829.csv lion_gemma_2b_sft_v1_0,HFv2 BBH,14.12,,hf_open_llm_v2_240829.csv lion_gemma_2b_sft_v1_0,HFv2 GPQA,0.78,,hf_open_llm_v2_240829.csv lion_gemma_2b_sft_v1_0,HFv2 IFEval,36.92,,hf_open_llm_v2_240829.csv lion_gemma_2b_sft_v1_0,HFv2 MMLU Pro,8.69,,hf_open_llm_v2_240829.csv lion_gemma_2b_sft_v1_0,HFv2 Math Level 5,5.14,,hf_open_llm_v2_240829.csv lion_gemma_2b_sft_v1_0,HFv2 MuSR,8.31,,hf_open_llm_v2_240829.csv lion_llama3_8b_dpo_v1_0,HF OpenLLM v2,21.34,,hf_open_llm_v2_240829.csv lion_llama3_8b_dpo_v1_0,HFv2 BBH,30.36,,hf_open_llm_v2_240829.csv lion_llama3_8b_dpo_v1_0,HFv2 GPQA,4.14,,hf_open_llm_v2_240829.csv lion_llama3_8b_dpo_v1_0,HFv2 IFEval,49.57,,hf_open_llm_v2_240829.csv lion_llama3_8b_dpo_v1_0,HFv2 MMLU Pro,24.65,,hf_open_llm_v2_240829.csv lion_llama3_8b_dpo_v1_0,HFv2 Math Level 5,9.06,,hf_open_llm_v2_240829.csv lion_llama3_8b_dpo_v1_0,HFv2 MuSR,10.28,,hf_open_llm_v2_240829.csv lion_llama3_8b_odpo_v1_0,HF OpenLLM v2,19.29,,hf_open_llm_v2_240829.csv lion_llama3_8b_odpo_v1_0,HFv2 BBH,30.46,,hf_open_llm_v2_240829.csv lion_llama3_8b_odpo_v1_0,HFv2 GPQA,4.7,,hf_open_llm_v2_240829.csv lion_llama3_8b_odpo_v1_0,HFv2 IFEval,39.68,,hf_open_llm_v2_240829.csv lion_llama3_8b_odpo_v1_0,HFv2 MMLU Pro,23.92,,hf_open_llm_v2_240829.csv lion_llama3_8b_odpo_v1_0,HFv2 Math Level 5,7.25,,hf_open_llm_v2_240829.csv lion_llama3_8b_odpo_v1_0,HFv2 MuSR,9.72,,hf_open_llm_v2_240829.csv lion_llama3_8b_sft_v1_0,HF OpenLLM v2,20.26,,hf_open_llm_v2_240829.csv lion_llama3_8b_sft_v1_0,HFv2 BBH,30.88,,hf_open_llm_v2_240829.csv lion_llama3_8b_sft_v1_0,HFv2 GPQA,3.69,,hf_open_llm_v2_240829.csv lion_llama3_8b_sft_v1_0,HFv2 IFEval,38.17,,hf_open_llm_v2_240829.csv lion_llama3_8b_sft_v1_0,HFv2 MMLU Pro,24.86,,hf_open_llm_v2_240829.csv lion_llama3_8b_sft_v1_0,HFv2 Math Level 5,8.46,,hf_open_llm_v2_240829.csv lion_llama3_8b_sft_v1_0,HFv2 MuSR,15.48,,hf_open_llm_v2_240829.csv llama3_1_70b,HF OpenLLM v2,25.91,,hf_open_llm_v2_240829.csv llama3_1_70b,HFv2 BBH,46.4,,hf_open_llm_v2_240829.csv llama3_1_70b,HFv2 GPQA,18.34,,hf_open_llm_v2_240829.csv llama3_1_70b,HFv2 IFEval,16.84,,hf_open_llm_v2_240829.csv llama3_1_70b,HFv2 MMLU Pro,40.6,,hf_open_llm_v2_240829.csv llama3_1_70b,HFv2 Math Level 5,16.69,,hf_open_llm_v2_240829.csv llama3_1_70b,HFv2 MuSR,16.58,,hf_open_llm_v2_240829.csv llama3_1_70b_instruct,HF OpenLLM v2,41.74,,hf_open_llm_v2_240829.csv llama3_1_70b_instruct,HFv2 BBH,55.93,,hf_open_llm_v2_240829.csv llama3_1_70b_instruct,HFv2 GPQA,14.21,,hf_open_llm_v2_240829.csv llama3_1_70b_instruct,HFv2 IFEval,86.69,,hf_open_llm_v2_240829.csv llama3_1_70b_instruct,HFv2 MMLU Pro,47.88,,hf_open_llm_v2_240829.csv llama3_1_70b_instruct,HFv2 Math Level 5,28.02,,hf_open_llm_v2_240829.csv llama3_1_70b_instruct,HFv2 MuSR,17.69,,hf_open_llm_v2_240829.csv llama3_1_8b,HF OpenLLM v2,13.78,,hf_open_llm_v2_240829.csv llama3_1_8b,HFv2 BBH,25.29,,hf_open_llm_v2_240829.csv llama3_1_8b,HFv2 GPQA,6.15,,hf_open_llm_v2_240829.csv llama3_1_8b,HFv2 IFEval,12.7,,hf_open_llm_v2_240829.csv llama3_1_8b,HFv2 MMLU Pro,24.95,,hf_open_llm_v2_240829.csv llama3_1_8b,HFv2 Math Level 5,4.61,,hf_open_llm_v2_240829.csv llama3_1_8b,HFv2 MuSR,8.98,,hf_open_llm_v2_240829.csv llama3_1_8b_fireplace2,HF OpenLLM v2,18.05,,hf_open_llm_v2_240829.csv llama3_1_8b_fireplace2,HFv2 BBH,24.09,,hf_open_llm_v2_240829.csv llama3_1_8b_fireplace2,HFv2 GPQA,5.26,,hf_open_llm_v2_240829.csv llama3_1_8b_fireplace2,HFv2 IFEval,53.28,,hf_open_llm_v2_240829.csv 
llama3_1_8b_fireplace2,HFv2 MMLU Pro,15.82,,hf_open_llm_v2_240829.csv llama3_1_8b_fireplace2,HFv2 Math Level 5,5.66,,hf_open_llm_v2_240829.csv llama3_1_8b_fireplace2,HFv2 MuSR,4.22,,hf_open_llm_v2_240829.csv llama3_1_8b_instruct,HF OpenLLM v2,27.91,,hf_open_llm_v2_240829.csv llama3_1_8b_instruct,HFv2 BBH,29.89,,hf_open_llm_v2_240829.csv llama3_1_8b_instruct,HFv2 GPQA,2.35,,hf_open_llm_v2_240829.csv llama3_1_8b_instruct,HFv2 IFEval,78.56,,hf_open_llm_v2_240829.csv llama3_1_8b_instruct,HFv2 MMLU Pro,30.68,,hf_open_llm_v2_240829.csv llama3_1_8b_instruct,HFv2 Math Level 5,17.6,,hf_open_llm_v2_240829.csv llama3_1_8b_instruct,HFv2 MuSR,8.41,,hf_open_llm_v2_240829.csv llama3_1_instruct_nsfw_pretrained_e1_plus_reddit,HF OpenLLM v2,20.74,,hf_open_llm_v2_240829.csv llama3_1_instruct_nsfw_pretrained_e1_plus_reddit,HFv2 BBH,28.02,,hf_open_llm_v2_240829.csv llama3_1_instruct_nsfw_pretrained_e1_plus_reddit,HFv2 GPQA,5.59,,hf_open_llm_v2_240829.csv llama3_1_instruct_nsfw_pretrained_e1_plus_reddit,HFv2 IFEval,45.21,,hf_open_llm_v2_240829.csv llama3_1_instruct_nsfw_pretrained_e1_plus_reddit,HFv2 MMLU Pro,28.5,,hf_open_llm_v2_240829.csv llama3_1_instruct_nsfw_pretrained_e1_plus_reddit,HFv2 Math Level 5,8.84,,hf_open_llm_v2_240829.csv llama3_1_instruct_nsfw_pretrained_e1_plus_reddit,HFv2 MuSR,8.3,,hf_open_llm_v2_240829.csv llama3_70b,HF OpenLLM v2,26.37,,hf_open_llm_v2_240829.csv llama3_70b,HFv2 BBH,48.71,,hf_open_llm_v2_240829.csv llama3_70b,HFv2 GPQA,19.69,,hf_open_llm_v2_240829.csv llama3_70b,HFv2 IFEval,16.03,,hf_open_llm_v2_240829.csv llama3_70b,HFv2 MMLU Pro,41.21,,hf_open_llm_v2_240829.csv llama3_70b,HFv2 Math Level 5,16.54,,hf_open_llm_v2_240829.csv llama3_70b,HFv2 MuSR,16.01,,hf_open_llm_v2_240829.csv llama3_70b_instruct,HF OpenLLM v2,36.18,,hf_open_llm_v2_240829.csv llama3_70b_instruct,HFv2 BBH,50.19,,hf_open_llm_v2_240829.csv llama3_70b_instruct,HFv2 GPQA,4.92,,hf_open_llm_v2_240829.csv llama3_70b_instruct,HFv2 IFEval,80.99,,hf_open_llm_v2_240829.csv llama3_70b_instruct,HFv2 MMLU Pro,46.74,,hf_open_llm_v2_240829.csv llama3_70b_instruct,HFv2 Math Level 5,23.34,,hf_open_llm_v2_240829.csv llama3_70b_instruct,HFv2 MuSR,10.92,,hf_open_llm_v2_240829.csv llama3_70b_shiningvaliant2,HF OpenLLM v2,30.45,,hf_open_llm_v2_240829.csv llama3_70b_shiningvaliant2,HFv2 BBH,46.71,,hf_open_llm_v2_240829.csv llama3_70b_shiningvaliant2,HFv2 GPQA,10.74,,hf_open_llm_v2_240829.csv llama3_70b_shiningvaliant2,HFv2 IFEval,61.22,,hf_open_llm_v2_240829.csv llama3_70b_shiningvaliant2,HFv2 MMLU Pro,43.31,,hf_open_llm_v2_240829.csv llama3_70b_shiningvaliant2,HFv2 Math Level 5,7.1,,hf_open_llm_v2_240829.csv llama3_70b_shiningvaliant2,HFv2 MuSR,13.64,,hf_open_llm_v2_240829.csv llama3_8b,HF OpenLLM v2,13.41,,hf_open_llm_v2_240829.csv llama3_8b,HFv2 BBH,24.5,,hf_open_llm_v2_240829.csv llama3_8b,HFv2 GPQA,7.38,,hf_open_llm_v2_240829.csv llama3_8b,HFv2 IFEval,14.55,,hf_open_llm_v2_240829.csv llama3_8b,HFv2 MMLU Pro,24.55,,hf_open_llm_v2_240829.csv llama3_8b,HFv2 Math Level 5,3.25,,hf_open_llm_v2_240829.csv llama3_8b,HFv2 MuSR,6.24,,hf_open_llm_v2_240829.csv llama3_8b_instruct,HF OpenLLM v2,23.91,,hf_open_llm_v2_240829.csv llama3_8b_instruct,HFv2 BBH,28.24,,hf_open_llm_v2_240829.csv llama3_8b_instruct,HFv2 GPQA,5.7,,hf_open_llm_v2_240829.csv llama3_8b_instruct,HFv2 IFEval,74.08,,hf_open_llm_v2_240829.csv llama3_8b_instruct,HFv2 MMLU Pro,29.6,,hf_open_llm_v2_240829.csv llama3_8b_instruct,HFv2 Math Level 5,8.69,,hf_open_llm_v2_240829.csv llama3_8b_instruct,HFv2 MuSR,5.4,,hf_open_llm_v2_240829.csv llama3_8b_instruct_gradient_1048k,HF 
OpenLLM v2,18.12,,hf_open_llm_v2_240829.csv llama3_8b_instruct_gradient_1048k,HFv2 BBH,21.01,,hf_open_llm_v2_240829.csv llama3_8b_instruct_gradient_1048k,HFv2 GPQA,3.69,,hf_open_llm_v2_240829.csv llama3_8b_instruct_gradient_1048k,HFv2 IFEval,44.56,,hf_open_llm_v2_240829.csv llama3_8b_instruct_gradient_1048k,HFv2 MMLU Pro,21.56,,hf_open_llm_v2_240829.csv llama3_8b_instruct_gradient_1048k,HFv2 Math Level 5,4.38,,hf_open_llm_v2_240829.csv llama3_8b_instruct_gradient_1048k,HFv2 MuSR,13.52,,hf_open_llm_v2_240829.csv llama3_8b_magpie_align_v0_1,HF OpenLLM v2,16.47,,hf_open_llm_v2_240829.csv llama3_8b_magpie_align_v0_1,HFv2 BBH,26.69,,hf_open_llm_v2_240829.csv llama3_8b_magpie_align_v0_1,HFv2 GPQA,3.58,,hf_open_llm_v2_240829.csv llama3_8b_magpie_align_v0_1,HFv2 IFEval,41.18,,hf_open_llm_v2_240829.csv llama3_8b_magpie_align_v0_1,HFv2 MMLU Pro,22.29,,hf_open_llm_v2_240829.csv llama3_8b_magpie_align_v0_1,HFv2 Math Level 5,3.4,,hf_open_llm_v2_240829.csv llama3_8b_magpie_align_v0_1,HFv2 MuSR,1.92,,hf_open_llm_v2_240829.csv llama3_8b_magpie_align_v0_3,HF OpenLLM v2,16.89,,hf_open_llm_v2_240829.csv llama3_8b_magpie_align_v0_3,HFv2 BBH,24.31,,hf_open_llm_v2_240829.csv llama3_8b_magpie_align_v0_3,HFv2 GPQA,2.01,,hf_open_llm_v2_240829.csv llama3_8b_magpie_align_v0_3,HFv2 IFEval,44.97,,hf_open_llm_v2_240829.csv llama3_8b_magpie_align_v0_3,HFv2 MMLU Pro,23.71,,hf_open_llm_v2_240829.csv llama3_8b_magpie_align_v0_3,HFv2 Math Level 5,2.57,,hf_open_llm_v2_240829.csv llama3_8b_magpie_align_v0_3,HFv2 MuSR,3.74,,hf_open_llm_v2_240829.csv llama3_cantonese_8b_instruct,HF OpenLLM v2,24.16,,hf_open_llm_v2_240829.csv llama3_cantonese_8b_instruct,HFv2 BBH,26.79,,hf_open_llm_v2_240829.csv llama3_cantonese_8b_instruct,HFv2 GPQA,5.82,,hf_open_llm_v2_240829.csv llama3_cantonese_8b_instruct,HFv2 IFEval,66.69,,hf_open_llm_v2_240829.csv llama3_cantonese_8b_instruct,HFv2 MMLU Pro,27.94,,hf_open_llm_v2_240829.csv llama3_cantonese_8b_instruct,HFv2 Math Level 5,8.23,,hf_open_llm_v2_240829.csv llama3_cantonese_8b_instruct,HFv2 MuSR,9.48,,hf_open_llm_v2_240829.csv llama3_instruct_8b_cpo_simpo,HF OpenLLM v2,24.48,,hf_open_llm_v2_240829.csv llama3_instruct_8b_cpo_simpo,HFv2 BBH,29.76,,hf_open_llm_v2_240829.csv llama3_instruct_8b_cpo_simpo,HFv2 GPQA,5.7,,hf_open_llm_v2_240829.csv llama3_instruct_8b_cpo_simpo,HFv2 IFEval,70.46,,hf_open_llm_v2_240829.csv llama3_instruct_8b_cpo_simpo,HFv2 MMLU Pro,29.84,,hf_open_llm_v2_240829.csv llama3_instruct_8b_cpo_simpo,HFv2 Math Level 5,7.7,,hf_open_llm_v2_240829.csv llama3_instruct_8b_cpo_simpo,HFv2 MuSR,3.42,,hf_open_llm_v2_240829.csv llama3_instruct_8b_simpo,HF OpenLLM v2,24.71,,hf_open_llm_v2_240829.csv llama3_instruct_8b_simpo,HFv2 BBH,28.23,,hf_open_llm_v2_240829.csv llama3_instruct_8b_simpo,HFv2 GPQA,5.37,,hf_open_llm_v2_240829.csv llama3_instruct_8b_simpo,HFv2 IFEval,73.47,,hf_open_llm_v2_240829.csv llama3_instruct_8b_simpo,HFv2 MMLU Pro,30.37,,hf_open_llm_v2_240829.csv llama3_instruct_8b_simpo,HFv2 Math Level 5,7.1,,hf_open_llm_v2_240829.csv llama3_instruct_8b_simpo,HFv2 MuSR,3.74,,hf_open_llm_v2_240829.csv llama3_instruct_8b_sppo_iter2,HF OpenLLM v2,23.78,,hf_open_llm_v2_240829.csv llama3_instruct_8b_sppo_iter2,HFv2 BBH,29.87,,hf_open_llm_v2_240829.csv llama3_instruct_8b_sppo_iter2,HFv2 GPQA,2.24,,hf_open_llm_v2_240829.csv llama3_instruct_8b_sppo_iter2,HFv2 IFEval,69.89,,hf_open_llm_v2_240829.csv llama3_instruct_8b_sppo_iter2,HFv2 MMLU Pro,29.91,,hf_open_llm_v2_240829.csv llama3_instruct_8b_sppo_iter2,HFv2 Math Level 5,8.76,,hf_open_llm_v2_240829.csv llama3_instruct_8b_sppo_iter2,HFv2 
MuSR,2.0,,hf_open_llm_v2_240829.csv llama3_instruct_8b_sppo_iter3,HF OpenLLM v2,23.06,,hf_open_llm_v2_240829.csv llama3_instruct_8b_sppo_iter3,HFv2 BBH,29.72,,hf_open_llm_v2_240829.csv llama3_instruct_8b_sppo_iter3,HFv2 GPQA,2.01,,hf_open_llm_v2_240829.csv llama3_instruct_8b_sppo_iter3,HFv2 IFEval,67.03,,hf_open_llm_v2_240829.csv llama3_instruct_8b_sppo_iter3,HFv2 MMLU Pro,29.53,,hf_open_llm_v2_240829.csv llama3_instruct_8b_sppo_iter3,HFv2 Math Level 5,7.18,,hf_open_llm_v2_240829.csv llama3_instruct_8b_sppo_iter3,HFv2 MuSR,2.89,,hf_open_llm_v2_240829.csv llama3_korean_bllossom_8b,HF OpenLLM v2,20.09,,hf_open_llm_v2_240829.csv llama3_korean_bllossom_8b,HFv2 BBH,26.93,,hf_open_llm_v2_240829.csv llama3_korean_bllossom_8b,HFv2 GPQA,1.68,,hf_open_llm_v2_240829.csv llama3_korean_bllossom_8b,HFv2 IFEval,51.13,,hf_open_llm_v2_240829.csv llama3_korean_bllossom_8b,HFv2 MMLU Pro,28.82,,hf_open_llm_v2_240829.csv llama3_korean_bllossom_8b,HFv2 Math Level 5,8.38,,hf_open_llm_v2_240829.csv llama3_korean_bllossom_8b,HFv2 MuSR,3.63,,hf_open_llm_v2_240829.csv llama3_neuralhercules_5_0_8b,HF OpenLLM v2,15.93,,hf_open_llm_v2_240829.csv llama3_neuralhercules_5_0_8b,HFv2 BBH,16.34,,hf_open_llm_v2_240829.csv llama3_neuralhercules_5_0_8b,HFv2 GPQA,2.46,,hf_open_llm_v2_240829.csv llama3_neuralhercules_5_0_8b,HFv2 IFEval,44.89,,hf_open_llm_v2_240829.csv llama3_neuralhercules_5_0_8b,HFv2 MMLU Pro,21.48,,hf_open_llm_v2_240829.csv llama3_neuralhercules_5_0_8b,HFv2 Math Level 5,3.63,,hf_open_llm_v2_240829.csv llama3_neuralhercules_5_0_8b,HFv2 MuSR,6.78,,hf_open_llm_v2_240829.csv llama3_refueled,HF OpenLLM v2,22.73,,hf_open_llm_v2_240829.csv llama3_refueled,HFv2 BBH,41.72,,hf_open_llm_v2_240829.csv llama3_refueled,HFv2 GPQA,6.6,,hf_open_llm_v2_240829.csv llama3_refueled,HFv2 IFEval,46.2,,hf_open_llm_v2_240829.csv llama3_refueled,HFv2 MMLU Pro,23.28,,hf_open_llm_v2_240829.csv llama3_refueled,HFv2 Math Level 5,3.93,,hf_open_llm_v2_240829.csv llama3_refueled,HFv2 MuSR,14.64,,hf_open_llm_v2_240829.csv llama3_tenyxchat_70b,HF OpenLLM v2,36.54,,hf_open_llm_v2_240829.csv llama3_tenyxchat_70b,HFv2 BBH,49.62,,hf_open_llm_v2_240829.csv llama3_tenyxchat_70b,HFv2 GPQA,6.82,,hf_open_llm_v2_240829.csv llama3_tenyxchat_70b,HFv2 IFEval,80.87,,hf_open_llm_v2_240829.csv llama3_tenyxchat_70b,HFv2 MMLU Pro,46.78,,hf_open_llm_v2_240829.csv llama3_tenyxchat_70b,HFv2 Math Level 5,22.66,,hf_open_llm_v2_240829.csv llama3_tenyxchat_70b,HFv2 MuSR,12.52,,hf_open_llm_v2_240829.csv llama_160m_chat_v1,HF OpenLLM v2,4.1,,hf_open_llm_v2_240829.csv llama_160m_chat_v1,HFv2 BBH,3.17,,hf_open_llm_v2_240829.csv llama_160m_chat_v1,HFv2 GPQA,1.01,,hf_open_llm_v2_240829.csv llama_160m_chat_v1,HFv2 IFEval,15.75,,hf_open_llm_v2_240829.csv llama_160m_chat_v1,HFv2 MMLU Pro,1.51,,hf_open_llm_v2_240829.csv llama_160m_chat_v1,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv llama_160m_chat_v1,HFv2 MuSR,3.17,,hf_open_llm_v2_240829.csv llama_2_13b,HF OpenLLM v2,10.99,,hf_open_llm_v2_240829.csv llama_2_13b,HFv2 BBH,17.22,,hf_open_llm_v2_240829.csv llama_2_13b,HFv2 GPQA,4.14,,hf_open_llm_v2_240829.csv llama_2_13b,HFv2 IFEval,24.82,,hf_open_llm_v2_240829.csv llama_2_13b,HFv2 MMLU Pro,15.31,,hf_open_llm_v2_240829.csv llama_2_13b,HFv2 Math Level 5,1.06,,hf_open_llm_v2_240829.csv llama_2_13b,HFv2 MuSR,3.39,,hf_open_llm_v2_240829.csv llama_2_13b_chat,HF OpenLLM v2,11.0,,hf_open_llm_v2_240829.csv llama_2_13b_chat,HFv2 BBH,7.16,,hf_open_llm_v2_240829.csv llama_2_13b_chat,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv llama_2_13b_chat,HFv2 IFEval,39.85,,hf_open_llm_v2_240829.csv 
llama_2_13b_chat,HFv2 MMLU Pro,10.26,,hf_open_llm_v2_240829.csv llama_2_13b_chat,HFv2 Math Level 5,0.6,,hf_open_llm_v2_240829.csv llama_2_13b_chat,HFv2 MuSR,8.16,,hf_open_llm_v2_240829.csv llama_2_70b,HF OpenLLM v2,18.25,,hf_open_llm_v2_240829.csv llama_2_70b,HFv2 BBH,35.9,,hf_open_llm_v2_240829.csv llama_2_70b,HFv2 GPQA,7.05,,hf_open_llm_v2_240829.csv llama_2_70b,HFv2 IFEval,24.07,,hf_open_llm_v2_240829.csv llama_2_70b,HFv2 MMLU Pro,30.2,,hf_open_llm_v2_240829.csv llama_2_70b,HFv2 Math Level 5,2.49,,hf_open_llm_v2_240829.csv llama_2_70b,HFv2 MuSR,9.78,,hf_open_llm_v2_240829.csv llama_2_70b_chat,HF OpenLLM v2,12.73,,hf_open_llm_v2_240829.csv llama_2_70b_chat,HFv2 BBH,4.61,,hf_open_llm_v2_240829.csv llama_2_70b_chat,HFv2 GPQA,1.9,,hf_open_llm_v2_240829.csv llama_2_70b_chat,HFv2 IFEval,49.58,,hf_open_llm_v2_240829.csv llama_2_70b_chat,HFv2 MMLU Pro,15.92,,hf_open_llm_v2_240829.csv llama_2_70b_chat,HFv2 Math Level 5,0.91,,hf_open_llm_v2_240829.csv llama_2_70b_chat,HFv2 MuSR,3.48,,hf_open_llm_v2_240829.csv llama_2_7b,HF OpenLLM v2,8.72,,hf_open_llm_v2_240829.csv llama_2_7b,HFv2 BBH,10.35,,hf_open_llm_v2_240829.csv llama_2_7b,HFv2 GPQA,2.24,,hf_open_llm_v2_240829.csv llama_2_7b,HFv2 IFEval,25.19,,hf_open_llm_v2_240829.csv llama_2_7b,HFv2 MMLU Pro,9.57,,hf_open_llm_v2_240829.csv llama_2_7b,HFv2 Math Level 5,1.21,,hf_open_llm_v2_240829.csv llama_2_7b,HFv2 MuSR,3.76,,hf_open_llm_v2_240829.csv llama_2_7b_chat,HF OpenLLM v2,9.4,,hf_open_llm_v2_240829.csv llama_2_7b_chat,HFv2 BBH,4.49,,hf_open_llm_v2_240829.csv llama_2_7b_chat,HFv2 GPQA,0.56,,hf_open_llm_v2_240829.csv llama_2_7b_chat,HFv2 IFEval,39.65,,hf_open_llm_v2_240829.csv llama_2_7b_chat,HFv2 MMLU Pro,7.52,,hf_open_llm_v2_240829.csv llama_2_7b_chat,HFv2 Math Level 5,0.68,,hf_open_llm_v2_240829.csv llama_2_7b_chat,HFv2 MuSR,3.48,,hf_open_llm_v2_240829.csv llama_65b,HF OpenLLM v2,13.54,,hf_open_llm_v2_240829.csv llama_65b,HFv2 BBH,25.25,,hf_open_llm_v2_240829.csv llama_65b,HFv2 GPQA,3.47,,hf_open_llm_v2_240829.csv llama_65b,HFv2 IFEval,25.26,,hf_open_llm_v2_240829.csv llama_65b,HFv2 MMLU Pro,23.08,,hf_open_llm_v2_240829.csv llama_65b,HFv2 Math Level 5,2.19,,hf_open_llm_v2_240829.csv llama_65b,HFv2 MuSR,1.97,,hf_open_llm_v2_240829.csv llama_pro_8b_instruct,HF OpenLLM v2,15.14,,hf_open_llm_v2_240829.csv llama_pro_8b_instruct,HFv2 BBH,19.49,,hf_open_llm_v2_240829.csv llama_pro_8b_instruct,HFv2 GPQA,3.24,,hf_open_llm_v2_240829.csv llama_pro_8b_instruct,HFv2 IFEval,44.86,,hf_open_llm_v2_240829.csv llama_pro_8b_instruct,HFv2 MMLU Pro,10.51,,hf_open_llm_v2_240829.csv llama_pro_8b_instruct,HFv2 Math Level 5,1.66,,hf_open_llm_v2_240829.csv llama_pro_8b_instruct,HFv2 MuSR,11.11,,hf_open_llm_v2_240829.csv luxia_21_4b_alignment_v1_0,HF OpenLLM v2,22.86,,hf_open_llm_v2_240829.csv luxia_21_4b_alignment_v1_0,HFv2 BBH,48.02,,hf_open_llm_v2_240829.csv luxia_21_4b_alignment_v1_0,HFv2 GPQA,6.82,,hf_open_llm_v2_240829.csv luxia_21_4b_alignment_v1_0,HFv2 IFEval,36.93,,hf_open_llm_v2_240829.csv luxia_21_4b_alignment_v1_0,HFv2 MMLU Pro,26.7,,hf_open_llm_v2_240829.csv luxia_21_4b_alignment_v1_0,HFv2 Math Level 5,6.19,,hf_open_llm_v2_240829.csv luxia_21_4b_alignment_v1_0,HFv2 MuSR,12.51,,hf_open_llm_v2_240829.csv luxia_21_4b_alignment_v1_2,HF OpenLLM v2,23.44,,hf_open_llm_v2_240829.csv luxia_21_4b_alignment_v1_2,HFv2 BBH,47.77,,hf_open_llm_v2_240829.csv luxia_21_4b_alignment_v1_2,HFv2 GPQA,7.72,,hf_open_llm_v2_240829.csv luxia_21_4b_alignment_v1_2,HFv2 IFEval,41.15,,hf_open_llm_v2_240829.csv luxia_21_4b_alignment_v1_2,HFv2 MMLU Pro,27.48,,hf_open_llm_v2_240829.csv 
luxia_21_4b_alignment_v1_2,HFv2 Math Level 5,1.59,,hf_open_llm_v2_240829.csv luxia_21_4b_alignment_v1_2,HFv2 MuSR,14.9,,hf_open_llm_v2_240829.csv magnum_72b_v1,HF OpenLLM v2,42.17,,hf_open_llm_v2_240829.csv magnum_72b_v1,HFv2 BBH,57.65,,hf_open_llm_v2_240829.csv magnum_72b_v1,HFv2 GPQA,18.79,,hf_open_llm_v2_240829.csv magnum_72b_v1,HFv2 IFEval,76.06,,hf_open_llm_v2_240829.csv magnum_72b_v1,HFv2 MMLU Pro,49.64,,hf_open_llm_v2_240829.csv magnum_72b_v1,HFv2 Math Level 5,35.27,,hf_open_llm_v2_240829.csv magnum_72b_v1,HFv2 MuSR,15.62,,hf_open_llm_v2_240829.csv maid_yuzu_v7,HF OpenLLM v2,24.38,,hf_open_llm_v2_240829.csv maid_yuzu_v7,HFv2 BBH,26.82,,hf_open_llm_v2_240829.csv maid_yuzu_v7,HFv2 GPQA,7.94,,hf_open_llm_v2_240829.csv maid_yuzu_v7,HFv2 IFEval,64.62,,hf_open_llm_v2_240829.csv maid_yuzu_v7,HFv2 MMLU Pro,28.22,,hf_open_llm_v2_240829.csv maid_yuzu_v7,HFv2 Math Level 5,8.91,,hf_open_llm_v2_240829.csv maid_yuzu_v7,HFv2 MuSR,9.77,,hf_open_llm_v2_240829.csv matter_0_2_7b_dpo,HF OpenLLM v2,8.81,,hf_open_llm_v2_240829.csv matter_0_2_7b_dpo,HFv2 BBH,10.06,,hf_open_llm_v2_240829.csv matter_0_2_7b_dpo,HFv2 GPQA,1.23,,hf_open_llm_v2_240829.csv matter_0_2_7b_dpo,HFv2 IFEval,33.03,,hf_open_llm_v2_240829.csv matter_0_2_7b_dpo,HFv2 MMLU Pro,1.82,,hf_open_llm_v2_240829.csv matter_0_2_7b_dpo,HFv2 Math Level 5,0.83,,hf_open_llm_v2_240829.csv matter_0_2_7b_dpo,HFv2 MuSR,5.87,,hf_open_llm_v2_240829.csv merlinite_7b,HF OpenLLM v2,16.74,,hf_open_llm_v2_240829.csv merlinite_7b,HFv2 BBH,29.98,,hf_open_llm_v2_240829.csv merlinite_7b,HFv2 GPQA,6.26,,hf_open_llm_v2_240829.csv merlinite_7b,HFv2 IFEval,24.99,,hf_open_llm_v2_240829.csv merlinite_7b,HFv2 MMLU Pro,22.98,,hf_open_llm_v2_240829.csv merlinite_7b,HFv2 Math Level 5,2.34,,hf_open_llm_v2_240829.csv merlinite_7b,HFv2 MuSR,13.88,,hf_open_llm_v2_240829.csv minueza_32m_ultrachat,HF OpenLLM v2,3.85,,hf_open_llm_v2_240829.csv minueza_32m_ultrachat,HFv2 BBH,2.44,,hf_open_llm_v2_240829.csv minueza_32m_ultrachat,HFv2 GPQA,0.78,,hf_open_llm_v2_240829.csv minueza_32m_ultrachat,HFv2 IFEval,13.76,,hf_open_llm_v2_240829.csv minueza_32m_ultrachat,HFv2 MMLU Pro,1.48,,hf_open_llm_v2_240829.csv minueza_32m_ultrachat,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv minueza_32m_ultrachat,HFv2 MuSR,4.64,,hf_open_llm_v2_240829.csv mistral7b_pairrm_sppo_iter2,HF OpenLLM v2,17.0,,hf_open_llm_v2_240829.csv mistral7b_pairrm_sppo_iter2,HFv2 BBH,22.48,,hf_open_llm_v2_240829.csv mistral7b_pairrm_sppo_iter2,HFv2 GPQA,5.15,,hf_open_llm_v2_240829.csv mistral7b_pairrm_sppo_iter2,HFv2 IFEval,44.46,,hf_open_llm_v2_240829.csv mistral7b_pairrm_sppo_iter2,HFv2 MMLU Pro,18.63,,hf_open_llm_v2_240829.csv mistral7b_pairrm_sppo_iter2,HFv2 Math Level 5,1.51,,hf_open_llm_v2_240829.csv mistral7b_pairrm_sppo_iter2,HFv2 MuSR,9.8,,hf_open_llm_v2_240829.csv mistral7b_pairrm_sppo_iter3,HF OpenLLM v2,16.36,,hf_open_llm_v2_240829.csv mistral7b_pairrm_sppo_iter3,HFv2 BBH,21.82,,hf_open_llm_v2_240829.csv mistral7b_pairrm_sppo_iter3,HFv2 GPQA,3.36,,hf_open_llm_v2_240829.csv mistral7b_pairrm_sppo_iter3,HFv2 IFEval,43.51,,hf_open_llm_v2_240829.csv mistral7b_pairrm_sppo_iter3,HFv2 MMLU Pro,18.42,,hf_open_llm_v2_240829.csv mistral7b_pairrm_sppo_iter3,HFv2 Math Level 5,1.59,,hf_open_llm_v2_240829.csv mistral7b_pairrm_sppo_iter3,HFv2 MuSR,9.49,,hf_open_llm_v2_240829.csv mistral_7b_instruct_v0_1,HF OpenLLM v2,12.67,,hf_open_llm_v2_240829.csv mistral_7b_instruct_v0_1,HFv2 BBH,7.65,,hf_open_llm_v2_240829.csv mistral_7b_instruct_v0_1,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv mistral_7b_instruct_v0_1,HFv2 
IFEval,44.87,,hf_open_llm_v2_240829.csv mistral_7b_instruct_v0_1,HFv2 MMLU Pro,15.72,,hf_open_llm_v2_240829.csv mistral_7b_instruct_v0_1,HFv2 Math Level 5,1.66,,hf_open_llm_v2_240829.csv mistral_7b_instruct_v0_1,HFv2 MuSR,6.13,,hf_open_llm_v2_240829.csv mistral_7b_instruct_v0_2,HF OpenLLM v2,18.44,,hf_open_llm_v2_240829.csv mistral_7b_instruct_v0_2,HFv2 BBH,22.91,,hf_open_llm_v2_240829.csv mistral_7b_instruct_v0_2,HFv2 GPQA,3.47,,hf_open_llm_v2_240829.csv mistral_7b_instruct_v0_2,HFv2 IFEval,54.96,,hf_open_llm_v2_240829.csv mistral_7b_instruct_v0_2,HFv2 MMLU Pro,19.08,,hf_open_llm_v2_240829.csv mistral_7b_instruct_v0_2,HFv2 Math Level 5,2.64,,hf_open_llm_v2_240829.csv mistral_7b_instruct_v0_2,HFv2 MuSR,7.61,,hf_open_llm_v2_240829.csv mistral_7b_instruct_v0_3,HF OpenLLM v2,19.11,,hf_open_llm_v2_240829.csv mistral_7b_instruct_v0_3,HFv2 BBH,25.57,,hf_open_llm_v2_240829.csv mistral_7b_instruct_v0_3,HFv2 GPQA,3.91,,hf_open_llm_v2_240829.csv mistral_7b_instruct_v0_3,HFv2 IFEval,54.65,,hf_open_llm_v2_240829.csv mistral_7b_instruct_v0_3,HFv2 MMLU Pro,23.06,,hf_open_llm_v2_240829.csv mistral_7b_instruct_v0_3,HFv2 Math Level 5,3.17,,hf_open_llm_v2_240829.csv mistral_7b_instruct_v0_3,HFv2 MuSR,4.3,,hf_open_llm_v2_240829.csv mistral_7b_openorca,HF OpenLLM v2,17.62,,hf_open_llm_v2_240829.csv mistral_7b_openorca,HFv2 BBH,25.84,,hf_open_llm_v2_240829.csv mistral_7b_openorca,HFv2 GPQA,2.91,,hf_open_llm_v2_240829.csv mistral_7b_openorca,HFv2 IFEval,49.78,,hf_open_llm_v2_240829.csv mistral_7b_openorca,HFv2 MMLU Pro,18.37,,hf_open_llm_v2_240829.csv mistral_7b_openorca,HFv2 Math Level 5,2.95,,hf_open_llm_v2_240829.csv mistral_7b_openorca,HFv2 MuSR,5.89,,hf_open_llm_v2_240829.csv mistral_7b_v0_1,HF OpenLLM v2,14.52,,hf_open_llm_v2_240829.csv mistral_7b_v0_1,HFv2 BBH,22.17,,hf_open_llm_v2_240829.csv mistral_7b_v0_1,HFv2 GPQA,5.59,,hf_open_llm_v2_240829.csv mistral_7b_v0_1,HFv2 IFEval,23.86,,hf_open_llm_v2_240829.csv mistral_7b_v0_1,HFv2 MMLU Pro,22.36,,hf_open_llm_v2_240829.csv mistral_7b_v0_1,HFv2 Math Level 5,2.49,,hf_open_llm_v2_240829.csv mistral_7b_v0_1,HFv2 MuSR,10.68,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_1_over_2,HF OpenLLM v2,14.26,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_1_over_2,HFv2 BBH,22.4,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_1_over_2,HFv2 GPQA,7.61,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_1_over_2,HFv2 IFEval,21.79,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_1_over_2,HFv2 MMLU Pro,22.22,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_1_over_2,HFv2 Math Level 5,2.72,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_1_over_2,HFv2 MuSR,8.81,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_1_over_4,HF OpenLLM v2,8.71,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_1_over_4,HFv2 BBH,9.23,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_1_over_4,HFv2 GPQA,2.68,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_1_over_4,HFv2 IFEval,21.33,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_1_over_4,HFv2 MMLU Pro,14.56,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_1_over_4,HFv2 Math Level 5,2.27,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_1_over_4,HFv2 MuSR,2.19,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_3_over_8,HF OpenLLM v2,13.73,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_3_over_8,HFv2 BBH,20.44,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_3_over_8,HFv2 GPQA,7.16,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_3_over_8,HFv2 
IFEval,23.94,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_3_over_8,HFv2 MMLU Pro,22.24,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_3_over_8,HFv2 Math Level 5,2.79,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_3_over_8,HFv2 MuSR,5.79,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_5_over_16,HF OpenLLM v2,12.16,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_5_over_16,HFv2 BBH,17.54,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_5_over_16,HFv2 GPQA,4.14,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_5_over_16,HFv2 IFEval,21.18,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_5_over_16,HFv2 MMLU Pro,21.75,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_5_over_16,HFv2 Math Level 5,2.19,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_5_over_16,HFv2 MuSR,6.14,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_7_over_16,HF OpenLLM v2,14.15,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_7_over_16,HFv2 BBH,21.04,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_7_over_16,HFv2 GPQA,7.16,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_7_over_16,HFv2 IFEval,22.94,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_7_over_16,HFv2 MMLU Pro,22.56,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_7_over_16,HFv2 Math Level 5,3.25,,hf_open_llm_v2_240829.csv mistral_7b_v0_1_signtensors_7_over_16,HFv2 MuSR,7.93,,hf_open_llm_v2_240829.csv mistral_7b_v0_2,HF OpenLLM v2,14.15,,hf_open_llm_v2_240829.csv mistral_7b_v0_2,HFv2 BBH,23.95,,hf_open_llm_v2_240829.csv mistral_7b_v0_2,HFv2 GPQA,5.59,,hf_open_llm_v2_240829.csv mistral_7b_v0_2,HFv2 IFEval,22.66,,hf_open_llm_v2_240829.csv mistral_7b_v0_2,HFv2 MMLU Pro,21.7,,hf_open_llm_v2_240829.csv mistral_7b_v0_2,HFv2 Math Level 5,2.64,,hf_open_llm_v2_240829.csv mistral_7b_v0_2,HFv2 MuSR,8.36,,hf_open_llm_v2_240829.csv mistral_7b_v0_3,HF OpenLLM v2,14.15,,hf_open_llm_v2_240829.csv mistral_7b_v0_3,HFv2 BBH,23.95,,hf_open_llm_v2_240829.csv mistral_7b_v0_3,HFv2 GPQA,5.59,,hf_open_llm_v2_240829.csv mistral_7b_v0_3,HFv2 IFEval,22.66,,hf_open_llm_v2_240829.csv mistral_7b_v0_3,HFv2 MMLU Pro,21.7,,hf_open_llm_v2_240829.csv mistral_7b_v0_3,HFv2 Math Level 5,2.64,,hf_open_llm_v2_240829.csv mistral_7b_v0_3,HFv2 MuSR,8.36,,hf_open_llm_v2_240829.csv mistral_nemo_base_2407,HF OpenLLM v2,15.08,,hf_open_llm_v2_240829.csv mistral_nemo_base_2407,HFv2 BBH,29.37,,hf_open_llm_v2_240829.csv mistral_nemo_base_2407,HFv2 GPQA,5.82,,hf_open_llm_v2_240829.csv mistral_nemo_base_2407,HFv2 IFEval,16.3,,hf_open_llm_v2_240829.csv mistral_nemo_base_2407,HFv2 MMLU Pro,27.46,,hf_open_llm_v2_240829.csv mistral_nemo_base_2407,HFv2 Math Level 5,4.98,,hf_open_llm_v2_240829.csv mistral_nemo_base_2407,HFv2 MuSR,6.52,,hf_open_llm_v2_240829.csv mistral_nemo_instruct_2407,HF OpenLLM v2,22.27,,hf_open_llm_v2_240829.csv mistral_nemo_instruct_2407,HFv2 BBH,27.11,,hf_open_llm_v2_240829.csv mistral_nemo_instruct_2407,HFv2 GPQA,8.72,,hf_open_llm_v2_240829.csv mistral_nemo_instruct_2407,HFv2 IFEval,62.61,,hf_open_llm_v2_240829.csv mistral_nemo_instruct_2407,HFv2 MMLU Pro,26.37,,hf_open_llm_v2_240829.csv mistral_nemo_instruct_2407,HFv2 Math Level 5,0.3,,hf_open_llm_v2_240829.csv mistral_nemo_instruct_2407,HFv2 MuSR,8.48,,hf_open_llm_v2_240829.csv mistral_nemo_minitron_8b_base,HF OpenLLM v2,17.6,,hf_open_llm_v2_240829.csv mistral_nemo_minitron_8b_base,HFv2 BBH,32.52,,hf_open_llm_v2_240829.csv mistral_nemo_minitron_8b_base,HFv2 GPQA,8.61,,hf_open_llm_v2_240829.csv mistral_nemo_minitron_8b_base,HFv2 
IFEval,19.46,,hf_open_llm_v2_240829.csv mistral_nemo_minitron_8b_base,HFv2 MMLU Pro,30.92,,hf_open_llm_v2_240829.csv mistral_nemo_minitron_8b_base,HFv2 Math Level 5,4.31,,hf_open_llm_v2_240829.csv mistral_nemo_minitron_8b_base,HFv2 MuSR,9.77,,hf_open_llm_v2_240829.csv mistral_v0_3_7b_orpo,HF OpenLLM v2,12.08,,hf_open_llm_v2_240829.csv mistral_v0_3_7b_orpo,HFv2 BBH,15.59,,hf_open_llm_v2_240829.csv mistral_v0_3_7b_orpo,HFv2 GPQA,2.57,,hf_open_llm_v2_240829.csv mistral_v0_3_7b_orpo,HFv2 IFEval,37.7,,hf_open_llm_v2_240829.csv mistral_v0_3_7b_orpo,HFv2 MMLU Pro,14.46,,hf_open_llm_v2_240829.csv mistral_v0_3_7b_orpo,HFv2 Math Level 5,0.53,,hf_open_llm_v2_240829.csv mistral_v0_3_7b_orpo,HFv2 MuSR,2.97,,hf_open_llm_v2_240829.csv mixtral_8x22b_instruct_v0_1,HF OpenLLM v2,33.89,,hf_open_llm_v2_240829.csv mixtral_8x22b_instruct_v0_1,HFv2 BBH,44.11,,hf_open_llm_v2_240829.csv mixtral_8x22b_instruct_v0_1,HFv2 GPQA,16.44,,hf_open_llm_v2_240829.csv mixtral_8x22b_instruct_v0_1,HFv2 IFEval,71.84,,hf_open_llm_v2_240829.csv mixtral_8x22b_instruct_v0_1,HFv2 MMLU Pro,38.7,,hf_open_llm_v2_240829.csv mixtral_8x22b_instruct_v0_1,HFv2 Math Level 5,18.73,,hf_open_llm_v2_240829.csv mixtral_8x22b_instruct_v0_1,HFv2 MuSR,13.49,,hf_open_llm_v2_240829.csv mixtral_8x22b_v0_1,HF OpenLLM v2,25.49,,hf_open_llm_v2_240829.csv mixtral_8x22b_v0_1,HFv2 BBH,45.59,,hf_open_llm_v2_240829.csv mixtral_8x22b_v0_1,HFv2 GPQA,16.78,,hf_open_llm_v2_240829.csv mixtral_8x22b_v0_1,HFv2 IFEval,25.83,,hf_open_llm_v2_240829.csv mixtral_8x22b_v0_1,HFv2 MMLU Pro,40.44,,hf_open_llm_v2_240829.csv mixtral_8x22b_v0_1,HFv2 Math Level 5,16.84,,hf_open_llm_v2_240829.csv mixtral_8x22b_v0_1,HFv2 MuSR,7.46,,hf_open_llm_v2_240829.csv mixtral_8x22b_v0_3,HF OpenLLM v2,25.55,,hf_open_llm_v2_240829.csv mixtral_8x22b_v0_3,HFv2 BBH,45.73,,hf_open_llm_v2_240829.csv mixtral_8x22b_v0_3,HFv2 GPQA,17.0,,hf_open_llm_v2_240829.csv mixtral_8x22b_v0_3,HFv2 IFEval,25.83,,hf_open_llm_v2_240829.csv mixtral_8x22b_v0_3,HFv2 MMLU Pro,40.44,,hf_open_llm_v2_240829.csv mixtral_8x22b_v0_3,HFv2 Math Level 5,16.84,,hf_open_llm_v2_240829.csv mixtral_8x22b_v0_3,HFv2 MuSR,7.46,,hf_open_llm_v2_240829.csv mixtral_8x7b_instruct_v0_1,HF OpenLLM v2,24.35,,hf_open_llm_v2_240829.csv mixtral_8x7b_instruct_v0_1,HFv2 BBH,34.02,,hf_open_llm_v2_240829.csv mixtral_8x7b_instruct_v0_1,HFv2 GPQA,7.61,,hf_open_llm_v2_240829.csv mixtral_8x7b_instruct_v0_1,HFv2 IFEval,53.95,,hf_open_llm_v2_240829.csv mixtral_8x7b_instruct_v0_1,HFv2 MMLU Pro,29.36,,hf_open_llm_v2_240829.csv mixtral_8x7b_instruct_v0_1,HFv2 Math Level 5,9.06,,hf_open_llm_v2_240829.csv mixtral_8x7b_instruct_v0_1,HFv2 MuSR,12.11,,hf_open_llm_v2_240829.csv mixtral_8x7b_v0_1,HF OpenLLM v2,19.33,,hf_open_llm_v2_240829.csv mixtral_8x7b_v0_1,HFv2 BBH,30.29,,hf_open_llm_v2_240829.csv mixtral_8x7b_v0_1,HFv2 GPQA,8.5,,hf_open_llm_v2_240829.csv mixtral_8x7b_v0_1,HFv2 IFEval,24.15,,hf_open_llm_v2_240829.csv mixtral_8x7b_v0_1,HFv2 MMLU Pro,31.66,,hf_open_llm_v2_240829.csv mixtral_8x7b_v0_1,HFv2 Math Level 5,8.76,,hf_open_llm_v2_240829.csv mixtral_8x7b_v0_1,HFv2 MuSR,12.58,,hf_open_llm_v2_240829.csv mpt_7b,HF OpenLLM v2,5.98,,hf_open_llm_v2_240829.csv mpt_7b,HFv2 BBH,6.55,,hf_open_llm_v2_240829.csv mpt_7b,HFv2 GPQA,1.34,,hf_open_llm_v2_240829.csv mpt_7b,HFv2 IFEval,21.52,,hf_open_llm_v2_240829.csv mpt_7b,HFv2 MMLU Pro,2.29,,hf_open_llm_v2_240829.csv mpt_7b,HFv2 Math Level 5,1.28,,hf_open_llm_v2_240829.csv mpt_7b,HFv2 MuSR,2.9,,hf_open_llm_v2_240829.csv multiverse_70b,HF OpenLLM v2,31.73,,hf_open_llm_v2_240829.csv multiverse_70b,HFv2 
BBH,46.14,,hf_open_llm_v2_240829.csv multiverse_70b,HFv2 GPQA,13.87,,hf_open_llm_v2_240829.csv multiverse_70b,HFv2 IFEval,52.49,,hf_open_llm_v2_240829.csv multiverse_70b,HFv2 MMLU Pro,42.89,,hf_open_llm_v2_240829.csv multiverse_70b,HFv2 Math Level 5,16.16,,hf_open_llm_v2_240829.csv multiverse_70b,HFv2 MuSR,18.82,,hf_open_llm_v2_240829.csv neuralbeagle14_7b,HF OpenLLM v2,18.83,,hf_open_llm_v2_240829.csv neuralbeagle14_7b,HFv2 BBH,23.96,,hf_open_llm_v2_240829.csv neuralbeagle14_7b,HFv2 GPQA,4.25,,hf_open_llm_v2_240829.csv neuralbeagle14_7b,HFv2 IFEval,49.35,,hf_open_llm_v2_240829.csv neuralbeagle14_7b,HFv2 MMLU Pro,17.79,,hf_open_llm_v2_240829.csv neuralbeagle14_7b,HFv2 Math Level 5,4.76,,hf_open_llm_v2_240829.csv neuralbeagle14_7b,HFv2 MuSR,12.89,,hf_open_llm_v2_240829.csv neuralllama3_8b_orpo_v0_3,HF OpenLLM v2,17.52,,hf_open_llm_v2_240829.csv neuralllama3_8b_orpo_v0_3,HFv2 BBH,22.39,,hf_open_llm_v2_240829.csv neuralllama3_8b_orpo_v0_3,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv neuralllama3_8b_orpo_v0_3,HFv2 IFEval,52.76,,hf_open_llm_v2_240829.csv neuralllama3_8b_orpo_v0_3,HFv2 MMLU Pro,22.85,,hf_open_llm_v2_240829.csv neuralllama3_8b_orpo_v0_3,HFv2 Math Level 5,3.47,,hf_open_llm_v2_240829.csv neuralllama3_8b_orpo_v0_3,HFv2 MuSR,3.65,,hf_open_llm_v2_240829.csv notus_7b_v1,HF OpenLLM v2,18.37,,hf_open_llm_v2_240829.csv notus_7b_v1,HFv2 BBH,22.75,,hf_open_llm_v2_240829.csv notus_7b_v1,HFv2 GPQA,5.26,,hf_open_llm_v2_240829.csv notus_7b_v1,HFv2 IFEval,50.82,,hf_open_llm_v2_240829.csv notus_7b_v1,HFv2 MMLU Pro,22.26,,hf_open_llm_v2_240829.csv notus_7b_v1,HFv2 Math Level 5,2.57,,hf_open_llm_v2_240829.csv notus_7b_v1,HFv2 MuSR,6.59,,hf_open_llm_v2_240829.csv notux_8x7b_v1,HF OpenLLM v2,24.23,,hf_open_llm_v2_240829.csv notux_8x7b_v1,HFv2 BBH,34.76,,hf_open_llm_v2_240829.csv notux_8x7b_v1,HFv2 GPQA,7.83,,hf_open_llm_v2_240829.csv notux_8x7b_v1,HFv2 IFEval,54.22,,hf_open_llm_v2_240829.csv notux_8x7b_v1,HFv2 MMLU Pro,29.56,,hf_open_llm_v2_240829.csv notux_8x7b_v1,HFv2 Math Level 5,8.46,,hf_open_llm_v2_240829.csv notux_8x7b_v1,HFv2 MuSR,10.53,,hf_open_llm_v2_240829.csv nous_hermes_2_mistral_7b_dpo,HF OpenLLM v2,21.01,,hf_open_llm_v2_240829.csv nous_hermes_2_mistral_7b_dpo,HFv2 BBH,27.79,,hf_open_llm_v2_240829.csv nous_hermes_2_mistral_7b_dpo,HFv2 GPQA,5.7,,hf_open_llm_v2_240829.csv nous_hermes_2_mistral_7b_dpo,HFv2 IFEval,57.63,,hf_open_llm_v2_240829.csv nous_hermes_2_mistral_7b_dpo,HFv2 MMLU Pro,22.39,,hf_open_llm_v2_240829.csv nous_hermes_2_mistral_7b_dpo,HFv2 Math Level 5,4.23,,hf_open_llm_v2_240829.csv nous_hermes_2_mistral_7b_dpo,HFv2 MuSR,8.33,,hf_open_llm_v2_240829.csv nous_hermes_2_mixtral_8x7b_dpo,HF OpenLLM v2,27.13,,hf_open_llm_v2_240829.csv nous_hermes_2_mixtral_8x7b_dpo,HFv2 BBH,37.11,,hf_open_llm_v2_240829.csv nous_hermes_2_mixtral_8x7b_dpo,HFv2 GPQA,9.51,,hf_open_llm_v2_240829.csv nous_hermes_2_mixtral_8x7b_dpo,HFv2 IFEval,58.97,,hf_open_llm_v2_240829.csv nous_hermes_2_mixtral_8x7b_dpo,HFv2 MMLU Pro,29.62,,hf_open_llm_v2_240829.csv nous_hermes_2_mixtral_8x7b_dpo,HFv2 Math Level 5,10.88,,hf_open_llm_v2_240829.csv nous_hermes_2_mixtral_8x7b_dpo,HFv2 MuSR,16.68,,hf_open_llm_v2_240829.csv nous_hermes_2_mixtral_8x7b_sft,HF OpenLLM v2,21.78,,hf_open_llm_v2_240829.csv nous_hermes_2_mixtral_8x7b_sft,HFv2 BBH,30.59,,hf_open_llm_v2_240829.csv nous_hermes_2_mixtral_8x7b_sft,HFv2 GPQA,6.94,,hf_open_llm_v2_240829.csv nous_hermes_2_mixtral_8x7b_sft,HFv2 IFEval,57.31,,hf_open_llm_v2_240829.csv nous_hermes_2_mixtral_8x7b_sft,HFv2 MMLU Pro,22.96,,hf_open_llm_v2_240829.csv 
nous_hermes_2_mixtral_8x7b_sft,HFv2 Math Level 5,1.74,,hf_open_llm_v2_240829.csv nous_hermes_2_mixtral_8x7b_sft,HFv2 MuSR,11.14,,hf_open_llm_v2_240829.csv nous_hermes_2_solar_10_7b,HF OpenLLM v2,23.32,,hf_open_llm_v2_240829.csv nous_hermes_2_solar_10_7b,HFv2 BBH,34.99,,hf_open_llm_v2_240829.csv nous_hermes_2_solar_10_7b,HFv2 GPQA,5.82,,hf_open_llm_v2_240829.csv nous_hermes_2_solar_10_7b,HFv2 IFEval,52.79,,hf_open_llm_v2_240829.csv nous_hermes_2_solar_10_7b,HFv2 MMLU Pro,27.31,,hf_open_llm_v2_240829.csv nous_hermes_2_solar_10_7b,HFv2 Math Level 5,5.21,,hf_open_llm_v2_240829.csv nous_hermes_2_solar_10_7b,HFv2 MuSR,13.83,,hf_open_llm_v2_240829.csv nucleus_22b_token_500b,HF OpenLLM v2,1.63,,hf_open_llm_v2_240829.csv nucleus_22b_token_500b,HFv2 BBH,1.89,,hf_open_llm_v2_240829.csv nucleus_22b_token_500b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv nucleus_22b_token_500b,HFv2 IFEval,2.57,,hf_open_llm_v2_240829.csv nucleus_22b_token_500b,HFv2 MMLU Pro,1.8,,hf_open_llm_v2_240829.csv nucleus_22b_token_500b,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv nucleus_22b_token_500b,HFv2 MuSR,3.55,,hf_open_llm_v2_240829.csv nxcode_cq_7b_orpo,HF OpenLLM v2,12.3,,hf_open_llm_v2_240829.csv nxcode_cq_7b_orpo,HFv2 BBH,17.58,,hf_open_llm_v2_240829.csv nxcode_cq_7b_orpo,HFv2 GPQA,0.56,,hf_open_llm_v2_240829.csv nxcode_cq_7b_orpo,HFv2 IFEval,40.07,,hf_open_llm_v2_240829.csv nxcode_cq_7b_orpo,HFv2 MMLU Pro,6.79,,hf_open_llm_v2_240829.csv nxcode_cq_7b_orpo,HFv2 Math Level 5,1.74,,hf_open_llm_v2_240829.csv nxcode_cq_7b_orpo,HFv2 MuSR,7.05,,hf_open_llm_v2_240829.csv olmo_1b,HF OpenLLM v2,6.47,,hf_open_llm_v2_240829.csv olmo_1b,HFv2 BBH,3.2,,hf_open_llm_v2_240829.csv olmo_1b,HFv2 GPQA,1.57,,hf_open_llm_v2_240829.csv olmo_1b,HFv2 IFEval,21.82,,hf_open_llm_v2_240829.csv olmo_1b,HFv2 MMLU Pro,1.93,,hf_open_llm_v2_240829.csv olmo_1b,HFv2 Math Level 5,0.76,,hf_open_llm_v2_240829.csv olmo_1b,HFv2 MuSR,9.56,,hf_open_llm_v2_240829.csv olmo_7b,HF OpenLLM v2,6.78,,hf_open_llm_v2_240829.csv olmo_7b,HFv2 BBH,5.76,,hf_open_llm_v2_240829.csv olmo_7b,HFv2 GPQA,3.02,,hf_open_llm_v2_240829.csv olmo_7b,HFv2 IFEval,27.19,,hf_open_llm_v2_240829.csv olmo_7b,HFv2 MMLU Pro,1.92,,hf_open_llm_v2_240829.csv olmo_7b,HFv2 Math Level 5,0.68,,hf_open_llm_v2_240829.csv olmo_7b,HFv2 MuSR,2.08,,hf_open_llm_v2_240829.csv olmo_7b_instruct,HF OpenLLM v2,10.76,,hf_open_llm_v2_240829.csv olmo_7b_instruct,HFv2 BBH,13.16,,hf_open_llm_v2_240829.csv olmo_7b_instruct,HFv2 GPQA,2.8,,hf_open_llm_v2_240829.csv olmo_7b_instruct,HFv2 IFEval,34.73,,hf_open_llm_v2_240829.csv olmo_7b_instruct,HFv2 MMLU Pro,8.72,,hf_open_llm_v2_240829.csv olmo_7b_instruct,HFv2 Math Level 5,0.83,,hf_open_llm_v2_240829.csv olmo_7b_instruct,HFv2 MuSR,4.33,,hf_open_llm_v2_240829.csv openbuddy_llama3_1_8b_v22_2_131k,HF OpenLLM v2,24.07,,hf_open_llm_v2_240829.csv openbuddy_llama3_1_8b_v22_2_131k,HFv2 BBH,29.06,,hf_open_llm_v2_240829.csv openbuddy_llama3_1_8b_v22_2_131k,HFv2 GPQA,3.91,,hf_open_llm_v2_240829.csv openbuddy_llama3_1_8b_v22_2_131k,HFv2 IFEval,66.57,,hf_open_llm_v2_240829.csv openbuddy_llama3_1_8b_v22_2_131k,HFv2 MMLU Pro,25.67,,hf_open_llm_v2_240829.csv openbuddy_llama3_1_8b_v22_2_131k,HFv2 Math Level 5,9.37,,hf_open_llm_v2_240829.csv openbuddy_llama3_1_8b_v22_2_131k,HFv2 MuSR,9.81,,hf_open_llm_v2_240829.csv openbuddy_llama3_8b_v21_1_8k,HF OpenLLM v2,19.9,,hf_open_llm_v2_240829.csv openbuddy_llama3_8b_v21_1_8k,HFv2 BBH,26.12,,hf_open_llm_v2_240829.csv openbuddy_llama3_8b_v21_1_8k,HFv2 GPQA,2.8,,hf_open_llm_v2_240829.csv openbuddy_llama3_8b_v21_1_8k,HFv2 
IFEval,55.7,,hf_open_llm_v2_240829.csv openbuddy_llama3_8b_v21_1_8k,HFv2 MMLU Pro,21.72,,hf_open_llm_v2_240829.csv openbuddy_llama3_8b_v21_1_8k,HFv2 Math Level 5,2.72,,hf_open_llm_v2_240829.csv openbuddy_llama3_8b_v21_1_8k,HFv2 MuSR,10.35,,hf_open_llm_v2_240829.csv openbuddy_llama3_8b_v21_2_32k,HF OpenLLM v2,21.84,,hf_open_llm_v2_240829.csv openbuddy_llama3_8b_v21_2_32k,HFv2 BBH,27.25,,hf_open_llm_v2_240829.csv openbuddy_llama3_8b_v21_2_32k,HFv2 GPQA,3.91,,hf_open_llm_v2_240829.csv openbuddy_llama3_8b_v21_2_32k,HFv2 IFEval,61.92,,hf_open_llm_v2_240829.csv openbuddy_llama3_8b_v21_2_32k,HFv2 MMLU Pro,25.54,,hf_open_llm_v2_240829.csv openbuddy_llama3_8b_v21_2_32k,HFv2 Math Level 5,6.5,,hf_open_llm_v2_240829.csv openbuddy_llama3_8b_v21_2_32k,HFv2 MuSR,5.93,,hf_open_llm_v2_240829.csv openbuddy_mixtral_7bx8_v18_1_32k,HF OpenLLM v2,22.12,,hf_open_llm_v2_240829.csv openbuddy_mixtral_7bx8_v18_1_32k,HFv2 BBH,24.54,,hf_open_llm_v2_240829.csv openbuddy_mixtral_7bx8_v18_1_32k,HFv2 GPQA,7.27,,hf_open_llm_v2_240829.csv openbuddy_mixtral_7bx8_v18_1_32k,HFv2 IFEval,54.93,,hf_open_llm_v2_240829.csv openbuddy_mixtral_7bx8_v18_1_32k,HFv2 MMLU Pro,31.16,,hf_open_llm_v2_240829.csv openbuddy_mixtral_7bx8_v18_1_32k,HFv2 Math Level 5,9.52,,hf_open_llm_v2_240829.csv openbuddy_mixtral_7bx8_v18_1_32k,HFv2 MuSR,5.28,,hf_open_llm_v2_240829.csv openbuddy_zero_14b_v22_3_32k,HF OpenLLM v2,19.14,,hf_open_llm_v2_240829.csv openbuddy_zero_14b_v22_3_32k,HFv2 BBH,26.29,,hf_open_llm_v2_240829.csv openbuddy_zero_14b_v22_3_32k,HFv2 GPQA,7.61,,hf_open_llm_v2_240829.csv openbuddy_zero_14b_v22_3_32k,HFv2 IFEval,37.53,,hf_open_llm_v2_240829.csv openbuddy_zero_14b_v22_3_32k,HFv2 MMLU Pro,24.3,,hf_open_llm_v2_240829.csv openbuddy_zero_14b_v22_3_32k,HFv2 Math Level 5,7.78,,hf_open_llm_v2_240829.csv openbuddy_zero_14b_v22_3_32k,HFv2 MuSR,11.34,,hf_open_llm_v2_240829.csv openbuddy_zero_3b_v21_2_32k,HF OpenLLM v2,11.55,,hf_open_llm_v2_240829.csv openbuddy_zero_3b_v21_2_32k,HFv2 BBH,15.29,,hf_open_llm_v2_240829.csv openbuddy_zero_3b_v21_2_32k,HFv2 GPQA,1.34,,hf_open_llm_v2_240829.csv openbuddy_zero_3b_v21_2_32k,HFv2 IFEval,38.02,,hf_open_llm_v2_240829.csv openbuddy_zero_3b_v21_2_32k,HFv2 MMLU Pro,11.49,,hf_open_llm_v2_240829.csv openbuddy_zero_3b_v21_2_32k,HFv2 Math Level 5,0.91,,hf_open_llm_v2_240829.csv openbuddy_zero_3b_v21_2_32k,HFv2 MuSR,2.25,,hf_open_llm_v2_240829.csv openchat_3_5,HF OpenLLM v2,21.52,,hf_open_llm_v2_240829.csv openchat_3_5,HFv2 BBH,21.58,,hf_open_llm_v2_240829.csv openchat_3_5,HFv2 GPQA,6.49,,hf_open_llm_v2_240829.csv openchat_3_5,HFv2 IFEval,59.31,,hf_open_llm_v2_240829.csv openchat_3_5,HFv2 MMLU Pro,23.93,,hf_open_llm_v2_240829.csv openchat_3_5,HFv2 Math Level 5,6.57,,hf_open_llm_v2_240829.csv openchat_3_5,HFv2 MuSR,11.26,,hf_open_llm_v2_240829.csv openchat_3_5_1210,HF OpenLLM v2,22.56,,hf_open_llm_v2_240829.csv openchat_3_5_1210,HFv2 BBH,23.24,,hf_open_llm_v2_240829.csv openchat_3_5_1210,HFv2 GPQA,6.82,,hf_open_llm_v2_240829.csv openchat_3_5_1210,HFv2 IFEval,60.37,,hf_open_llm_v2_240829.csv openchat_3_5_1210,HFv2 MMLU Pro,23.81,,hf_open_llm_v2_240829.csv openchat_3_5_1210,HFv2 Math Level 5,6.87,,hf_open_llm_v2_240829.csv openchat_3_5_1210,HFv2 MuSR,14.28,,hf_open_llm_v2_240829.csv openhermes_2_5_mistral_7b,HF OpenLLM v2,21.22,,hf_open_llm_v2_240829.csv openhermes_2_5_mistral_7b,HFv2 BBH,27.77,,hf_open_llm_v2_240829.csv openhermes_2_5_mistral_7b,HFv2 GPQA,4.47,,hf_open_llm_v2_240829.csv openhermes_2_5_mistral_7b,HFv2 IFEval,55.71,,hf_open_llm_v2_240829.csv openhermes_2_5_mistral_7b,HFv2 MMLU 
Pro,22.83,,hf_open_llm_v2_240829.csv openhermes_2_5_mistral_7b,HFv2 Math Level 5,4.46,,hf_open_llm_v2_240829.csv openhermes_2_5_mistral_7b,HFv2 MuSR,12.06,,hf_open_llm_v2_240829.csv openhermes_2_mistral_7b,HF OpenLLM v2,21.33,,hf_open_llm_v2_240829.csv openhermes_2_mistral_7b,HFv2 BBH,29.25,,hf_open_llm_v2_240829.csv openhermes_2_mistral_7b,HFv2 GPQA,4.47,,hf_open_llm_v2_240829.csv openhermes_2_mistral_7b,HFv2 IFEval,52.86,,hf_open_llm_v2_240829.csv openhermes_2_mistral_7b,HFv2 MMLU Pro,21.46,,hf_open_llm_v2_240829.csv openhermes_2_mistral_7b,HFv2 Math Level 5,3.85,,hf_open_llm_v2_240829.csv openhermes_2_mistral_7b,HFv2 MuSR,16.06,,hf_open_llm_v2_240829.csv opt_1_3b,HF OpenLLM v2,5.25,,hf_open_llm_v2_240829.csv opt_1_3b,HFv2 BBH,3.65,,hf_open_llm_v2_240829.csv opt_1_3b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv opt_1_3b,HFv2 IFEval,23.83,,hf_open_llm_v2_240829.csv opt_1_3b,HFv2 MMLU Pro,1.19,,hf_open_llm_v2_240829.csv opt_1_3b,HFv2 Math Level 5,0.76,,hf_open_llm_v2_240829.csv opt_1_3b,HFv2 MuSR,2.08,,hf_open_llm_v2_240829.csv opt_30b,HF OpenLLM v2,6.2,,hf_open_llm_v2_240829.csv opt_30b,HFv2 BBH,3.5,,hf_open_llm_v2_240829.csv opt_30b,HFv2 GPQA,2.57,,hf_open_llm_v2_240829.csv opt_30b,HFv2 IFEval,24.53,,hf_open_llm_v2_240829.csv opt_30b,HFv2 MMLU Pro,1.82,,hf_open_llm_v2_240829.csv opt_30b,HFv2 Math Level 5,0.6,,hf_open_llm_v2_240829.csv opt_30b,HFv2 MuSR,4.19,,hf_open_llm_v2_240829.csv orpollama3_8b,HF OpenLLM v2,14.87,,hf_open_llm_v2_240829.csv orpollama3_8b,HFv2 BBH,21.95,,hf_open_llm_v2_240829.csv orpollama3_8b,HFv2 GPQA,3.91,,hf_open_llm_v2_240829.csv orpollama3_8b,HFv2 IFEval,36.53,,hf_open_llm_v2_240829.csv orpollama3_8b,HFv2 MMLU Pro,18.95,,hf_open_llm_v2_240829.csv orpollama3_8b,HFv2 Math Level 5,3.85,,hf_open_llm_v2_240829.csv orpollama3_8b,HFv2 MuSR,4.01,,hf_open_llm_v2_240829.csv phi_1,HF OpenLLM v2,5.52,,hf_open_llm_v2_240829.csv phi_1,HFv2 BBH,4.27,,hf_open_llm_v2_240829.csv phi_1,HFv2 GPQA,2.01,,hf_open_llm_v2_240829.csv phi_1,HFv2 IFEval,20.68,,hf_open_llm_v2_240829.csv phi_1,HFv2 MMLU Pro,1.8,,hf_open_llm_v2_240829.csv phi_1,HFv2 Math Level 5,0.68,,hf_open_llm_v2_240829.csv phi_1,HFv2 MuSR,3.7,,hf_open_llm_v2_240829.csv phi_1_5,HF OpenLLM v2,7.06,,hf_open_llm_v2_240829.csv phi_1_5,HFv2 BBH,7.47,,hf_open_llm_v2_240829.csv phi_1_5,HFv2 GPQA,2.35,,hf_open_llm_v2_240829.csv phi_1_5,HFv2 IFEval,20.33,,hf_open_llm_v2_240829.csv phi_1_5,HFv2 MMLU Pro,7.68,,hf_open_llm_v2_240829.csv phi_1_5,HFv2 Math Level 5,1.13,,hf_open_llm_v2_240829.csv phi_1_5,HFv2 MuSR,3.39,,hf_open_llm_v2_240829.csv phi_1_5_instruct_v0_1,HF OpenLLM v2,6.64,,hf_open_llm_v2_240829.csv phi_1_5_instruct_v0_1,HFv2 BBH,4.82,,hf_open_llm_v2_240829.csv phi_1_5_instruct_v0_1,HFv2 GPQA,1.34,,hf_open_llm_v2_240829.csv phi_1_5_instruct_v0_1,HFv2 IFEval,24.02,,hf_open_llm_v2_240829.csv phi_1_5_instruct_v0_1,HFv2 MMLU Pro,6.24,,hf_open_llm_v2_240829.csv phi_1_5_instruct_v0_1,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv phi_1_5_instruct_v0_1,HFv2 MuSR,3.4,,hf_open_llm_v2_240829.csv phi_2,HF OpenLLM v2,15.45,,hf_open_llm_v2_240829.csv phi_2,HFv2 BBH,28.04,,hf_open_llm_v2_240829.csv phi_2,HFv2 GPQA,2.91,,hf_open_llm_v2_240829.csv phi_2,HFv2 IFEval,27.39,,hf_open_llm_v2_240829.csv phi_2,HFv2 MMLU Pro,18.09,,hf_open_llm_v2_240829.csv phi_2,HFv2 Math Level 5,2.42,,hf_open_llm_v2_240829.csv phi_2,HFv2 MuSR,13.84,,hf_open_llm_v2_240829.csv phi_2_instruct_v0_1,HF OpenLLM v2,14.22,,hf_open_llm_v2_240829.csv phi_2_instruct_v0_1,HFv2 BBH,26.36,,hf_open_llm_v2_240829.csv phi_2_instruct_v0_1,HFv2 GPQA,3.24,,hf_open_llm_v2_240829.csv 
phi_2_instruct_v0_1,HFv2 IFEval,36.81,,hf_open_llm_v2_240829.csv phi_2_instruct_v0_1,HFv2 MMLU Pro,13.85,,hf_open_llm_v2_240829.csv phi_2_instruct_v0_1,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv phi_2_instruct_v0_1,HFv2 MuSR,5.04,,hf_open_llm_v2_240829.csv phi_3_5_mini_instruct,HF OpenLLM v2,27.4,,hf_open_llm_v2_240829.csv phi_3_5_mini_instruct,HFv2 BBH,36.75,,hf_open_llm_v2_240829.csv phi_3_5_mini_instruct,HFv2 GPQA,11.97,,hf_open_llm_v2_240829.csv phi_3_5_mini_instruct,HFv2 IFEval,57.75,,hf_open_llm_v2_240829.csv phi_3_5_mini_instruct,HFv2 MMLU Pro,32.91,,hf_open_llm_v2_240829.csv phi_3_5_mini_instruct,HFv2 Math Level 5,14.95,,hf_open_llm_v2_240829.csv phi_3_5_mini_instruct,HFv2 MuSR,10.1,,hf_open_llm_v2_240829.csv phi_3_5_moe_instruct,HF OpenLLM v2,35.1,,hf_open_llm_v2_240829.csv phi_3_5_moe_instruct,HFv2 BBH,48.77,,hf_open_llm_v2_240829.csv phi_3_5_moe_instruct,HFv2 GPQA,14.09,,hf_open_llm_v2_240829.csv phi_3_5_moe_instruct,HFv2 IFEval,69.25,,hf_open_llm_v2_240829.csv phi_3_5_moe_instruct,HFv2 MMLU Pro,40.64,,hf_open_llm_v2_240829.csv phi_3_5_moe_instruct,HFv2 Math Level 5,20.54,,hf_open_llm_v2_240829.csv phi_3_5_moe_instruct,HFv2 MuSR,17.33,,hf_open_llm_v2_240829.csv phi_3_medium_4k_instruct,HF OpenLLM v2,32.67,,hf_open_llm_v2_240829.csv phi_3_medium_4k_instruct,HFv2 BBH,49.38,,hf_open_llm_v2_240829.csv phi_3_medium_4k_instruct,HFv2 GPQA,11.52,,hf_open_llm_v2_240829.csv phi_3_medium_4k_instruct,HFv2 IFEval,64.23,,hf_open_llm_v2_240829.csv phi_3_medium_4k_instruct,HFv2 MMLU Pro,40.84,,hf_open_llm_v2_240829.csv phi_3_medium_4k_instruct,HFv2 Math Level 5,16.99,,hf_open_llm_v2_240829.csv phi_3_medium_4k_instruct,HFv2 MuSR,13.05,,hf_open_llm_v2_240829.csv phi_3_mini_128k_instruct,HF OpenLLM v2,25.49,,hf_open_llm_v2_240829.csv phi_3_mini_128k_instruct,HFv2 BBH,37.1,,hf_open_llm_v2_240829.csv phi_3_mini_128k_instruct,HFv2 GPQA,9.06,,hf_open_llm_v2_240829.csv phi_3_mini_128k_instruct,HFv2 IFEval,59.76,,hf_open_llm_v2_240829.csv phi_3_mini_128k_instruct,HFv2 MMLU Pro,30.38,,hf_open_llm_v2_240829.csv phi_3_mini_128k_instruct,HFv2 Math Level 5,8.91,,hf_open_llm_v2_240829.csv phi_3_mini_128k_instruct,HFv2 MuSR,7.71,,hf_open_llm_v2_240829.csv phi_3_mini_4k_instruct,HF OpenLLM v2,27.2,,hf_open_llm_v2_240829.csv phi_3_mini_4k_instruct,HFv2 BBH,39.27,,hf_open_llm_v2_240829.csv phi_3_mini_4k_instruct,HFv2 GPQA,10.96,,hf_open_llm_v2_240829.csv phi_3_mini_4k_instruct,HFv2 IFEval,56.13,,hf_open_llm_v2_240829.csv phi_3_mini_4k_instruct,HFv2 MMLU Pro,33.58,,hf_open_llm_v2_240829.csv phi_3_mini_4k_instruct,HFv2 Math Level 5,14.2,,hf_open_llm_v2_240829.csv phi_3_mini_4k_instruct,HFv2 MuSR,13.12,,hf_open_llm_v2_240829.csv phi_3_mini_4k_instruct_cpo_simpo,HF OpenLLM v2,25.87,,hf_open_llm_v2_240829.csv phi_3_mini_4k_instruct_cpo_simpo,HFv2 BBH,39.15,,hf_open_llm_v2_240829.csv phi_3_mini_4k_instruct_cpo_simpo,HFv2 GPQA,10.74,,hf_open_llm_v2_240829.csv phi_3_mini_4k_instruct_cpo_simpo,HFv2 IFEval,57.14,,hf_open_llm_v2_240829.csv phi_3_mini_4k_instruct_cpo_simpo,HFv2 MMLU Pro,31.78,,hf_open_llm_v2_240829.csv phi_3_mini_4k_instruct_cpo_simpo,HFv2 Math Level 5,7.63,,hf_open_llm_v2_240829.csv phi_3_mini_4k_instruct_cpo_simpo,HFv2 MuSR,8.78,,hf_open_llm_v2_240829.csv phi_3_small_128k_instruct,HF OpenLLM v2,28.59,,hf_open_llm_v2_240829.csv phi_3_small_128k_instruct,HFv2 BBH,45.63,,hf_open_llm_v2_240829.csv phi_3_small_128k_instruct,HFv2 GPQA,8.95,,hf_open_llm_v2_240829.csv phi_3_small_128k_instruct,HFv2 IFEval,63.68,,hf_open_llm_v2_240829.csv phi_3_small_128k_instruct,HFv2 MMLU 
Pro,38.78,,hf_open_llm_v2_240829.csv phi_3_small_128k_instruct,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv phi_3_small_128k_instruct,HFv2 MuSR,14.5,,hf_open_llm_v2_240829.csv pythia_12b,HF OpenLLM v2,5.93,,hf_open_llm_v2_240829.csv pythia_12b,HFv2 BBH,4.99,,hf_open_llm_v2_240829.csv pythia_12b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv pythia_12b,HFv2 IFEval,24.71,,hf_open_llm_v2_240829.csv pythia_12b,HFv2 MMLU Pro,1.21,,hf_open_llm_v2_240829.csv pythia_12b,HFv2 Math Level 5,0.91,,hf_open_llm_v2_240829.csv pythia_12b,HFv2 MuSR,3.79,,hf_open_llm_v2_240829.csv pythia_160m,HF OpenLLM v2,5.62,,hf_open_llm_v2_240829.csv pythia_160m,HFv2 BBH,2.2,,hf_open_llm_v2_240829.csv pythia_160m,HFv2 GPQA,1.12,,hf_open_llm_v2_240829.csv pythia_160m,HFv2 IFEval,18.16,,hf_open_llm_v2_240829.csv pythia_160m,HFv2 MMLU Pro,1.33,,hf_open_llm_v2_240829.csv pythia_160m,HFv2 Math Level 5,0.23,,hf_open_llm_v2_240829.csv pythia_160m,HFv2 MuSR,10.68,,hf_open_llm_v2_240829.csv pythia_2_8b,HF OpenLLM v2,5.44,,hf_open_llm_v2_240829.csv pythia_2_8b,HFv2 BBH,5.08,,hf_open_llm_v2_240829.csv pythia_2_8b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv pythia_2_8b,HFv2 IFEval,21.73,,hf_open_llm_v2_240829.csv pythia_2_8b,HFv2 MMLU Pro,1.52,,hf_open_llm_v2_240829.csv pythia_2_8b,HFv2 Math Level 5,0.68,,hf_open_llm_v2_240829.csv pythia_2_8b,HFv2 MuSR,3.64,,hf_open_llm_v2_240829.csv pythia_410m,HF OpenLLM v2,5.11,,hf_open_llm_v2_240829.csv pythia_410m,HFv2 BBH,2.72,,hf_open_llm_v2_240829.csv pythia_410m,HFv2 GPQA,1.23,,hf_open_llm_v2_240829.csv pythia_410m,HFv2 IFEval,21.95,,hf_open_llm_v2_240829.csv pythia_410m,HFv2 MMLU Pro,1.42,,hf_open_llm_v2_240829.csv pythia_410m,HFv2 Math Level 5,0.3,,hf_open_llm_v2_240829.csv pythia_410m,HFv2 MuSR,3.06,,hf_open_llm_v2_240829.csv pythia_410m_roberta_lr_8e7_kl_01_steps_12000_rlhf_model,HF OpenLLM v2,3.82,,hf_open_llm_v2_240829.csv pythia_410m_roberta_lr_8e7_kl_01_steps_12000_rlhf_model,HFv2 BBH,1.82,,hf_open_llm_v2_240829.csv pythia_410m_roberta_lr_8e7_kl_01_steps_12000_rlhf_model,HFv2 GPQA,1.23,,hf_open_llm_v2_240829.csv pythia_410m_roberta_lr_8e7_kl_01_steps_12000_rlhf_model,HFv2 IFEval,15.72,,hf_open_llm_v2_240829.csv pythia_410m_roberta_lr_8e7_kl_01_steps_12000_rlhf_model,HFv2 MMLU Pro,1.87,,hf_open_llm_v2_240829.csv pythia_410m_roberta_lr_8e7_kl_01_steps_12000_rlhf_model,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv pythia_410m_roberta_lr_8e7_kl_01_steps_12000_rlhf_model,HFv2 MuSR,2.25,,hf_open_llm_v2_240829.csv pythia_6_9b,HF OpenLLM v2,5.85,,hf_open_llm_v2_240829.csv pythia_6_9b,HFv2 BBH,5.88,,hf_open_llm_v2_240829.csv pythia_6_9b,HFv2 GPQA,0.22,,hf_open_llm_v2_240829.csv pythia_6_9b,HFv2 IFEval,22.81,,hf_open_llm_v2_240829.csv pythia_6_9b,HFv2 MMLU Pro,1.63,,hf_open_llm_v2_240829.csv pythia_6_9b,HFv2 Math Level 5,0.76,,hf_open_llm_v2_240829.csv pythia_6_9b,HFv2 MuSR,3.81,,hf_open_llm_v2_240829.csv qwen1_5_0_5b,HF OpenLLM v2,5.14,,hf_open_llm_v2_240829.csv qwen1_5_0_5b,HFv2 BBH,5.04,,hf_open_llm_v2_240829.csv qwen1_5_0_5b,HFv2 GPQA,0.56,,hf_open_llm_v2_240829.csv qwen1_5_0_5b,HFv2 IFEval,17.06,,hf_open_llm_v2_240829.csv qwen1_5_0_5b,HFv2 MMLU Pro,3.41,,hf_open_llm_v2_240829.csv qwen1_5_0_5b,HFv2 Math Level 5,0.45,,hf_open_llm_v2_240829.csv qwen1_5_0_5b,HFv2 MuSR,4.3,,hf_open_llm_v2_240829.csv qwen1_5_0_5b_chat,HF OpenLLM v2,5.56,,hf_open_llm_v2_240829.csv qwen1_5_0_5b_chat,HFv2 BBH,4.32,,hf_open_llm_v2_240829.csv qwen1_5_0_5b_chat,HFv2 GPQA,2.57,,hf_open_llm_v2_240829.csv qwen1_5_0_5b_chat,HFv2 IFEval,18.07,,hf_open_llm_v2_240829.csv qwen1_5_0_5b_chat,HFv2 MMLU 
Pro,2.36,,hf_open_llm_v2_240829.csv qwen1_5_0_5b_chat,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv qwen1_5_0_5b_chat,HFv2 MuSR,6.06,,hf_open_llm_v2_240829.csv qwen1_5_110b,HF OpenLLM v2,29.56,,hf_open_llm_v2_240829.csv qwen1_5_110b,HFv2 BBH,44.28,,hf_open_llm_v2_240829.csv qwen1_5_110b,HFv2 GPQA,13.65,,hf_open_llm_v2_240829.csv qwen1_5_110b,HFv2 IFEval,34.22,,hf_open_llm_v2_240829.csv qwen1_5_110b,HFv2 MMLU Pro,48.45,,hf_open_llm_v2_240829.csv qwen1_5_110b,HFv2 Math Level 5,23.04,,hf_open_llm_v2_240829.csv qwen1_5_110b,HFv2 MuSR,13.71,,hf_open_llm_v2_240829.csv qwen1_5_110b_chat,HF OpenLLM v2,29.22,,hf_open_llm_v2_240829.csv qwen1_5_110b_chat,HFv2 BBH,44.98,,hf_open_llm_v2_240829.csv qwen1_5_110b_chat,HFv2 GPQA,12.19,,hf_open_llm_v2_240829.csv qwen1_5_110b_chat,HFv2 IFEval,59.39,,hf_open_llm_v2_240829.csv qwen1_5_110b_chat,HFv2 MMLU Pro,42.5,,hf_open_llm_v2_240829.csv qwen1_5_110b_chat,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv qwen1_5_110b_chat,HFv2 MuSR,16.29,,hf_open_llm_v2_240829.csv qwen1_5_14b,HF OpenLLM v2,20.22,,hf_open_llm_v2_240829.csv qwen1_5_14b,HFv2 BBH,30.06,,hf_open_llm_v2_240829.csv qwen1_5_14b,HFv2 GPQA,5.93,,hf_open_llm_v2_240829.csv qwen1_5_14b,HFv2 IFEval,29.05,,hf_open_llm_v2_240829.csv qwen1_5_14b,HFv2 MMLU Pro,29.37,,hf_open_llm_v2_240829.csv qwen1_5_14b,HFv2 Math Level 5,16.47,,hf_open_llm_v2_240829.csv qwen1_5_14b,HFv2 MuSR,10.46,,hf_open_llm_v2_240829.csv qwen1_5_14b_chat,HF OpenLLM v2,21.02,,hf_open_llm_v2_240829.csv qwen1_5_14b_chat,HFv2 BBH,32.76,,hf_open_llm_v2_240829.csv qwen1_5_14b_chat,HFv2 GPQA,2.68,,hf_open_llm_v2_240829.csv qwen1_5_14b_chat,HFv2 IFEval,47.68,,hf_open_llm_v2_240829.csv qwen1_5_14b_chat,HFv2 MMLU Pro,29.09,,hf_open_llm_v2_240829.csv qwen1_5_14b_chat,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv qwen1_5_14b_chat,HFv2 MuSR,13.93,,hf_open_llm_v2_240829.csv qwen1_5_1_8b,HF OpenLLM v2,9.12,,hf_open_llm_v2_240829.csv qwen1_5_1_8b,HFv2 BBH,9.76,,hf_open_llm_v2_240829.csv qwen1_5_1_8b,HFv2 GPQA,7.38,,hf_open_llm_v2_240829.csv qwen1_5_1_8b,HFv2 IFEval,21.54,,hf_open_llm_v2_240829.csv qwen1_5_1_8b,HFv2 MMLU Pro,9.8,,hf_open_llm_v2_240829.csv qwen1_5_1_8b,HFv2 Math Level 5,2.27,,hf_open_llm_v2_240829.csv qwen1_5_1_8b,HFv2 MuSR,3.96,,hf_open_llm_v2_240829.csv qwen1_5_1_8b_chat,HF OpenLLM v2,9.01,,hf_open_llm_v2_240829.csv qwen1_5_1_8b_chat,HFv2 BBH,5.91,,hf_open_llm_v2_240829.csv qwen1_5_1_8b_chat,HFv2 GPQA,6.38,,hf_open_llm_v2_240829.csv qwen1_5_1_8b_chat,HFv2 IFEval,20.19,,hf_open_llm_v2_240829.csv qwen1_5_1_8b_chat,HFv2 MMLU Pro,8.93,,hf_open_llm_v2_240829.csv qwen1_5_1_8b_chat,HFv2 Math Level 5,0.45,,hf_open_llm_v2_240829.csv qwen1_5_1_8b_chat,HFv2 MuSR,12.18,,hf_open_llm_v2_240829.csv qwen1_5_32b,HF OpenLLM v2,26.69,,hf_open_llm_v2_240829.csv qwen1_5_32b,HFv2 BBH,38.98,,hf_open_llm_v2_240829.csv qwen1_5_32b,HFv2 GPQA,10.63,,hf_open_llm_v2_240829.csv qwen1_5_32b,HFv2 IFEval,32.97,,hf_open_llm_v2_240829.csv qwen1_5_32b,HFv2 MMLU Pro,38.89,,hf_open_llm_v2_240829.csv qwen1_5_32b,HFv2 Math Level 5,26.66,,hf_open_llm_v2_240829.csv qwen1_5_32b,HFv2 MuSR,12.04,,hf_open_llm_v2_240829.csv qwen1_5_32b_chat,HF OpenLLM v2,27.1,,hf_open_llm_v2_240829.csv qwen1_5_32b_chat,HFv2 BBH,44.55,,hf_open_llm_v2_240829.csv qwen1_5_32b_chat,HFv2 GPQA,7.49,,hf_open_llm_v2_240829.csv qwen1_5_32b_chat,HFv2 IFEval,55.32,,hf_open_llm_v2_240829.csv qwen1_5_32b_chat,HFv2 MMLU Pro,38.41,,hf_open_llm_v2_240829.csv qwen1_5_32b_chat,HFv2 Math Level 5,6.65,,hf_open_llm_v2_240829.csv qwen1_5_32b_chat,HFv2 MuSR,10.2,,hf_open_llm_v2_240829.csv qwen1_5_4b,HF OpenLLM 
v2,11.29,,hf_open_llm_v2_240829.csv qwen1_5_4b,HFv2 BBH,16.25,,hf_open_llm_v2_240829.csv qwen1_5_4b,HFv2 GPQA,3.58,,hf_open_llm_v2_240829.csv qwen1_5_4b,HFv2 IFEval,24.45,,hf_open_llm_v2_240829.csv qwen1_5_4b,HFv2 MMLU Pro,16.22,,hf_open_llm_v2_240829.csv qwen1_5_4b,HFv2 Math Level 5,2.42,,hf_open_llm_v2_240829.csv qwen1_5_4b,HFv2 MuSR,4.82,,hf_open_llm_v2_240829.csv qwen1_5_4b_chat,HF OpenLLM v2,12.33,,hf_open_llm_v2_240829.csv qwen1_5_4b_chat,HFv2 BBH,16.3,,hf_open_llm_v2_240829.csv qwen1_5_4b_chat,HFv2 GPQA,2.24,,hf_open_llm_v2_240829.csv qwen1_5_4b_chat,HFv2 IFEval,31.57,,hf_open_llm_v2_240829.csv qwen1_5_4b_chat,HFv2 MMLU Pro,15.51,,hf_open_llm_v2_240829.csv qwen1_5_4b_chat,HFv2 Math Level 5,0.98,,hf_open_llm_v2_240829.csv qwen1_5_4b_chat,HFv2 MuSR,7.36,,hf_open_llm_v2_240829.csv qwen1_5_7b,HF OpenLLM v2,15.22,,hf_open_llm_v2_240829.csv qwen1_5_7b,HFv2 BBH,23.08,,hf_open_llm_v2_240829.csv qwen1_5_7b,HFv2 GPQA,6.49,,hf_open_llm_v2_240829.csv qwen1_5_7b,HFv2 IFEval,26.84,,hf_open_llm_v2_240829.csv qwen1_5_7b,HFv2 MMLU Pro,21.29,,hf_open_llm_v2_240829.csv qwen1_5_7b,HFv2 Math Level 5,4.46,,hf_open_llm_v2_240829.csv qwen1_5_7b,HFv2 MuSR,9.16,,hf_open_llm_v2_240829.csv qwen1_5_7b_chat,HF OpenLLM v2,16.58,,hf_open_llm_v2_240829.csv qwen1_5_7b_chat,HFv2 BBH,22.38,,hf_open_llm_v2_240829.csv qwen1_5_7b_chat,HFv2 GPQA,7.05,,hf_open_llm_v2_240829.csv qwen1_5_7b_chat,HFv2 IFEval,43.71,,hf_open_llm_v2_240829.csv qwen1_5_7b_chat,HFv2 MMLU Pro,21.68,,hf_open_llm_v2_240829.csv qwen1_5_7b_chat,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv qwen1_5_7b_chat,HFv2 MuSR,4.64,,hf_open_llm_v2_240829.csv qwen1_5_moe_a2_7b,HF OpenLLM v2,12.42,,hf_open_llm_v2_240829.csv qwen1_5_moe_a2_7b,HFv2 BBH,18.84,,hf_open_llm_v2_240829.csv qwen1_5_moe_a2_7b,HFv2 GPQA,1.23,,hf_open_llm_v2_240829.csv qwen1_5_moe_a2_7b,HFv2 IFEval,26.6,,hf_open_llm_v2_240829.csv qwen1_5_moe_a2_7b,HFv2 MMLU Pro,19.75,,hf_open_llm_v2_240829.csv qwen1_5_moe_a2_7b,HFv2 Math Level 5,0.15,,hf_open_llm_v2_240829.csv qwen1_5_moe_a2_7b,HFv2 MuSR,7.97,,hf_open_llm_v2_240829.csv qwen1_5_moe_a2_7b_chat,HF OpenLLM v2,14.82,,hf_open_llm_v2_240829.csv qwen1_5_moe_a2_7b_chat,HFv2 BBH,20.04,,hf_open_llm_v2_240829.csv qwen1_5_moe_a2_7b_chat,HFv2 GPQA,3.24,,hf_open_llm_v2_240829.csv qwen1_5_moe_a2_7b_chat,HFv2 IFEval,37.95,,hf_open_llm_v2_240829.csv qwen1_5_moe_a2_7b_chat,HFv2 MMLU Pro,21.37,,hf_open_llm_v2_240829.csv qwen1_5_moe_a2_7b_chat,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv qwen1_5_moe_a2_7b_chat,HFv2 MuSR,6.33,,hf_open_llm_v2_240829.csv qwen2_0_5b,HF OpenLLM v2,7.06,,hf_open_llm_v2_240829.csv qwen2_0_5b,HFv2 BBH,7.99,,hf_open_llm_v2_240829.csv qwen2_0_5b,HFv2 GPQA,0.78,,hf_open_llm_v2_240829.csv qwen2_0_5b,HFv2 IFEval,18.67,,hf_open_llm_v2_240829.csv qwen2_0_5b,HFv2 MMLU Pro,7.76,,hf_open_llm_v2_240829.csv qwen2_0_5b,HFv2 Math Level 5,2.57,,hf_open_llm_v2_240829.csv qwen2_0_5b,HFv2 MuSR,4.6,,hf_open_llm_v2_240829.csv qwen2_0_5b_instruct,HF OpenLLM v2,6.39,,hf_open_llm_v2_240829.csv qwen2_0_5b_instruct,HFv2 BBH,5.88,,hf_open_llm_v2_240829.csv qwen2_0_5b_instruct,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv qwen2_0_5b_instruct,HFv2 IFEval,22.47,,hf_open_llm_v2_240829.csv qwen2_0_5b_instruct,HFv2 MMLU Pro,5.9,,hf_open_llm_v2_240829.csv qwen2_0_5b_instruct,HFv2 Math Level 5,1.66,,hf_open_llm_v2_240829.csv qwen2_0_5b_instruct,HFv2 MuSR,2.41,,hf_open_llm_v2_240829.csv qwen2_1_5b,HF OpenLLM v2,10.32,,hf_open_llm_v2_240829.csv qwen2_1_5b,HFv2 BBH,11.78,,hf_open_llm_v2_240829.csv qwen2_1_5b,HFv2 GPQA,1.9,,hf_open_llm_v2_240829.csv qwen2_1_5b,HFv2 
IFEval,21.13,,hf_open_llm_v2_240829.csv qwen2_1_5b,HFv2 MMLU Pro,17.24,,hf_open_llm_v2_240829.csv qwen2_1_5b,HFv2 Math Level 5,6.27,,hf_open_llm_v2_240829.csv qwen2_1_5b,HFv2 MuSR,3.59,,hf_open_llm_v2_240829.csv qwen2_1_5b_instruct,HF OpenLLM v2,13.92,,hf_open_llm_v2_240829.csv qwen2_1_5b_instruct,HFv2 BBH,13.7,,hf_open_llm_v2_240829.csv qwen2_1_5b_instruct,HFv2 GPQA,1.57,,hf_open_llm_v2_240829.csv qwen2_1_5b_instruct,HFv2 IFEval,33.71,,hf_open_llm_v2_240829.csv qwen2_1_5b_instruct,HFv2 MMLU Pro,16.68,,hf_open_llm_v2_240829.csv qwen2_1_5b_instruct,HFv2 Math Level 5,5.82,,hf_open_llm_v2_240829.csv qwen2_1_5b_instruct,HFv2 MuSR,12.03,,hf_open_llm_v2_240829.csv qwen2_57b_a14b,HF OpenLLM v2,25.03,,hf_open_llm_v2_240829.csv qwen2_57b_a14b,HFv2 BBH,38.88,,hf_open_llm_v2_240829.csv qwen2_57b_a14b,HFv2 GPQA,7.49,,hf_open_llm_v2_240829.csv qwen2_57b_a14b,HFv2 IFEval,31.13,,hf_open_llm_v2_240829.csv qwen2_57b_a14b,HFv2 MMLU Pro,43.51,,hf_open_llm_v2_240829.csv qwen2_57b_a14b,HFv2 Math Level 5,18.66,,hf_open_llm_v2_240829.csv qwen2_57b_a14b,HFv2 MuSR,10.54,,hf_open_llm_v2_240829.csv qwen2_57b_a14b_instruct,HF OpenLLM v2,29.6,,hf_open_llm_v2_240829.csv qwen2_57b_a14b_instruct,HFv2 BBH,41.79,,hf_open_llm_v2_240829.csv qwen2_57b_a14b_instruct,HFv2 GPQA,10.85,,hf_open_llm_v2_240829.csv qwen2_57b_a14b_instruct,HFv2 IFEval,63.38,,hf_open_llm_v2_240829.csv qwen2_57b_a14b_instruct,HFv2 MMLU Pro,39.73,,hf_open_llm_v2_240829.csv qwen2_57b_a14b_instruct,HFv2 Math Level 5,7.7,,hf_open_llm_v2_240829.csv qwen2_57b_a14b_instruct,HFv2 MuSR,14.18,,hf_open_llm_v2_240829.csv qwen2_72b,HF OpenLLM v2,35.13,,hf_open_llm_v2_240829.csv qwen2_72b,HFv2 BBH,51.86,,hf_open_llm_v2_240829.csv qwen2_72b,HFv2 GPQA,19.24,,hf_open_llm_v2_240829.csv qwen2_72b,HFv2 IFEval,38.24,,hf_open_llm_v2_240829.csv qwen2_72b,HFv2 MMLU Pro,52.56,,hf_open_llm_v2_240829.csv qwen2_72b,HFv2 Math Level 5,29.15,,hf_open_llm_v2_240829.csv qwen2_72b,HFv2 MuSR,19.73,,hf_open_llm_v2_240829.csv qwen2_72b_instruct,HF OpenLLM v2,42.49,,hf_open_llm_v2_240829.csv qwen2_72b_instruct,HFv2 BBH,57.48,,hf_open_llm_v2_240829.csv qwen2_72b_instruct,HFv2 GPQA,16.33,,hf_open_llm_v2_240829.csv qwen2_72b_instruct,HFv2 IFEval,79.89,,hf_open_llm_v2_240829.csv qwen2_72b_instruct,HFv2 MMLU Pro,48.92,,hf_open_llm_v2_240829.csv qwen2_72b_instruct,HFv2 Math Level 5,35.12,,hf_open_llm_v2_240829.csv qwen2_72b_instruct,HFv2 MuSR,17.17,,hf_open_llm_v2_240829.csv qwen2_7b,HF OpenLLM v2,23.66,,hf_open_llm_v2_240829.csv qwen2_7b,HFv2 BBH,34.71,,hf_open_llm_v2_240829.csv qwen2_7b,HFv2 GPQA,7.27,,hf_open_llm_v2_240829.csv qwen2_7b,HFv2 IFEval,31.49,,hf_open_llm_v2_240829.csv qwen2_7b,HFv2 MMLU Pro,35.37,,hf_open_llm_v2_240829.csv qwen2_7b,HFv2 Math Level 5,18.81,,hf_open_llm_v2_240829.csv qwen2_7b,HFv2 MuSR,14.32,,hf_open_llm_v2_240829.csv qwen2_7b_instruct,HF OpenLLM v2,24.76,,hf_open_llm_v2_240829.csv qwen2_7b_instruct,HFv2 BBH,37.81,,hf_open_llm_v2_240829.csv qwen2_7b_instruct,HFv2 GPQA,6.38,,hf_open_llm_v2_240829.csv qwen2_7b_instruct,HFv2 IFEval,56.79,,hf_open_llm_v2_240829.csv qwen2_7b_instruct,HFv2 MMLU Pro,31.64,,hf_open_llm_v2_240829.csv qwen2_7b_instruct,HFv2 Math Level 5,8.61,,hf_open_llm_v2_240829.csv qwen2_7b_instruct,HFv2 MuSR,7.37,,hf_open_llm_v2_240829.csv qwen2_cantonese_7b_instruct,HF OpenLLM v2,23.5,,hf_open_llm_v2_240829.csv qwen2_cantonese_7b_instruct,HFv2 BBH,32.45,,hf_open_llm_v2_240829.csv qwen2_cantonese_7b_instruct,HFv2 GPQA,6.04,,hf_open_llm_v2_240829.csv qwen2_cantonese_7b_instruct,HFv2 IFEval,54.35,,hf_open_llm_v2_240829.csv qwen2_cantonese_7b_instruct,HFv2 
MMLU Pro,31.59,,hf_open_llm_v2_240829.csv qwen2_cantonese_7b_instruct,HFv2 Math Level 5,8.76,,hf_open_llm_v2_240829.csv qwen2_cantonese_7b_instruct,HFv2 MuSR,7.81,,hf_open_llm_v2_240829.csv recurrentgemma_2b,HF OpenLLM v2,6.94,,hf_open_llm_v2_240829.csv recurrentgemma_2b,HFv2 BBH,4.82,,hf_open_llm_v2_240829.csv recurrentgemma_2b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv recurrentgemma_2b,HFv2 IFEval,30.17,,hf_open_llm_v2_240829.csv recurrentgemma_2b,HFv2 MMLU Pro,1.96,,hf_open_llm_v2_240829.csv recurrentgemma_2b,HFv2 Math Level 5,1.59,,hf_open_llm_v2_240829.csv recurrentgemma_2b,HFv2 MuSR,3.1,,hf_open_llm_v2_240829.csv recurrentgemma_2b_it,HF OpenLLM v2,7.92,,hf_open_llm_v2_240829.csv recurrentgemma_2b_it,HFv2 BBH,7.98,,hf_open_llm_v2_240829.csv recurrentgemma_2b_it,HFv2 GPQA,0.45,,hf_open_llm_v2_240829.csv recurrentgemma_2b_it,HFv2 IFEval,29.49,,hf_open_llm_v2_240829.csv recurrentgemma_2b_it,HFv2 MMLU Pro,4.47,,hf_open_llm_v2_240829.csv recurrentgemma_2b_it,HFv2 Math Level 5,1.51,,hf_open_llm_v2_240829.csv recurrentgemma_2b_it,HFv2 MuSR,3.62,,hf_open_llm_v2_240829.csv recurrentgemma_9b,HF OpenLLM v2,13.5,,hf_open_llm_v2_240829.csv recurrentgemma_9b,HFv2 BBH,15.32,,hf_open_llm_v2_240829.csv recurrentgemma_9b,HFv2 GPQA,4.7,,hf_open_llm_v2_240829.csv recurrentgemma_9b,HFv2 IFEval,31.16,,hf_open_llm_v2_240829.csv recurrentgemma_9b,HFv2 MMLU Pro,17.83,,hf_open_llm_v2_240829.csv recurrentgemma_9b,HFv2 Math Level 5,5.36,,hf_open_llm_v2_240829.csv recurrentgemma_9b,HFv2 MuSR,6.6,,hf_open_llm_v2_240829.csv recurrentgemma_9b_it,HF OpenLLM v2,19.12,,hf_open_llm_v2_240829.csv recurrentgemma_9b_it,HFv2 BBH,21.62,,hf_open_llm_v2_240829.csv recurrentgemma_9b_it,HFv2 GPQA,2.68,,hf_open_llm_v2_240829.csv recurrentgemma_9b_it,HFv2 IFEval,50.1,,hf_open_llm_v2_240829.csv recurrentgemma_9b_it,HFv2 MMLU Pro,20.48,,hf_open_llm_v2_240829.csv recurrentgemma_9b_it,HFv2 Math Level 5,6.04,,hf_open_llm_v2_240829.csv recurrentgemma_9b_it,HFv2 MuSR,13.77,,hf_open_llm_v2_240829.csv redpajama_incite_7b_base,HF OpenLLM v2,5.46,,hf_open_llm_v2_240829.csv redpajama_incite_7b_base,HFv2 BBH,5.09,,hf_open_llm_v2_240829.csv redpajama_incite_7b_base,HFv2 GPQA,0.67,,hf_open_llm_v2_240829.csv redpajama_incite_7b_base,HFv2 IFEval,20.82,,hf_open_llm_v2_240829.csv redpajama_incite_7b_base,HFv2 MMLU Pro,2.19,,hf_open_llm_v2_240829.csv redpajama_incite_7b_base,HFv2 Math Level 5,0.98,,hf_open_llm_v2_240829.csv redpajama_incite_7b_base,HFv2 MuSR,3.02,,hf_open_llm_v2_240829.csv redpajama_incite_base_3b_v1,HF OpenLLM v2,5.43,,hf_open_llm_v2_240829.csv redpajama_incite_base_3b_v1,HFv2 BBH,3.52,,hf_open_llm_v2_240829.csv redpajama_incite_base_3b_v1,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv redpajama_incite_base_3b_v1,HFv2 IFEval,22.94,,hf_open_llm_v2_240829.csv redpajama_incite_base_3b_v1,HFv2 MMLU Pro,1.24,,hf_open_llm_v2_240829.csv redpajama_incite_base_3b_v1,HFv2 Math Level 5,0.91,,hf_open_llm_v2_240829.csv redpajama_incite_base_3b_v1,HFv2 MuSR,4.0,,hf_open_llm_v2_240829.csv rhea_72b_v0_5,HF OpenLLM v2,4.02,,hf_open_llm_v2_240829.csv rhea_72b_v0_5,HFv2 BBH,3.67,,hf_open_llm_v2_240829.csv rhea_72b_v0_5,HFv2 GPQA,0.34,,hf_open_llm_v2_240829.csv rhea_72b_v0_5,HFv2 IFEval,1.45,,hf_open_llm_v2_240829.csv rhea_72b_v0_5,HFv2 MMLU Pro,1.85,,hf_open_llm_v2_240829.csv rhea_72b_v0_5,HFv2 Math Level 5,5.51,,hf_open_llm_v2_240829.csv rhea_72b_v0_5,HFv2 MuSR,11.32,,hf_open_llm_v2_240829.csv roleplay_llama3_8b,HF OpenLLM v2,23.94,,hf_open_llm_v2_240829.csv roleplay_llama3_8b,HFv2 BBH,28.55,,hf_open_llm_v2_240829.csv roleplay_llama3_8b,HFv2 
GPQA,1.45,,hf_open_llm_v2_240829.csv roleplay_llama3_8b,HFv2 IFEval,73.2,,hf_open_llm_v2_240829.csv roleplay_llama3_8b,HFv2 MMLU Pro,30.09,,hf_open_llm_v2_240829.csv roleplay_llama3_8b,HFv2 Math Level 5,8.69,,hf_open_llm_v2_240829.csv roleplay_llama3_8b,HFv2 MuSR,1.68,,hf_open_llm_v2_240829.csv rys_llama3_8b_instruct,HF OpenLLM v2,21.81,,hf_open_llm_v2_240829.csv rys_llama3_8b_instruct,HFv2 BBH,25.37,,hf_open_llm_v2_240829.csv rys_llama3_8b_instruct,HFv2 GPQA,1.01,,hf_open_llm_v2_240829.csv rys_llama3_8b_instruct,HFv2 IFEval,69.58,,hf_open_llm_v2_240829.csv rys_llama3_8b_instruct,HFv2 MMLU Pro,28.41,,hf_open_llm_v2_240829.csv rys_llama3_8b_instruct,HFv2 Math Level 5,6.19,,hf_open_llm_v2_240829.csv rys_llama3_8b_instruct,HFv2 MuSR,0.29,,hf_open_llm_v2_240829.csv rys_llama3_huge_instruct,HF OpenLLM v2,34.37,,hf_open_llm_v2_240829.csv rys_llama3_huge_instruct,HFv2 BBH,49.07,,hf_open_llm_v2_240829.csv rys_llama3_huge_instruct,HFv2 GPQA,1.45,,hf_open_llm_v2_240829.csv rys_llama3_huge_instruct,HFv2 IFEval,76.86,,hf_open_llm_v2_240829.csv rys_llama3_huge_instruct,HFv2 MMLU Pro,45.66,,hf_open_llm_v2_240829.csv rys_llama3_huge_instruct,HFv2 Math Level 5,21.22,,hf_open_llm_v2_240829.csv rys_llama3_huge_instruct,HFv2 MuSR,11.93,,hf_open_llm_v2_240829.csv rys_llama3_large_instruct,HF OpenLLM v2,35.78,,hf_open_llm_v2_240829.csv rys_llama3_large_instruct,HFv2 BBH,49.67,,hf_open_llm_v2_240829.csv rys_llama3_large_instruct,HFv2 GPQA,5.26,,hf_open_llm_v2_240829.csv rys_llama3_large_instruct,HFv2 IFEval,80.51,,hf_open_llm_v2_240829.csv rys_llama3_large_instruct,HFv2 MMLU Pro,45.97,,hf_open_llm_v2_240829.csv rys_llama3_large_instruct,HFv2 Math Level 5,21.83,,hf_open_llm_v2_240829.csv rys_llama3_large_instruct,HFv2 MuSR,11.45,,hf_open_llm_v2_240829.csv rys_phi_3_medium_4k_instruct,HF OpenLLM v2,28.38,,hf_open_llm_v2_240829.csv rys_phi_3_medium_4k_instruct,HFv2 BBH,46.75,,hf_open_llm_v2_240829.csv rys_phi_3_medium_4k_instruct,HFv2 GPQA,13.98,,hf_open_llm_v2_240829.csv rys_phi_3_medium_4k_instruct,HFv2 IFEval,43.91,,hf_open_llm_v2_240829.csv rys_phi_3_medium_4k_instruct,HFv2 MMLU Pro,42.74,,hf_open_llm_v2_240829.csv rys_phi_3_medium_4k_instruct,HFv2 Math Level 5,11.78,,hf_open_llm_v2_240829.csv rys_phi_3_medium_4k_instruct,HFv2 MuSR,11.09,,hf_open_llm_v2_240829.csv sauerkrautlm_una_solar_instruct,HF OpenLLM v2,19.71,,hf_open_llm_v2_240829.csv sauerkrautlm_una_solar_instruct,HFv2 BBH,31.82,,hf_open_llm_v2_240829.csv sauerkrautlm_una_solar_instruct,HFv2 GPQA,8.17,,hf_open_llm_v2_240829.csv sauerkrautlm_una_solar_instruct,HFv2 IFEval,45.73,,hf_open_llm_v2_240829.csv sauerkrautlm_una_solar_instruct,HFv2 MMLU Pro,23.93,,hf_open_llm_v2_240829.csv sauerkrautlm_una_solar_instruct,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv sauerkrautlm_una_solar_instruct,HFv2 MuSR,8.6,,hf_open_llm_v2_240829.csv seallms_v3_7b_chat,HF OpenLLM v2,23.63,,hf_open_llm_v2_240829.csv seallms_v3_7b_chat,HFv2 BBH,33.8,,hf_open_llm_v2_240829.csv seallms_v3_7b_chat,HFv2 GPQA,6.49,,hf_open_llm_v2_240829.csv seallms_v3_7b_chat,HFv2 IFEval,43.77,,hf_open_llm_v2_240829.csv seallms_v3_7b_chat,HFv2 MMLU Pro,32.16,,hf_open_llm_v2_240829.csv seallms_v3_7b_chat,HFv2 Math Level 5,15.11,,hf_open_llm_v2_240829.csv seallms_v3_7b_chat,HFv2 MuSR,10.47,,hf_open_llm_v2_240829.csv sheared_llama_1_3b,HF OpenLLM v2,5.51,,hf_open_llm_v2_240829.csv sheared_llama_1_3b,HFv2 BBH,4.74,,hf_open_llm_v2_240829.csv sheared_llama_1_3b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv sheared_llama_1_3b,HFv2 IFEval,21.98,,hf_open_llm_v2_240829.csv sheared_llama_1_3b,HFv2 MMLU 
Pro,1.9,,hf_open_llm_v2_240829.csv sheared_llama_1_3b,HFv2 Math Level 5,0.83,,hf_open_llm_v2_240829.csv sheared_llama_1_3b,HFv2 MuSR,3.58,,hf_open_llm_v2_240829.csv sheared_llama_2_7b,HF OpenLLM v2,6.31,,hf_open_llm_v2_240829.csv sheared_llama_2_7b,HFv2 BBH,5.66,,hf_open_llm_v2_240829.csv sheared_llama_2_7b,HFv2 GPQA,3.36,,hf_open_llm_v2_240829.csv sheared_llama_2_7b,HFv2 IFEval,24.17,,hf_open_llm_v2_240829.csv sheared_llama_2_7b,HFv2 MMLU Pro,2.08,,hf_open_llm_v2_240829.csv sheared_llama_2_7b,HFv2 Math Level 5,0.53,,hf_open_llm_v2_240829.csv sheared_llama_2_7b,HFv2 MuSR,2.09,,hf_open_llm_v2_240829.csv silicon_maid_7b,HF OpenLLM v2,19.32,,hf_open_llm_v2_240829.csv silicon_maid_7b,HFv2 BBH,16.69,,hf_open_llm_v2_240829.csv silicon_maid_7b,HFv2 GPQA,5.37,,hf_open_llm_v2_240829.csv silicon_maid_7b,HFv2 IFEval,53.68,,hf_open_llm_v2_240829.csv silicon_maid_7b,HFv2 MMLU Pro,23.15,,hf_open_llm_v2_240829.csv silicon_maid_7b,HFv2 Math Level 5,5.97,,hf_open_llm_v2_240829.csv silicon_maid_7b,HFv2 MuSR,11.09,,hf_open_llm_v2_240829.csv smaug_34b_v0_1,HF OpenLLM v2,23.76,,hf_open_llm_v2_240829.csv smaug_34b_v0_1,HFv2 BBH,34.26,,hf_open_llm_v2_240829.csv smaug_34b_v0_1,HFv2 GPQA,10.63,,hf_open_llm_v2_240829.csv smaug_34b_v0_1,HFv2 IFEval,50.16,,hf_open_llm_v2_240829.csv smaug_34b_v0_1,HFv2 MMLU Pro,39.37,,hf_open_llm_v2_240829.csv smaug_34b_v0_1,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv smaug_34b_v0_1,HFv2 MuSR,8.13,,hf_open_llm_v2_240829.csv smaug_llama3_70b_instruct_32k,HF OpenLLM v2,34.72,,hf_open_llm_v2_240829.csv smaug_llama3_70b_instruct_32k,HFv2 BBH,49.07,,hf_open_llm_v2_240829.csv smaug_llama3_70b_instruct_32k,HFv2 GPQA,6.15,,hf_open_llm_v2_240829.csv smaug_llama3_70b_instruct_32k,HFv2 IFEval,77.61,,hf_open_llm_v2_240829.csv smaug_llama3_70b_instruct_32k,HFv2 MMLU Pro,41.83,,hf_open_llm_v2_240829.csv smaug_llama3_70b_instruct_32k,HFv2 Math Level 5,21.22,,hf_open_llm_v2_240829.csv smaug_llama3_70b_instruct_32k,HFv2 MuSR,12.43,,hf_open_llm_v2_240829.csv smaug_qwen2_72b_instruct,HF OpenLLM v2,41.08,,hf_open_llm_v2_240829.csv smaug_qwen2_72b_instruct,HFv2 BBH,56.27,,hf_open_llm_v2_240829.csv smaug_qwen2_72b_instruct,HFv2 GPQA,14.88,,hf_open_llm_v2_240829.csv smaug_qwen2_72b_instruct,HFv2 IFEval,78.25,,hf_open_llm_v2_240829.csv smaug_qwen2_72b_instruct,HFv2 MMLU Pro,46.56,,hf_open_llm_v2_240829.csv smaug_qwen2_72b_instruct,HFv2 Math Level 5,35.35,,hf_open_llm_v2_240829.csv smaug_qwen2_72b_instruct,HFv2 MuSR,15.18,,hf_open_llm_v2_240829.csv smol_llama_101m_gqa,HF OpenLLM v2,3.92,,hf_open_llm_v2_240829.csv smol_llama_101m_gqa,HFv2 BBH,3.2,,hf_open_llm_v2_240829.csv smol_llama_101m_gqa,HFv2 GPQA,1.01,,hf_open_llm_v2_240829.csv smol_llama_101m_gqa,HFv2 IFEval,13.84,,hf_open_llm_v2_240829.csv smol_llama_101m_gqa,HFv2 MMLU Pro,1.19,,hf_open_llm_v2_240829.csv smol_llama_101m_gqa,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv smol_llama_101m_gqa,HFv2 MuSR,4.28,,hf_open_llm_v2_240829.csv smol_llama_220m_gqa,HF OpenLLM v2,6.4,,hf_open_llm_v2_240829.csv smol_llama_220m_gqa,HFv2 BBH,3.04,,hf_open_llm_v2_240829.csv smol_llama_220m_gqa,HFv2 GPQA,0.78,,hf_open_llm_v2_240829.csv smol_llama_220m_gqa,HFv2 IFEval,23.86,,hf_open_llm_v2_240829.csv smol_llama_220m_gqa,HFv2 MMLU Pro,1.66,,hf_open_llm_v2_240829.csv smol_llama_220m_gqa,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv smol_llama_220m_gqa,HFv2 MuSR,9.07,,hf_open_llm_v2_240829.csv smollm_135m,HF OpenLLM v2,6.84,,hf_open_llm_v2_240829.csv smollm_135m,HFv2 BBH,3.29,,hf_open_llm_v2_240829.csv smollm_135m,HFv2 GPQA,1.12,,hf_open_llm_v2_240829.csv 
smollm_135m,HFv2 IFEval,21.25,,hf_open_llm_v2_240829.csv smollm_135m,HFv2 MMLU Pro,1.36,,hf_open_llm_v2_240829.csv smollm_135m,HFv2 Math Level 5,0.68,,hf_open_llm_v2_240829.csv smollm_135m,HFv2 MuSR,13.34,,hf_open_llm_v2_240829.csv smollm_135m_instruct,HF OpenLLM v2,4.23,,hf_open_llm_v2_240829.csv smollm_135m_instruct,HFv2 BBH,2.08,,hf_open_llm_v2_240829.csv smollm_135m_instruct,HFv2 GPQA,1.9,,hf_open_llm_v2_240829.csv smollm_135m_instruct,HFv2 IFEval,15.96,,hf_open_llm_v2_240829.csv smollm_135m_instruct,HFv2 MMLU Pro,1.84,,hf_open_llm_v2_240829.csv smollm_135m_instruct,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv smollm_135m_instruct,HFv2 MuSR,3.62,,hf_open_llm_v2_240829.csv smollm_1_7b,HF OpenLLM v2,5.43,,hf_open_llm_v2_240829.csv smollm_1_7b,HFv2 BBH,4.41,,hf_open_llm_v2_240829.csv smollm_1_7b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv smollm_1_7b,HFv2 IFEval,23.62,,hf_open_llm_v2_240829.csv smollm_1_7b,HFv2 MMLU Pro,1.64,,hf_open_llm_v2_240829.csv smollm_1_7b,HFv2 Math Level 5,0.76,,hf_open_llm_v2_240829.csv smollm_1_7b,HFv2 MuSR,2.13,,hf_open_llm_v2_240829.csv smollm_1_7b_instruct,HF OpenLLM v2,5.14,,hf_open_llm_v2_240829.csv smollm_1_7b_instruct,HFv2 BBH,2.08,,hf_open_llm_v2_240829.csv smollm_1_7b_instruct,HFv2 GPQA,1.34,,hf_open_llm_v2_240829.csv smollm_1_7b_instruct,HFv2 IFEval,23.48,,hf_open_llm_v2_240829.csv smollm_1_7b_instruct,HFv2 MMLU Pro,1.85,,hf_open_llm_v2_240829.csv smollm_1_7b_instruct,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv smollm_1_7b_instruct,HFv2 MuSR,2.08,,hf_open_llm_v2_240829.csv smollm_360m,HF OpenLLM v2,6.15,,hf_open_llm_v2_240829.csv smollm_360m,HFv2 BBH,3.28,,hf_open_llm_v2_240829.csv smollm_360m,HFv2 GPQA,2.35,,hf_open_llm_v2_240829.csv smollm_360m,HFv2 IFEval,21.34,,hf_open_llm_v2_240829.csv smollm_360m,HFv2 MMLU Pro,1.37,,hf_open_llm_v2_240829.csv smollm_360m,HFv2 Math Level 5,0.45,,hf_open_llm_v2_240829.csv smollm_360m,HFv2 MuSR,8.09,,hf_open_llm_v2_240829.csv solar_10_7b_instruct_v1_0,HF OpenLLM v2,19.63,,hf_open_llm_v2_240829.csv solar_10_7b_instruct_v1_0,HFv2 BBH,31.87,,hf_open_llm_v2_240829.csv solar_10_7b_instruct_v1_0,HFv2 GPQA,7.83,,hf_open_llm_v2_240829.csv solar_10_7b_instruct_v1_0,HFv2 IFEval,47.37,,hf_open_llm_v2_240829.csv solar_10_7b_instruct_v1_0,HFv2 MMLU Pro,23.76,,hf_open_llm_v2_240829.csv solar_10_7b_instruct_v1_0,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv solar_10_7b_instruct_v1_0,HFv2 MuSR,6.94,,hf_open_llm_v2_240829.csv solar_10_7b_v1_0,HF OpenLLM v2,16.77,,hf_open_llm_v2_240829.csv solar_10_7b_v1_0,HFv2 BBH,29.79,,hf_open_llm_v2_240829.csv solar_10_7b_v1_0,HFv2 GPQA,4.14,,hf_open_llm_v2_240829.csv solar_10_7b_v1_0,HFv2 IFEval,24.21,,hf_open_llm_v2_240829.csv solar_10_7b_v1_0,HFv2 MMLU Pro,26.67,,hf_open_llm_v2_240829.csv solar_10_7b_v1_0,HFv2 Math Level 5,2.11,,hf_open_llm_v2_240829.csv solar_10_7b_v1_0,HFv2 MuSR,13.68,,hf_open_llm_v2_240829.csv stablelm_2_12b,HF OpenLLM v2,13.86,,hf_open_llm_v2_240829.csv stablelm_2_12b,HFv2 BBH,22.69,,hf_open_llm_v2_240829.csv stablelm_2_12b,HFv2 GPQA,3.8,,hf_open_llm_v2_240829.csv stablelm_2_12b,HFv2 IFEval,15.69,,hf_open_llm_v2_240829.csv stablelm_2_12b,HFv2 MMLU Pro,23.02,,hf_open_llm_v2_240829.csv stablelm_2_12b,HFv2 Math Level 5,3.47,,hf_open_llm_v2_240829.csv stablelm_2_12b,HFv2 MuSR,14.49,,hf_open_llm_v2_240829.csv stablelm_2_12b_chat,HF OpenLLM v2,16.22,,hf_open_llm_v2_240829.csv stablelm_2_12b_chat,HFv2 BBH,25.25,,hf_open_llm_v2_240829.csv stablelm_2_12b_chat,HFv2 GPQA,2.24,,hf_open_llm_v2_240829.csv stablelm_2_12b_chat,HFv2 IFEval,40.82,,hf_open_llm_v2_240829.csv 
stablelm_2_12b_chat,HFv2 MMLU Pro,19.27,,hf_open_llm_v2_240829.csv stablelm_2_12b_chat,HFv2 Math Level 5,2.04,,hf_open_llm_v2_240829.csv stablelm_2_12b_chat,HFv2 MuSR,7.73,,hf_open_llm_v2_240829.csv stablelm_2_1_6b,HF OpenLLM v2,5.22,,hf_open_llm_v2_240829.csv stablelm_2_1_6b,HFv2 BBH,8.63,,hf_open_llm_v2_240829.csv stablelm_2_1_6b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv stablelm_2_1_6b,HFv2 IFEval,11.57,,hf_open_llm_v2_240829.csv stablelm_2_1_6b,HFv2 MMLU Pro,5.15,,hf_open_llm_v2_240829.csv stablelm_2_1_6b,HFv2 Math Level 5,0.15,,hf_open_llm_v2_240829.csv stablelm_2_1_6b,HFv2 MuSR,5.79,,hf_open_llm_v2_240829.csv stablelm_2_1_6b_chat,HF OpenLLM v2,8.63,,hf_open_llm_v2_240829.csv stablelm_2_1_6b_chat,HFv2 BBH,7.49,,hf_open_llm_v2_240829.csv stablelm_2_1_6b_chat,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv stablelm_2_1_6b_chat,HFv2 IFEval,30.6,,hf_open_llm_v2_240829.csv stablelm_2_1_6b_chat,HFv2 MMLU Pro,6.91,,hf_open_llm_v2_240829.csv stablelm_2_1_6b_chat,HFv2 Math Level 5,1.06,,hf_open_llm_v2_240829.csv stablelm_2_1_6b_chat,HFv2 MuSR,5.71,,hf_open_llm_v2_240829.csv stablelm_2_zephyr_1_6b,HF OpenLLM v2,9.26,,hf_open_llm_v2_240829.csv stablelm_2_zephyr_1_6b,HFv2 BBH,6.71,,hf_open_llm_v2_240829.csv stablelm_2_zephyr_1_6b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv stablelm_2_zephyr_1_6b,HFv2 IFEval,32.79,,hf_open_llm_v2_240829.csv stablelm_2_zephyr_1_6b,HFv2 MMLU Pro,7.93,,hf_open_llm_v2_240829.csv stablelm_2_zephyr_1_6b,HFv2 Math Level 5,2.11,,hf_open_llm_v2_240829.csv stablelm_2_zephyr_1_6b,HFv2 MuSR,5.99,,hf_open_llm_v2_240829.csv stablelm_3b_4e1t,HF OpenLLM v2,7.26,,hf_open_llm_v2_240829.csv stablelm_3b_4e1t,HFv2 BBH,9.01,,hf_open_llm_v2_240829.csv stablelm_3b_4e1t,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv stablelm_3b_4e1t,HFv2 IFEval,22.03,,hf_open_llm_v2_240829.csv stablelm_3b_4e1t,HFv2 MMLU Pro,7.43,,hf_open_llm_v2_240829.csv stablelm_3b_4e1t,HFv2 Math Level 5,0.68,,hf_open_llm_v2_240829.csv stablelm_3b_4e1t,HFv2 MuSR,4.42,,hf_open_llm_v2_240829.csv stablelm_zephyr_3b,HF OpenLLM v2,12.33,,hf_open_llm_v2_240829.csv stablelm_zephyr_3b,HFv2 BBH,14.76,,hf_open_llm_v2_240829.csv stablelm_zephyr_3b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv stablelm_zephyr_3b,HFv2 IFEval,36.83,,hf_open_llm_v2_240829.csv stablelm_zephyr_3b,HFv2 MMLU Pro,8.53,,hf_open_llm_v2_240829.csv stablelm_zephyr_3b,HFv2 Math Level 5,4.08,,hf_open_llm_v2_240829.csv stablelm_zephyr_3b,HFv2 MuSR,9.79,,hf_open_llm_v2_240829.csv starcoder2_15b,HF OpenLLM v2,12.44,,hf_open_llm_v2_240829.csv starcoder2_15b,HFv2 BBH,20.37,,hf_open_llm_v2_240829.csv starcoder2_15b,HFv2 GPQA,3.13,,hf_open_llm_v2_240829.csv starcoder2_15b,HFv2 IFEval,27.8,,hf_open_llm_v2_240829.csv starcoder2_15b,HFv2 MMLU Pro,15.03,,hf_open_llm_v2_240829.csv starcoder2_15b,HFv2 Math Level 5,5.36,,hf_open_llm_v2_240829.csv starcoder2_15b,HFv2 MuSR,2.93,,hf_open_llm_v2_240829.csv starcoder2_3b,HF OpenLLM v2,6.54,,hf_open_llm_v2_240829.csv starcoder2_3b,HFv2 BBH,8.91,,hf_open_llm_v2_240829.csv starcoder2_3b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv starcoder2_3b,HFv2 IFEval,20.37,,hf_open_llm_v2_240829.csv starcoder2_3b,HFv2 MMLU Pro,7.07,,hf_open_llm_v2_240829.csv starcoder2_3b,HFv2 Math Level 5,1.44,,hf_open_llm_v2_240829.csv starcoder2_3b,HFv2 MuSR,1.43,,hf_open_llm_v2_240829.csv starcoder2_7b,HF OpenLLM v2,8.21,,hf_open_llm_v2_240829.csv starcoder2_7b,HFv2 BBH,11.4,,hf_open_llm_v2_240829.csv starcoder2_7b,HFv2 GPQA,0.22,,hf_open_llm_v2_240829.csv starcoder2_7b,HFv2 IFEval,22.09,,hf_open_llm_v2_240829.csv starcoder2_7b,HFv2 MMLU Pro,7.14,,hf_open_llm_v2_240829.csv 
starcoder2_7b,HFv2 Math Level 5,2.57,,hf_open_llm_v2_240829.csv starcoder2_7b,HFv2 MuSR,5.82,,hf_open_llm_v2_240829.csv starling_lm_7b_alpha,HF OpenLLM v2,20.64,,hf_open_llm_v2_240829.csv starling_lm_7b_alpha,HFv2 BBH,21.95,,hf_open_llm_v2_240829.csv starling_lm_7b_alpha,HFv2 GPQA,6.26,,hf_open_llm_v2_240829.csv starling_lm_7b_alpha,HFv2 IFEval,54.8,,hf_open_llm_v2_240829.csv starling_lm_7b_alpha,HFv2 MMLU Pro,24.13,,hf_open_llm_v2_240829.csv starling_lm_7b_alpha,HFv2 Math Level 5,7.18,,hf_open_llm_v2_240829.csv starling_lm_7b_alpha,HFv2 MuSR,9.5,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_full,HF OpenLLM v2,19.53,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_full,HFv2 BBH,25.08,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_full,HFv2 GPQA,1.23,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_full,HFv2 IFEval,58.17,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_full,HFv2 MMLU Pro,25.66,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_full,HFv2 Math Level 5,3.02,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_full,HFv2 MuSR,4.04,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_half,HF OpenLLM v2,21.28,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_half,HFv2 BBH,26.35,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_half,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_half,HFv2 IFEval,62.49,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_half,HFv2 MMLU Pro,29.04,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_half,HFv2 Math Level 5,7.7,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_half,HFv2 MuSR,2.11,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_top25,HF OpenLLM v2,23.37,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_top25,HFv2 BBH,27.67,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_top25,HFv2 GPQA,3.02,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_top25,HFv2 IFEval,66.37,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_top25,HFv2 MMLU Pro,29.83,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_top25,HFv2 Math Level 5,8.53,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_top25,HFv2 MuSR,4.81,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_top75,HF OpenLLM v2,23.43,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_top75,HFv2 BBH,28.06,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_top75,HFv2 GPQA,3.02,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_top75,HFv2 IFEval,66.87,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_top75,HFv2 MMLU Pro,30.77,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_top75,HFv2 Math Level 5,6.57,,hf_open_llm_v2_240829.csv suzume_llama3_8b_multilingual_orpo_borda_top75,HFv2 MuSR,5.31,,hf_open_llm_v2_240829.csv tinyllama_v1_1,HF OpenLLM v2,4.7,,hf_open_llm_v2_240829.csv tinyllama_v1_1,HFv2 BBH,3.21,,hf_open_llm_v2_240829.csv tinyllama_v1_1,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv tinyllama_v1_1,HFv2 IFEval,20.01,,hf_open_llm_v2_240829.csv tinyllama_v1_1,HFv2 MMLU Pro,0.54,,hf_open_llm_v2_240829.csv tinyllama_v1_1,HFv2 Math Level 5,0.45,,hf_open_llm_v2_240829.csv tinyllama_v1_1,HFv2 MuSR,3.98,,hf_open_llm_v2_240829.csv 
tinyyi_7b_test,HF OpenLLM v2,4.5,,hf_open_llm_v2_240829.csv tinyyi_7b_test,HFv2 BBH,2.27,,hf_open_llm_v2_240829.csv tinyyi_7b_test,HFv2 GPQA,1.9,,hf_open_llm_v2_240829.csv tinyyi_7b_test,HFv2 IFEval,18.56,,hf_open_llm_v2_240829.csv tinyyi_7b_test,HFv2 MMLU Pro,1.01,,hf_open_llm_v2_240829.csv tinyyi_7b_test,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv tinyyi_7b_test,HFv2 MuSR,3.22,,hf_open_llm_v2_240829.csv windyflollm,HF OpenLLM v2,14.17,,hf_open_llm_v2_240829.csv windyflollm,HFv2 BBH,24.4,,hf_open_llm_v2_240829.csv windyflollm,HFv2 GPQA,3.36,,hf_open_llm_v2_240829.csv windyflollm,HFv2 IFEval,26.69,,hf_open_llm_v2_240829.csv windyflollm,HFv2 MMLU Pro,17.57,,hf_open_llm_v2_240829.csv windyflollm,HFv2 Math Level 5,1.13,,hf_open_llm_v2_240829.csv windyflollm,HFv2 MuSR,11.86,,hf_open_llm_v2_240829.csv yi_1_5_34b,HF OpenLLM v2,25.43,,hf_open_llm_v2_240829.csv yi_1_5_34b,HFv2 BBH,42.75,,hf_open_llm_v2_240829.csv yi_1_5_34b,HFv2 GPQA,15.44,,hf_open_llm_v2_240829.csv yi_1_5_34b,HFv2 IFEval,28.41,,hf_open_llm_v2_240829.csv yi_1_5_34b,HFv2 MMLU Pro,40.73,,hf_open_llm_v2_240829.csv yi_1_5_34b,HFv2 Math Level 5,14.05,,hf_open_llm_v2_240829.csv yi_1_5_34b,HFv2 MuSR,11.22,,hf_open_llm_v2_240829.csv yi_1_5_34b_32k,HF OpenLLM v2,26.4,,hf_open_llm_v2_240829.csv yi_1_5_34b_32k,HFv2 BBH,43.38,,hf_open_llm_v2_240829.csv yi_1_5_34b_32k,HFv2 GPQA,15.1,,hf_open_llm_v2_240829.csv yi_1_5_34b_32k,HFv2 IFEval,31.19,,hf_open_llm_v2_240829.csv yi_1_5_34b_32k,HFv2 MMLU Pro,41.21,,hf_open_llm_v2_240829.csv yi_1_5_34b_32k,HFv2 Math Level 5,13.44,,hf_open_llm_v2_240829.csv yi_1_5_34b_32k,HFv2 MuSR,14.08,,hf_open_llm_v2_240829.csv yi_1_5_34b_chat,HF OpenLLM v2,32.63,,hf_open_llm_v2_240829.csv yi_1_5_34b_chat,HFv2 BBH,44.26,,hf_open_llm_v2_240829.csv yi_1_5_34b_chat,HFv2 GPQA,15.32,,hf_open_llm_v2_240829.csv yi_1_5_34b_chat,HFv2 IFEval,60.67,,hf_open_llm_v2_240829.csv yi_1_5_34b_chat,HFv2 MMLU Pro,39.12,,hf_open_llm_v2_240829.csv yi_1_5_34b_chat,HFv2 Math Level 5,23.34,,hf_open_llm_v2_240829.csv yi_1_5_34b_chat,HFv2 MuSR,13.06,,hf_open_llm_v2_240829.csv yi_1_5_34b_chat_16k,HF OpenLLM v2,28.98,,hf_open_llm_v2_240829.csv yi_1_5_34b_chat_16k,HFv2 BBH,44.54,,hf_open_llm_v2_240829.csv yi_1_5_34b_chat_16k,HFv2 GPQA,11.74,,hf_open_llm_v2_240829.csv yi_1_5_34b_chat_16k,HFv2 IFEval,45.64,,hf_open_llm_v2_240829.csv yi_1_5_34b_chat_16k,HFv2 MMLU Pro,39.38,,hf_open_llm_v2_240829.csv yi_1_5_34b_chat_16k,HFv2 Math Level 5,18.81,,hf_open_llm_v2_240829.csv yi_1_5_34b_chat_16k,HFv2 MuSR,13.74,,hf_open_llm_v2_240829.csv yi_1_5_6b,HF OpenLLM v2,16.53,,hf_open_llm_v2_240829.csv yi_1_5_6b,HFv2 BBH,22.03,,hf_open_llm_v2_240829.csv yi_1_5_6b,HFv2 GPQA,8.5,,hf_open_llm_v2_240829.csv yi_1_5_6b,HFv2 IFEval,26.17,,hf_open_llm_v2_240829.csv yi_1_5_6b,HFv2 MMLU Pro,23.82,,hf_open_llm_v2_240829.csv yi_1_5_6b,HFv2 Math Level 5,5.36,,hf_open_llm_v2_240829.csv yi_1_5_6b,HFv2 MuSR,13.31,,hf_open_llm_v2_240829.csv yi_1_5_6b_chat,HF OpenLLM v2,22.05,,hf_open_llm_v2_240829.csv yi_1_5_6b_chat,HFv2 BBH,23.55,,hf_open_llm_v2_240829.csv yi_1_5_6b_chat,HFv2 GPQA,9.06,,hf_open_llm_v2_240829.csv yi_1_5_6b_chat,HFv2 IFEval,48.02,,hf_open_llm_v2_240829.csv yi_1_5_6b_chat,HFv2 MMLU Pro,24.41,,hf_open_llm_v2_240829.csv yi_1_5_6b_chat,HFv2 Math Level 5,12.54,,hf_open_llm_v2_240829.csv yi_1_5_6b_chat,HFv2 MuSR,14.7,,hf_open_llm_v2_240829.csv yi_1_5_9b,HF OpenLLM v2,21.95,,hf_open_llm_v2_240829.csv yi_1_5_9b,HFv2 BBH,30.5,,hf_open_llm_v2_240829.csv yi_1_5_9b,HFv2 GPQA,17.23,,hf_open_llm_v2_240829.csv yi_1_5_9b,HFv2 IFEval,29.36,,hf_open_llm_v2_240829.csv yi_1_5_9b,HFv2 
MMLU Pro,32.4,,hf_open_llm_v2_240829.csv yi_1_5_9b,HFv2 Math Level 5,10.2,,hf_open_llm_v2_240829.csv yi_1_5_9b,HFv2 MuSR,12.03,,hf_open_llm_v2_240829.csv yi_1_5_9b_32k,HF OpenLLM v2,19.61,,hf_open_llm_v2_240829.csv yi_1_5_9b_32k,HFv2 BBH,28.94,,hf_open_llm_v2_240829.csv yi_1_5_9b_32k,HFv2 GPQA,14.54,,hf_open_llm_v2_240829.csv yi_1_5_9b_32k,HFv2 IFEval,23.03,,hf_open_llm_v2_240829.csv yi_1_5_9b_32k,HFv2 MMLU Pro,30.72,,hf_open_llm_v2_240829.csv yi_1_5_9b_32k,HFv2 Math Level 5,9.59,,hf_open_llm_v2_240829.csv yi_1_5_9b_32k,HFv2 MuSR,10.83,,hf_open_llm_v2_240829.csv yi_1_5_9b_chat,HF OpenLLM v2,27.71,,hf_open_llm_v2_240829.csv yi_1_5_9b_chat,HFv2 BBH,36.95,,hf_open_llm_v2_240829.csv yi_1_5_9b_chat,HFv2 GPQA,11.3,,hf_open_llm_v2_240829.csv yi_1_5_9b_chat,HFv2 IFEval,60.46,,hf_open_llm_v2_240829.csv yi_1_5_9b_chat,HFv2 MMLU Pro,33.06,,hf_open_llm_v2_240829.csv yi_1_5_9b_chat,HFv2 Math Level 5,11.63,,hf_open_llm_v2_240829.csv yi_1_5_9b_chat,HFv2 MuSR,12.84,,hf_open_llm_v2_240829.csv yi_1_5_9b_chat_16k,HF OpenLLM v2,22.9,,hf_open_llm_v2_240829.csv yi_1_5_9b_chat_16k,HFv2 BBH,31.5,,hf_open_llm_v2_240829.csv yi_1_5_9b_chat_16k,HFv2 GPQA,7.83,,hf_open_llm_v2_240829.csv yi_1_5_9b_chat_16k,HFv2 IFEval,42.14,,hf_open_llm_v2_240829.csv yi_1_5_9b_chat_16k,HFv2 MMLU Pro,33.26,,hf_open_llm_v2_240829.csv yi_1_5_9b_chat_16k,HFv2 Math Level 5,12.61,,hf_open_llm_v2_240829.csv yi_1_5_9b_chat_16k,HFv2 MuSR,10.04,,hf_open_llm_v2_240829.csv yi_34b,HF OpenLLM v2,22.26,,hf_open_llm_v2_240829.csv yi_34b,HFv2 BBH,35.54,,hf_open_llm_v2_240829.csv yi_34b,HFv2 GPQA,15.55,,hf_open_llm_v2_240829.csv yi_34b,HFv2 IFEval,30.46,,hf_open_llm_v2_240829.csv yi_34b,HFv2 MMLU Pro,37.91,,hf_open_llm_v2_240829.csv yi_34b,HFv2 Math Level 5,4.46,,hf_open_llm_v2_240829.csv yi_34b,HFv2 MuSR,9.65,,hf_open_llm_v2_240829.csv yi_34b_200k,HF OpenLLM v2,19.8,,hf_open_llm_v2_240829.csv yi_34b_200k,HFv2 BBH,36.02,,hf_open_llm_v2_240829.csv yi_34b_200k,HFv2 GPQA,14.21,,hf_open_llm_v2_240829.csv yi_34b_200k,HFv2 IFEval,15.42,,hf_open_llm_v2_240829.csv yi_34b_200k,HFv2 MMLU Pro,39.27,,hf_open_llm_v2_240829.csv yi_34b_200k,HFv2 Math Level 5,4.46,,hf_open_llm_v2_240829.csv yi_34b_200k,HFv2 MuSR,9.41,,hf_open_llm_v2_240829.csv yi_34b_chat,HF OpenLLM v2,23.9,,hf_open_llm_v2_240829.csv yi_34b_chat,HFv2 BBH,37.62,,hf_open_llm_v2_240829.csv yi_34b_chat,HFv2 GPQA,11.74,,hf_open_llm_v2_240829.csv yi_34b_chat,HFv2 IFEval,46.99,,hf_open_llm_v2_240829.csv yi_34b_chat,HFv2 MMLU Pro,34.37,,hf_open_llm_v2_240829.csv yi_34b_chat,HFv2 Math Level 5,4.31,,hf_open_llm_v2_240829.csv yi_34b_chat,HFv2 MuSR,8.36,,hf_open_llm_v2_240829.csv yi_34bx2_moe_60b_dpo,HF OpenLLM v2,25.91,,hf_open_llm_v2_240829.csv yi_34bx2_moe_60b_dpo,HFv2 BBH,31.26,,hf_open_llm_v2_240829.csv yi_34bx2_moe_60b_dpo,HFv2 GPQA,9.62,,hf_open_llm_v2_240829.csv yi_34bx2_moe_60b_dpo,HFv2 IFEval,53.19,,hf_open_llm_v2_240829.csv yi_34bx2_moe_60b_dpo,HFv2 MMLU Pro,40.85,,hf_open_llm_v2_240829.csv yi_34bx2_moe_60b_dpo,HFv2 Math Level 5,6.19,,hf_open_llm_v2_240829.csv yi_34bx2_moe_60b_dpo,HFv2 MuSR,14.32,,hf_open_llm_v2_240829.csv yi_6b,HF OpenLLM v2,13.6,,hf_open_llm_v2_240829.csv yi_6b,HFv2 BBH,19.41,,hf_open_llm_v2_240829.csv yi_6b,HFv2 GPQA,2.57,,hf_open_llm_v2_240829.csv yi_6b,HFv2 IFEval,28.93,,hf_open_llm_v2_240829.csv yi_6b,HFv2 MMLU Pro,22.12,,hf_open_llm_v2_240829.csv yi_6b,HFv2 Math Level 5,1.51,,hf_open_llm_v2_240829.csv yi_6b,HFv2 MuSR,7.04,,hf_open_llm_v2_240829.csv yi_6b_200k,HF OpenLLM v2,11.9,,hf_open_llm_v2_240829.csv yi_6b_200k,HFv2 BBH,20.15,,hf_open_llm_v2_240829.csv yi_6b_200k,HFv2 
GPQA,4.25,,hf_open_llm_v2_240829.csv yi_6b_200k,HFv2 IFEval,8.43,,hf_open_llm_v2_240829.csv yi_6b_200k,HFv2 MMLU Pro,20.49,,hf_open_llm_v2_240829.csv yi_6b_200k,HFv2 Math Level 5,1.21,,hf_open_llm_v2_240829.csv yi_6b_200k,HFv2 MuSR,16.84,,hf_open_llm_v2_240829.csv yi_6b_chat,HF OpenLLM v2,14.0,,hf_open_llm_v2_240829.csv yi_6b_chat,HFv2 BBH,17.0,,hf_open_llm_v2_240829.csv yi_6b_chat,HFv2 GPQA,5.93,,hf_open_llm_v2_240829.csv yi_6b_chat,HFv2 IFEval,33.95,,hf_open_llm_v2_240829.csv yi_6b_chat,HFv2 MMLU Pro,22.9,,hf_open_llm_v2_240829.csv yi_6b_chat,HFv2 Math Level 5,0.68,,hf_open_llm_v2_240829.csv yi_6b_chat,HFv2 MuSR,3.57,,hf_open_llm_v2_240829.csv yi_9b,HF OpenLLM v2,17.61,,hf_open_llm_v2_240829.csv yi_9b,HFv2 BBH,27.63,,hf_open_llm_v2_240829.csv yi_9b,HFv2 GPQA,9.06,,hf_open_llm_v2_240829.csv yi_9b,HFv2 IFEval,27.09,,hf_open_llm_v2_240829.csv yi_9b,HFv2 MMLU Pro,28.6,,hf_open_llm_v2_240829.csv yi_9b,HFv2 Math Level 5,4.38,,hf_open_llm_v2_240829.csv yi_9b,HFv2 MuSR,8.91,,hf_open_llm_v2_240829.csv yi_9b_200k,HF OpenLLM v2,17.59,,hf_open_llm_v2_240829.csv yi_9b_200k,HFv2 BBH,26.49,,hf_open_llm_v2_240829.csv yi_9b_200k,HFv2 GPQA,8.72,,hf_open_llm_v2_240829.csv yi_9b_200k,HFv2 IFEval,23.27,,hf_open_llm_v2_240829.csv yi_9b_200k,HFv2 MMLU Pro,29.13,,hf_open_llm_v2_240829.csv yi_9b_200k,HFv2 Math Level 5,5.82,,hf_open_llm_v2_240829.csv yi_9b_200k,HFv2 MuSR,12.11,,hf_open_llm_v2_240829.csv zephyr_7b_alpha,HF OpenLLM v2,18.53,,hf_open_llm_v2_240829.csv zephyr_7b_alpha,HFv2 BBH,23.96,,hf_open_llm_v2_240829.csv zephyr_7b_alpha,HFv2 GPQA,6.38,,hf_open_llm_v2_240829.csv zephyr_7b_alpha,HFv2 IFEval,51.91,,hf_open_llm_v2_240829.csv zephyr_7b_alpha,HFv2 MMLU Pro,19.94,,hf_open_llm_v2_240829.csv zephyr_7b_alpha,HFv2 Math Level 5,1.51,,hf_open_llm_v2_240829.csv zephyr_7b_alpha,HFv2 MuSR,7.5,,hf_open_llm_v2_240829.csv zephyr_7b_beta,HF OpenLLM v2,17.72,,hf_open_llm_v2_240829.csv zephyr_7b_beta,HFv2 BBH,21.49,,hf_open_llm_v2_240829.csv zephyr_7b_beta,HFv2 GPQA,5.37,,hf_open_llm_v2_240829.csv zephyr_7b_beta,HFv2 IFEval,49.5,,hf_open_llm_v2_240829.csv zephyr_7b_beta,HFv2 MMLU Pro,19.79,,hf_open_llm_v2_240829.csv zephyr_7b_beta,HFv2 Math Level 5,2.42,,hf_open_llm_v2_240829.csv zephyr_7b_beta,HFv2 MuSR,7.73,,hf_open_llm_v2_240829.csv zephyr_7b_gemma_v0_1,HF OpenLLM v2,15.78,,hf_open_llm_v2_240829.csv zephyr_7b_gemma_v0_1,HFv2 BBH,23.75,,hf_open_llm_v2_240829.csv zephyr_7b_gemma_v0_1,HFv2 GPQA,5.93,,hf_open_llm_v2_240829.csv zephyr_7b_gemma_v0_1,HFv2 IFEval,33.64,,hf_open_llm_v2_240829.csv zephyr_7b_gemma_v0_1,HFv2 MMLU Pro,20.53,,hf_open_llm_v2_240829.csv zephyr_7b_gemma_v0_1,HFv2 Math Level 5,6.65,,hf_open_llm_v2_240829.csv zephyr_7b_gemma_v0_1,HFv2 MuSR,4.18,,hf_open_llm_v2_240829.csv zephyr_orpo_141b_a35b_v0_1,HF OpenLLM v2,33.77,,hf_open_llm_v2_240829.csv zephyr_orpo_141b_a35b_v0_1,HFv2 BBH,47.5,,hf_open_llm_v2_240829.csv zephyr_orpo_141b_a35b_v0_1,HFv2 GPQA,17.11,,hf_open_llm_v2_240829.csv zephyr_orpo_141b_a35b_v0_1,HFv2 IFEval,65.11,,hf_open_llm_v2_240829.csv zephyr_orpo_141b_a35b_v0_1,HFv2 MMLU Pro,39.85,,hf_open_llm_v2_240829.csv zephyr_orpo_141b_a35b_v0_1,HFv2 Math Level 5,18.35,,hf_open_llm_v2_240829.csv zephyr_orpo_141b_a35b_v0_1,HFv2 MuSR,14.72,,hf_open_llm_v2_240829.csv llama2_7b_chat,tablebench_overall_dp,16.98,[],tablebench_241002.csv codellama_7b_instruct,tablebench_overall_dp,17.01,[],tablebench_241002.csv gemma_7b_instruct,tablebench_overall_dp,14.82,[],tablebench_241002.csv mistral_7b_instruct,tablebench_overall_dp,19.15,[],tablebench_241002.csv 
deepseek_coder_7b_instruct,tablebench_overall_dp,13.82,[],tablebench_241002.csv codeqwen1_5_7b_chat,tablebench_overall_dp,16.76,[],tablebench_241002.csv qwen1_5_7b_chat,tablebench_overall_dp,15.84,[],tablebench_241002.csv qwen2_7b_instruct,tablebench_overall_dp,21.23,[],tablebench_241002.csv structlm_7b,tablebench_overall_dp,12.06,[],tablebench_241002.csv map_neo_7b_instruct,tablebench_overall_dp,12.66,[],tablebench_241002.csv llama3_8b_chat,tablebench_overall_dp,27.28,[],tablebench_241002.csv llama3_1_8b_instruct,tablebench_overall_dp,23.47,[],tablebench_241002.csv llama2_13b_chat,tablebench_overall_dp,18.58,[],tablebench_241002.csv structlm_13b,tablebench_overall_dp,11.52,[],tablebench_241002.csv wizardlm_13b,tablebench_overall_dp,20.8,[],tablebench_241002.csv qwen1_5_14b_chat,tablebench_overall_dp,17.76,[],tablebench_241002.csv qwen1_5_32b_chat,tablebench_overall_dp,20.21,[],tablebench_241002.csv deepseek_coder_33b_instruct,tablebench_overall_dp,9.74,[],tablebench_241002.csv codellama34b_instruct,tablebench_overall_dp,21.6,[],tablebench_241002.csv structlm_34b,tablebench_overall_dp,0.6,[],tablebench_241002.csv mixtral_8x7b_instruct,tablebench_overall_dp,24.98,[],tablebench_241002.csv qwen1_5_72b_chat,tablebench_overall_dp,28.45,[],tablebench_241002.csv qwen2_72b_instruct,tablebench_overall_dp,32.52,[],tablebench_241002.csv qwen1_5_110b_chat,tablebench_overall_dp,29.72,[],tablebench_241002.csv llama3_70b_chat,tablebench_overall_dp,30.91,[],tablebench_241002.csv llama3_1_70b_instruct,tablebench_overall_dp,33.63,[],tablebench_241002.csv gpt_3_5_turbo,tablebench_overall_dp,27.75,[],tablebench_241002.csv qwen_max,tablebench_overall_dp,29.63,[],tablebench_241002.csv yi_large,tablebench_overall_dp,32.43,[],tablebench_241002.csv glm_4,tablebench_overall_dp,31.23,[],tablebench_241002.csv deepseek_chat_v2,tablebench_overall_dp,40.65,[],tablebench_241002.csv deepseek_coder_v2,tablebench_overall_dp,35.21,[],tablebench_241002.csv gpt_4_turbo,tablebench_overall_dp,40.38,[],tablebench_241002.csv gpt_4o,tablebench_overall_dp,42.73,[],tablebench_241002.csv tablellm_codeqwen_7b,tablebench_overall_dp,26.08,[],tablebench_241002.csv tablellm_deepseek_coder_7b,tablebench_overall_dp,27.98,[],tablebench_241002.csv tablellm_llama3_1_8b,tablebench_overall_dp,27.19,[],tablebench_241002.csv tablellm_llama3_8b,tablebench_overall_dp,26.93,[],tablebench_241002.csv tablellm_qwen2_7b,tablebench_overall_dp,27.14,[],tablebench_241002.csv gemma_2b_it,Trustworthy Average,0.5981804397270656,[],llm_trustworthy_241001.csv gemma_7b_it,Trustworthy Average,0.6099317664897647,[],llm_trustworthy_241001.csv vicuna_7b_v1_3,Trustworthy Average,0.8468536770280516,[],llm_trustworthy_241001.csv llama_2_7b_chat,Trustworthy Average,0.31235784685367685,[],llm_trustworthy_241001.csv llama3_8b_instruct,Trustworthy Average,0.0890826383623956,[],llm_trustworthy_241001.csv mpt_7b_chat,Trustworthy Average,0.7835481425322213,[],llm_trustworthy_241001.csv gpt_3_5_turbo_0301,Trustworthy Average,0.3984078847611824,[],llm_trustworthy_241001.csv gpt_4_0314,Trustworthy Average,0.5200909780136467,[],llm_trustworthy_241001.csv gpt_4o_2024_05_13,Trustworthy Average,0.0,[],llm_trustworthy_241001.csv gpt_4o_mini_2024_07_18,Trustworthy Average,0.2520849128127366,[],llm_trustworthy_241001.csv falcon_7b_instruct,Trustworthy Average,0.8896891584533736,[],llm_trustworthy_241001.csv redpajama_incite_7b_instruct,Trustworthy Average,1.0,[],llm_trustworthy_241001.csv gemini_pro_1_0,Trustworthy Average,0.0890826383623956,[],llm_trustworthy_241001.csv 
gemma_2b_it,trustworthy_non_toxicity,0.13716038562664334,[],llm_trustworthy_241001.csv gemma_7b_it,trustworthy_non_toxicity,0.15980134385042355,[],llm_trustworthy_241001.csv vicuna_7b_v1_3,trustworthy_non_toxicity,0.8539293017820625,[],llm_trustworthy_241001.csv llama_2_7b_chat,trustworthy_non_toxicity,0.09436167104878757,[],llm_trustworthy_241001.csv llama3_8b_instruct,trustworthy_non_toxicity,0.13044113350861808,[],llm_trustworthy_241001.csv mpt_7b_chat,trustworthy_non_toxicity,0.6786444639205376,[],llm_trustworthy_241001.csv gpt_3_5_turbo_0301,trustworthy_non_toxicity,0.5763949751679813,[],llm_trustworthy_241001.csv gpt_4_0314,trustworthy_non_toxicity,0.6640373940987437,[],llm_trustworthy_241001.csv gpt_4o_2024_05_13,trustworthy_non_toxicity,0.0,[],llm_trustworthy_241001.csv gpt_4o_mini_2024_07_18,trustworthy_non_toxicity,0.4008179959100203,[],llm_trustworthy_241001.csv falcon_7b_instruct,trustworthy_non_toxicity,0.6932515337423313,[],llm_trustworthy_241001.csv redpajama_incite_7b_instruct,trustworthy_non_toxicity,1.0,[],llm_trustworthy_241001.csv gemini_pro_1_0,trustworthy_non_toxicity,0.13044113350861808,[],llm_trustworthy_241001.csv gemma_2b_it,trustworthy_non_stereotype,0.9877777777777779,[],llm_trustworthy_241001.csv gemma_7b_it,trustworthy_non_stereotype,0.0,[],llm_trustworthy_241001.csv vicuna_7b_v1_3,trustworthy_non_stereotype,0.7037037037037037,[],llm_trustworthy_241001.csv llama_2_7b_chat,trustworthy_non_stereotype,0.08888888888888913,[],llm_trustworthy_241001.csv llama3_8b_instruct,trustworthy_non_stereotype,0.061851851851851936,[],llm_trustworthy_241001.csv mpt_7b_chat,trustworthy_non_stereotype,0.5703703703703706,[],llm_trustworthy_241001.csv gpt_3_5_turbo_0301,trustworthy_non_stereotype,0.4814814814814815,[],llm_trustworthy_241001.csv gpt_4_0314,trustworthy_non_stereotype,0.8518518518518519,[],llm_trustworthy_241001.csv gpt_4o_2024_05_13,trustworthy_non_stereotype,0.012222222222222134,[],llm_trustworthy_241001.csv gpt_4o_mini_2024_07_18,trustworthy_non_stereotype,0.4688888888888888,[],llm_trustworthy_241001.csv falcon_7b_instruct,trustworthy_non_stereotype,0.4814814814814815,[],llm_trustworthy_241001.csv redpajama_incite_7b_instruct,trustworthy_non_stereotype,1.0,[],llm_trustworthy_241001.csv gemini_pro_1_0,trustworthy_non_stereotype,0.061851851851851936,[],llm_trustworthy_241001.csv gemma_2b_it,trustworthy_advglue_pp,1.0,[],llm_trustworthy_241001.csv gemma_7b_it,trustworthy_advglue_pp,0.9908599916909016,[],llm_trustworthy_241001.csv vicuna_7b_v1_3,trustworthy_advglue_pp,0.6281678437889491,[],llm_trustworthy_241001.csv llama_2_7b_chat,trustworthy_advglue_pp,0.6759451599501456,[],llm_trustworthy_241001.csv llama3_8b_instruct,trustworthy_advglue_pp,0.0,[],llm_trustworthy_241001.csv mpt_7b_chat,trustworthy_advglue_pp,0.875778977980889,[],llm_trustworthy_241001.csv gpt_3_5_turbo_0301,trustworthy_advglue_pp,0.43996676360614884,[],llm_trustworthy_241001.csv gpt_4_0314,trustworthy_advglue_pp,0.13460739509763164,[],llm_trustworthy_241001.csv gpt_4o_2024_05_13,trustworthy_advglue_pp,0.661404237640216,[],llm_trustworthy_241001.csv gpt_4o_mini_2024_07_18,trustworthy_advglue_pp,0.7075197341088493,[],llm_trustworthy_241001.csv falcon_7b_instruct,trustworthy_advglue_pp,0.9680099709181555,[],llm_trustworthy_241001.csv redpajama_incite_7b_instruct,trustworthy_advglue_pp,0.9335272122974657,[],llm_trustworthy_241001.csv gemini_pro_1_0,trustworthy_advglue_pp,0.0,[],llm_trustworthy_241001.csv gemma_2b_it,trustworthy_ood,1.0,[],llm_trustworthy_241001.csv 
gemma_7b_it,trustworthy_ood,0.713455149501661,[],llm_trustworthy_241001.csv vicuna_7b_v1_3,trustworthy_ood,0.7876522702104096,[],llm_trustworthy_241001.csv llama_2_7b_chat,trustworthy_ood,0.3294573643410851,[],llm_trustworthy_241001.csv llama3_8b_instruct,trustworthy_ood,0.4623477297895904,[],llm_trustworthy_241001.csv mpt_7b_chat,trustworthy_ood,0.6447951273532667,[],llm_trustworthy_241001.csv gpt_3_5_turbo_0301,trustworthy_ood,0.38676633444075303,[],llm_trustworthy_241001.csv gpt_4_0314,trustworthy_ood,0.0,[],llm_trustworthy_241001.csv gpt_4o_2024_05_13,trustworthy_ood,0.026578073089700838,[],llm_trustworthy_241001.csv gpt_4o_mini_2024_07_18,trustworthy_ood,0.2347729789590256,[],llm_trustworthy_241001.csv falcon_7b_instruct,trustworthy_ood,0.9994462901439645,[],llm_trustworthy_241001.csv redpajama_incite_7b_instruct,trustworthy_ood,0.9230343300110742,[],llm_trustworthy_241001.csv gemini_pro_1_0,trustworthy_ood,0.4623477297895904,[],llm_trustworthy_241001.csv gemma_2b_it,trustworthy_adv_demo,0.9597534445250181,[],llm_trustworthy_241001.csv gemma_7b_it,trustworthy_adv_demo,1.0,[],llm_trustworthy_241001.csv vicuna_7b_v1_3,trustworthy_adv_demo,0.552936910804931,[],llm_trustworthy_241001.csv llama_2_7b_chat,trustworthy_adv_demo,0.5973531544597535,[],llm_trustworthy_241001.csv llama3_8b_instruct,trustworthy_adv_demo,0.23477157360406076,[],llm_trustworthy_241001.csv mpt_7b_chat,trustworthy_adv_demo,0.548223350253807,[],llm_trustworthy_241001.csv gpt_3_5_turbo_0301,trustworthy_adv_demo,0.13071065989847708,[],llm_trustworthy_241001.csv gpt_4_0314,trustworthy_adv_demo,0.19126178390137782,[],llm_trustworthy_241001.csv gpt_4o_2024_05_13,trustworthy_adv_demo,0.007070340826686006,[],llm_trustworthy_241001.csv gpt_4o_mini_2024_07_18,trustworthy_adv_demo,0.0,[],llm_trustworthy_241001.csv falcon_7b_instruct,trustworthy_adv_demo,0.9887599709934735,[],llm_trustworthy_241001.csv redpajama_incite_7b_instruct,trustworthy_adv_demo,0.543509789702683,[],llm_trustworthy_241001.csv gemini_pro_1_0,trustworthy_adv_demo,0.23477157360406076,[],llm_trustworthy_241001.csv gemma_2b_it,trustworthy_privacy,0.2755754475703326,[],llm_trustworthy_241001.csv gemma_7b_it,trustworthy_privacy,0.4379795396419438,[],llm_trustworthy_241001.csv vicuna_7b_v1_3,trustworthy_privacy,0.7810102301790283,[],llm_trustworthy_241001.csv llama_2_7b_chat,trustworthy_privacy,0.0,[],llm_trustworthy_241001.csv llama3_8b_instruct,trustworthy_privacy,0.5051150895140664,[],llm_trustworthy_241001.csv mpt_7b_chat,trustworthy_privacy,0.5901534526854217,[],llm_trustworthy_241001.csv gpt_3_5_turbo_0301,trustworthy_privacy,0.8714833759590794,[],llm_trustworthy_241001.csv gpt_4_0314,trustworthy_privacy,1.0,[],llm_trustworthy_241001.csv gpt_4o_2024_05_13,trustworthy_privacy,0.011189258312020334,[],llm_trustworthy_241001.csv gpt_4o_mini_2024_07_18,trustworthy_privacy,0.2560741687979541,[],llm_trustworthy_241001.csv falcon_7b_instruct,trustworthy_privacy,0.8673273657289,[],llm_trustworthy_241001.csv redpajama_incite_7b_instruct,trustworthy_privacy,0.6633631713554987,[],llm_trustworthy_241001.csv gemini_pro_1_0,trustworthy_privacy,0.5051150895140664,[],llm_trustworthy_241001.csv gemma_2b_it,trustworthy_ethics,0.2766523732071565,[],llm_trustworthy_241001.csv gemma_7b_it,trustworthy_ethics,0.7453792695549313,[],llm_trustworthy_241001.csv vicuna_7b_v1_3,trustworthy_ethics,0.6730740795504953,[],llm_trustworthy_241001.csv llama_2_7b_chat,trustworthy_ethics,0.7860416974715363,[],llm_trustworthy_241001.csv 
llama3_8b_instruct,trustworthy_ethics,0.0,[],llm_trustworthy_241001.csv mpt_7b_chat,trustworthy_ethics,1.0,[],llm_trustworthy_241001.csv gpt_3_5_turbo_0301,trustworthy_ethics,0.10882744344225936,[],llm_trustworthy_241001.csv gpt_4_0314,trustworthy_ethics,0.2534378234511312,[],llm_trustworthy_241001.csv gpt_4o_2024_05_13,trustworthy_ethics,0.025432500369658384,[],llm_trustworthy_241001.csv gpt_4o_mini_2024_07_18,trustworthy_ethics,0.09670264675439877,[],llm_trustworthy_241001.csv falcon_7b_instruct,trustworthy_ethics,0.642614224456602,[],llm_trustworthy_241001.csv redpajama_incite_7b_instruct,trustworthy_ethics,0.9795948543545764,[],llm_trustworthy_241001.csv gemini_pro_1_0,trustworthy_ethics,0.0,[],llm_trustworthy_241001.csv gemma_2b_it,trustworthy_fairness,0.18598454569677603,[],llm_trustworthy_241001.csv gemma_7b_it,trustworthy_fairness,0.1630695443645085,[],llm_trustworthy_241001.csv vicuna_7b_v1_3,trustworthy_fairness,0.38555822009059415,[],llm_trustworthy_241001.csv llama_2_7b_chat,trustworthy_fairness,0.0,[],llm_trustworthy_241001.csv llama3_8b_instruct,trustworthy_fairness,0.5315747402078338,[],llm_trustworthy_241001.csv mpt_7b_chat,trustworthy_fairness,0.0,[],llm_trustworthy_241001.csv gpt_3_5_turbo_0301,trustworthy_fairness,0.5976552091660007,[],llm_trustworthy_241001.csv gpt_4_0314,trustworthy_fairness,0.9680255795363708,[],llm_trustworthy_241001.csv gpt_4o_2024_05_13,trustworthy_fairness,1.0,[],llm_trustworthy_241001.csv gpt_4o_mini_2024_07_18,trustworthy_fairness,0.8062883026911805,[],llm_trustworthy_241001.csv falcon_7b_instruct,trustworthy_fairness,0.0,[],llm_trustworthy_241001.csv redpajama_incite_7b_instruct,trustworthy_fairness,0.0,[],llm_trustworthy_241001.csv gemini_pro_1_0,trustworthy_fairness,0.5315747402078338,[],llm_trustworthy_241001.csv gpt_4o_20240513,OpenCompass Academic,77.0,[],opencompass_academic_240829.csv qwen2_72b_instruct,OpenCompass Academic,73.1,[],opencompass_academic_240829.csv gpt_4o_mini_20240718,OpenCompass Academic,72.5,[],opencompass_academic_240829.csv llama3_70b_instruct,OpenCompass Academic,66.6,[],opencompass_academic_240829.csv qwen1_5_110b_chat,OpenCompass Academic,61.7,[],opencompass_academic_240829.csv yi_1_5_34b_chat,OpenCompass Academic,60.4,[],opencompass_academic_240829.csv internlm2_5_chat_7b,OpenCompass Academic,60.3,[],opencompass_academic_240829.csv glm_4_9b_chat,OpenCompass Academic,59.5,[],opencompass_academic_240829.csv qwen1_5_32b_chat,OpenCompass Academic,57.1,[],opencompass_academic_240829.csv qwen1_5_72b_chat,OpenCompass Academic,56.9,[],opencompass_academic_240829.csv yi_1_5_9b_chat,OpenCompass Academic,56.1,[],opencompass_academic_240829.csv qwen2_7b_instruct,OpenCompass Academic,52.0,[],opencompass_academic_240829.csv llama3_8b_instruct,OpenCompass Academic,50.6,[],opencompass_academic_240829.csv qwen1_5_14b_chat,OpenCompass Academic,49.7,[],opencompass_academic_240829.csv internlm2_chat_20b,OpenCompass Academic,45.2,[],opencompass_academic_240829.csv yi_1_5_6b_chat,OpenCompass Academic,43.5,[],opencompass_academic_240829.csv mixtral_8x7b_instruct_v0_1,OpenCompass Academic,42.6,[],opencompass_academic_240829.csv internlm2_chat_7b,OpenCompass Academic,42.1,[],opencompass_academic_240829.csv qwen1_5_7b_chat,OpenCompass Academic,35.4,[],opencompass_academic_240829.csv mistral_7b_instruct_v0_3,OpenCompass Academic,31.2,[],opencompass_academic_240829.csv gpt_4o_20240513,OpenCompass MMLU,88.0,[],opencompass_academic_240829.csv qwen2_72b_instruct,OpenCompass MMLU,83.1,[],opencompass_academic_240829.csv 
gpt_4o_mini_20240718,OpenCompass MMLU,82.9,[],opencompass_academic_240829.csv llama3_70b_instruct,OpenCompass MMLU,80.7,[],opencompass_academic_240829.csv qwen1_5_110b_chat,OpenCompass MMLU,74.0,[],opencompass_academic_240829.csv yi_1_5_34b_chat,OpenCompass MMLU,71.3,[],opencompass_academic_240829.csv internlm2_5_chat_7b,OpenCompass MMLU,70.6,[],opencompass_academic_240829.csv glm_4_9b_chat,OpenCompass MMLU,72.9,[],opencompass_academic_240829.csv qwen1_5_32b_chat,OpenCompass MMLU,72.5,[],opencompass_academic_240829.csv qwen1_5_72b_chat,OpenCompass MMLU,70.9,[],opencompass_academic_240829.csv yi_1_5_9b_chat,OpenCompass MMLU,67.8,[],opencompass_academic_240829.csv qwen2_7b_instruct,OpenCompass MMLU,51.1,[],opencompass_academic_240829.csv llama3_8b_instruct,OpenCompass MMLU,66.7,[],opencompass_academic_240829.csv qwen1_5_14b_chat,OpenCompass MMLU,67.0,[],opencompass_academic_240829.csv internlm2_chat_20b,OpenCompass MMLU,55.8,[],opencompass_academic_240829.csv yi_1_5_6b_chat,OpenCompass MMLU,48.4,[],opencompass_academic_240829.csv mixtral_8x7b_instruct_v0_1,OpenCompass MMLU,67.2,[],opencompass_academic_240829.csv internlm2_chat_7b,OpenCompass MMLU,58.8,[],opencompass_academic_240829.csv qwen1_5_7b_chat,OpenCompass MMLU,41.9,[],opencompass_academic_240829.csv mistral_7b_instruct_v0_3,OpenCompass MMLU,30.9,[],opencompass_academic_240829.csv gpt_4o_20240513,OpenCompass MMLU Pro,73.8,[],opencompass_academic_240829.csv qwen2_72b_instruct,OpenCompass MMLU Pro,65.1,[],opencompass_academic_240829.csv gpt_4o_mini_20240718,OpenCompass MMLU Pro,63.2,[],opencompass_academic_240829.csv llama3_70b_instruct,OpenCompass MMLU Pro,61.8,[],opencompass_academic_240829.csv qwen1_5_110b_chat,OpenCompass MMLU Pro,51.8,[],opencompass_academic_240829.csv yi_1_5_34b_chat,OpenCompass MMLU Pro,50.9,[],opencompass_academic_240829.csv internlm2_5_chat_7b,OpenCompass MMLU Pro,44.9,[],opencompass_academic_240829.csv glm_4_9b_chat,OpenCompass MMLU Pro,48.3,[],opencompass_academic_240829.csv qwen1_5_32b_chat,OpenCompass MMLU Pro,49.8,[],opencompass_academic_240829.csv qwen1_5_72b_chat,OpenCompass MMLU Pro,47.1,[],opencompass_academic_240829.csv yi_1_5_9b_chat,OpenCompass MMLU Pro,45.9,[],opencompass_academic_240829.csv qwen2_7b_instruct,OpenCompass MMLU Pro,38.8,[],opencompass_academic_240829.csv llama3_8b_instruct,OpenCompass MMLU Pro,42.3,[],opencompass_academic_240829.csv qwen1_5_14b_chat,OpenCompass MMLU Pro,40.5,[],opencompass_academic_240829.csv internlm2_chat_20b,OpenCompass MMLU Pro,36.3,[],opencompass_academic_240829.csv yi_1_5_6b_chat,OpenCompass MMLU Pro,30.1,[],opencompass_academic_240829.csv mixtral_8x7b_instruct_v0_1,OpenCompass MMLU Pro,42.7,[],opencompass_academic_240829.csv internlm2_chat_7b,OpenCompass MMLU Pro,32.4,[],opencompass_academic_240829.csv qwen1_5_7b_chat,OpenCompass MMLU Pro,25.6,[],opencompass_academic_240829.csv mistral_7b_instruct_v0_3,OpenCompass MMLU Pro,22.1,[],opencompass_academic_240829.csv gpt_4o_20240513,OpenCompass CMMLU,78.3,[],opencompass_academic_240829.csv qwen2_72b_instruct,OpenCompass CMMLU,79.8,[],opencompass_academic_240829.csv gpt_4o_mini_20240718,OpenCompass CMMLU,65.6,[],opencompass_academic_240829.csv llama3_70b_instruct,OpenCompass CMMLU,66.2,[],opencompass_academic_240829.csv qwen1_5_110b_chat,OpenCompass CMMLU,79.4,[],opencompass_academic_240829.csv yi_1_5_34b_chat,OpenCompass CMMLU,63.4,[],opencompass_academic_240829.csv internlm2_5_chat_7b,OpenCompass CMMLU,73.8,[],opencompass_academic_240829.csv glm_4_9b_chat,OpenCompass CMMLU,71.6,[],opencompass_academic_240829.csv 
qwen1_5_32b_chat,OpenCompass CMMLU,76.3,[],opencompass_academic_240829.csv qwen1_5_72b_chat,OpenCompass CMMLU,67.8,[],opencompass_academic_240829.csv yi_1_5_9b_chat,OpenCompass CMMLU,65.0,[],opencompass_academic_240829.csv qwen2_7b_instruct,OpenCompass CMMLU,59.3,[],opencompass_academic_240829.csv llama3_8b_instruct,OpenCompass CMMLU,51.5,[],opencompass_academic_240829.csv qwen1_5_14b_chat,OpenCompass CMMLU,73.3,[],opencompass_academic_240829.csv internlm2_chat_20b,OpenCompass CMMLU,44.7,[],opencompass_academic_240829.csv yi_1_5_6b_chat,OpenCompass CMMLU,53.9,[],opencompass_academic_240829.csv mixtral_8x7b_instruct_v0_1,OpenCompass CMMLU,33.9,[],opencompass_academic_240829.csv internlm2_chat_7b,OpenCompass CMMLU,47.8,[],opencompass_academic_240829.csv qwen1_5_7b_chat,OpenCompass CMMLU,42.3,[],opencompass_academic_240829.csv mistral_7b_instruct_v0_3,OpenCompass CMMLU,35.5,[],opencompass_academic_240829.csv gpt_4o_20240513,OpenCompass BBH,87.6,[],opencompass_academic_240829.csv qwen2_72b_instruct,OpenCompass BBH,85.2,[],opencompass_academic_240829.csv gpt_4o_mini_20240718,OpenCompass BBH,81.9,[],opencompass_academic_240829.csv llama3_70b_instruct,OpenCompass BBH,83.2,[],opencompass_academic_240829.csv qwen1_5_110b_chat,OpenCompass BBH,74.2,[],opencompass_academic_240829.csv yi_1_5_34b_chat,OpenCompass BBH,73.8,[],opencompass_academic_240829.csv internlm2_5_chat_7b,OpenCompass BBH,74.5,[],opencompass_academic_240829.csv glm_4_9b_chat,OpenCompass BBH,60.6,[],opencompass_academic_240829.csv qwen1_5_32b_chat,OpenCompass BBH,68.2,[],opencompass_academic_240829.csv qwen1_5_72b_chat,OpenCompass BBH,72.8,[],opencompass_academic_240829.csv yi_1_5_9b_chat,OpenCompass BBH,67.9,[],opencompass_academic_240829.csv qwen2_7b_instruct,OpenCompass BBH,65.4,[],opencompass_academic_240829.csv llama3_8b_instruct,OpenCompass BBH,54.4,[],opencompass_academic_240829.csv qwen1_5_14b_chat,OpenCompass BBH,58.3,[],opencompass_academic_240829.csv internlm2_chat_20b,OpenCompass BBH,65.6,[],opencompass_academic_240829.csv yi_1_5_6b_chat,OpenCompass BBH,56.5,[],opencompass_academic_240829.csv mixtral_8x7b_instruct_v0_1,OpenCompass BBH,55.7,[],opencompass_academic_240829.csv internlm2_chat_7b,OpenCompass BBH,60.3,[],opencompass_academic_240829.csv qwen1_5_7b_chat,OpenCompass BBH,41.0,[],opencompass_academic_240829.csv gpt_4o_20240513,OpenCompass GPQA-Diamond,49.5,[],opencompass_academic_240829.csv qwen2_72b_instruct,OpenCompass GPQA-Diamond,42.9,[],opencompass_academic_240829.csv gpt_4o_mini_20240718,OpenCompass GPQA-Diamond,47.5,[],opencompass_academic_240829.csv llama3_70b_instruct,OpenCompass GPQA-Diamond,39.4,[],opencompass_academic_240829.csv qwen1_5_110b_chat,OpenCompass GPQA-Diamond,28.3,[],opencompass_academic_240829.csv yi_1_5_34b_chat,OpenCompass GPQA-Diamond,32.8,[],opencompass_academic_240829.csv internlm2_5_chat_7b,OpenCompass GPQA-Diamond,29.3,[],opencompass_academic_240829.csv glm_4_9b_chat,OpenCompass GPQA-Diamond,26.8,[],opencompass_academic_240829.csv qwen1_5_32b_chat,OpenCompass GPQA-Diamond,31.3,[],opencompass_academic_240829.csv qwen1_5_72b_chat,OpenCompass GPQA-Diamond,28.3,[],opencompass_academic_240829.csv yi_1_5_9b_chat,OpenCompass GPQA-Diamond,25.2,[],opencompass_academic_240829.csv qwen2_7b_instruct,OpenCompass GPQA-Diamond,25.8,[],opencompass_academic_240829.csv llama3_8b_instruct,OpenCompass GPQA-Diamond,33.8,[],opencompass_academic_240829.csv qwen1_5_14b_chat,OpenCompass GPQA-Diamond,26.3,[],opencompass_academic_240829.csv internlm2_chat_20b,OpenCompass GQPA-Dimand,21.7,[],opencompass_academic_240829.csv
yi_1_5_6b_chat,OpenCompass GPQA-Diamond,23.2,[],opencompass_academic_240829.csv mixtral_8x7b_instruct_v0_1,OpenCompass GPQA-Diamond,29.3,[],opencompass_academic_240829.csv internlm2_chat_7b,OpenCompass GPQA-Diamond,26.3,[],opencompass_academic_240829.csv qwen1_5_7b_chat,OpenCompass GPQA-Diamond,21.2,[],opencompass_academic_240829.csv gpt_4o_20240513,OpenCompass HumanEval,86.0,[],opencompass_academic_240829.csv qwen2_72b_instruct,OpenCompass HumanEval,84.2,[],opencompass_academic_240829.csv gpt_4o_mini_20240718,OpenCompass HumanEval,87.8,[],opencompass_academic_240829.csv llama3_70b_instruct,OpenCompass HumanEval,76.2,[],opencompass_academic_240829.csv qwen1_5_110b_chat,OpenCompass HumanEval,77.4,[],opencompass_academic_240829.csv yi_1_5_34b_chat,OpenCompass HumanEval,77.4,[],opencompass_academic_240829.csv internlm2_5_chat_7b,OpenCompass HumanEval,73.2,[],opencompass_academic_240829.csv glm_4_9b_chat,OpenCompass HumanEval,75.6,[],opencompass_academic_240829.csv qwen1_5_32b_chat,OpenCompass HumanEval,67.7,[],opencompass_academic_240829.csv qwen1_5_72b_chat,OpenCompass HumanEval,67.7,[],opencompass_academic_240829.csv yi_1_5_9b_chat,OpenCompass HumanEval,68.9,[],opencompass_academic_240829.csv qwen2_7b_instruct,OpenCompass HumanEval,76.8,[],opencompass_academic_240829.csv llama3_8b_instruct,OpenCompass HumanEval,59.8,[],opencompass_academic_240829.csv qwen1_5_14b_chat,OpenCompass HumanEval,60.4,[],opencompass_academic_240829.csv internlm2_chat_20b,OpenCompass HumanEval,67.7,[],opencompass_academic_240829.csv yi_1_5_6b_chat,OpenCompass HumanEval,45.7,[],opencompass_academic_240829.csv mixtral_8x7b_instruct_v0_1,OpenCompass HumanEval,34.8,[],opencompass_academic_240829.csv internlm2_chat_7b,OpenCompass HumanEval,50.6,[],opencompass_academic_240829.csv qwen1_5_7b_chat,OpenCompass HumanEval,50.0,[],opencompass_academic_240829.csv gpt_4o_20240513,OpenCompass IFEval,79.0,[],opencompass_academic_240829.csv qwen2_72b_instruct,OpenCompass IFEval,76.5,[],opencompass_academic_240829.csv gpt_4o_mini_20240718,OpenCompass IFEval,81.0,[],opencompass_academic_240829.csv llama3_70b_instruct,OpenCompass IFEval,77.5,[],opencompass_academic_240829.csv qwen1_5_110b_chat,OpenCompass IFEval,54.3,[],opencompass_academic_240829.csv yi_1_5_34b_chat,OpenCompass IFEval,59.5,[],opencompass_academic_240829.csv internlm2_5_chat_7b,OpenCompass IFEval,54.5,[],opencompass_academic_240829.csv glm_4_9b_chat,OpenCompass IFEval,69.1,[],opencompass_academic_240829.csv qwen1_5_32b_chat,OpenCompass IFEval,48.4,[],opencompass_academic_240829.csv qwen1_5_72b_chat,OpenCompass IFEval,53.8,[],opencompass_academic_240829.csv yi_1_5_9b_chat,OpenCompass IFEval,56.8,[],opencompass_academic_240829.csv qwen2_7b_instruct,OpenCompass IFEval,49.7,[],opencompass_academic_240829.csv llama3_8b_instruct,OpenCompass IFEval,68.4,[],opencompass_academic_240829.csv qwen1_5_14b_chat,OpenCompass IFEval,42.0,[],opencompass_academic_240829.csv internlm2_chat_20b,OpenCompass IFEval,35.5,[],opencompass_academic_240829.csv yi_1_5_6b_chat,OpenCompass IFEval,47.3,[],opencompass_academic_240829.csv mixtral_8x7b_instruct_v0_1,OpenCompass IFEval,50.8,[],opencompass_academic_240829.csv internlm2_chat_7b,OpenCompass IFEval,32.4,[],opencompass_academic_240829.csv qwen1_5_7b_chat,OpenCompass IFEval,38.6,[],opencompass_academic_240829.csv claude_3_5_sonnet_20240620,Helm MMLU,0.865,[],helm_mmlu_240829.csv claude_3_opus_20240229,Helm MMLU,0.846,[],helm_mmlu_240829.csv llama3_1_instruct_turbo_405b,Helm MMLU,0.845,[],helm_mmlu_240829.csv gpt_4o_2024_05_13,Helm
MMLU,0.842,[],helm_mmlu_240829.csv gemini_1_5_pro_001,Helm MMLU,0.827,[],helm_mmlu_240829.csv gpt_4_0613,Helm MMLU,0.824,[],helm_mmlu_240829.csv qwen2_instruct_72b,Helm MMLU,0.824,[],helm_mmlu_240829.csv gpt_4_turbo_2024_04_09,Helm MMLU,0.813,[],helm_mmlu_240829.csv gemini_1_5_pro_0409_preview,Helm MMLU,0.81,[],helm_mmlu_240829.csv llama3_1_instruct_turbo_70b,Helm MMLU,0.801,[],helm_mmlu_240829.csv mistral_large_2_2407,Helm MMLU,0.8,[],helm_mmlu_240829.csv gpt_4_turbo_1106_preview,Helm MMLU,0.796,[],helm_mmlu_240829.csv llama3_70b,Helm MMLU,0.793,[],helm_mmlu_240829.csv yi_large_preview,Helm MMLU,0.793,[],helm_mmlu_240829.csv palmyra_x_v3_72b,Helm MMLU,0.786,[],helm_mmlu_240829.csv palm_2_unicorn,Helm MMLU,0.786,[],helm_mmlu_240829.csv gemini_1_5_flash_001,Helm MMLU,0.779,[],helm_mmlu_240829.csv mixtral_8x22b,Helm MMLU,0.778,[],helm_mmlu_240829.csv gemini_1_5_flash_0514_preview,Helm MMLU,0.778,[],helm_mmlu_240829.csv phi_3_14b,Helm MMLU,0.775,[],helm_mmlu_240829.csv qwen1_5_72b,Helm MMLU,0.774,[],helm_mmlu_240829.csv qwen1_5_chat_110b,Helm MMLU,0.768,[],helm_mmlu_240829.csv gpt_4o_mini_2024_07_18,Helm MMLU,0.767,[],helm_mmlu_240829.csv yi_34b,Helm MMLU,0.762,[],helm_mmlu_240829.csv claude_3_sonnet_20240229,Helm MMLU,0.759,[],helm_mmlu_240829.csv gemma_2_27b,Helm MMLU,0.757,[],helm_mmlu_240829.csv phi_3_7b,Helm MMLU,0.757,[],helm_mmlu_240829.csv qwen1_5_32b,Helm MMLU,0.744,[],helm_mmlu_240829.csv dbrx_instruct,Helm MMLU,0.741,[],helm_mmlu_240829.csv claude_3_haiku_20240307,Helm MMLU,0.738,[],helm_mmlu_240829.csv claude_2_1,Helm MMLU,0.735,[],helm_mmlu_240829.csv deepseek_llm_chat_67b,Helm MMLU,0.725,[],helm_mmlu_240829.csv gemma_2_9b,Helm MMLU,0.721,[],helm_mmlu_240829.csv mixtral_8x7b_32k_seqlen,Helm MMLU,0.717,[],helm_mmlu_240829.csv gemini_1_0_pro_001,Helm MMLU,0.7,[],helm_mmlu_240829.csv llama_2_70b,Helm MMLU,0.695,[],helm_mmlu_240829.csv command_r_plus,Helm MMLU,0.694,[],helm_mmlu_240829.csv palm_2_bison,Helm MMLU,0.692,[],helm_mmlu_240829.csv gpt_3_5_turbo_0613,Helm MMLU,0.689,[],helm_mmlu_240829.csv claude_instant_1_2,Helm MMLU,0.688,[],helm_mmlu_240829.csv mistral_large_2402,Helm MMLU,0.688,[],helm_mmlu_240829.csv mistral_small_2402,Helm MMLU,0.687,[],helm_mmlu_240829.csv qwen1_5_14b,Helm MMLU,0.686,[],helm_mmlu_240829.csv arctic_instruct,Helm MMLU,0.677,[],helm_mmlu_240829.csv llama3_8b,Helm MMLU,0.668,[],helm_mmlu_240829.csv gemma_7b,Helm MMLU,0.661,[],helm_mmlu_240829.csv jamba_instruct,Helm MMLU,0.659,[],helm_mmlu_240829.csv mistral_nemo_2402,Helm MMLU,0.653,[],helm_mmlu_240829.csv command_r,Helm MMLU,0.652,[],helm_mmlu_240829.csv yi_6b,Helm MMLU,0.64,[],helm_mmlu_240829.csv qwen1_5_7b,Helm MMLU,0.626,[],helm_mmlu_240829.csv mistral_instruct_v0_3_7b,Helm MMLU,0.599,[],helm_mmlu_240829.csv phi_2,Helm MMLU,0.584,[],helm_mmlu_240829.csv mistral_v0_1_7b,Helm MMLU,0.566,[],helm_mmlu_240829.csv llama3_1_instruct_turbo_8b,Helm MMLU,0.561,[],helm_mmlu_240829.csv llama_2_13b,Helm MMLU,0.554,[],helm_mmlu_240829.csv olmo_1_7_7b,Helm MMLU,0.538,[],helm_mmlu_240829.csv llama_2_7b,Helm MMLU,0.458,[],helm_mmlu_240829.csv olmo_7b,Helm MMLU,0.295,[],helm_mmlu_240829.csv llama_2_70b,Helm Classic,0.944,[],helm_classic_240829.csv llama_65b,Helm Classic,0.908,[],helm_classic_240829.csv text_davinci_002,Helm Classic,0.905,[],helm_classic_240829.csv mistral_v0_1_7b,Helm Classic,0.884,[],helm_classic_240829.csv cohere_command_beta_52_4b,Helm Classic,0.874,[],helm_classic_240829.csv text_davinci_003,Helm Classic,0.872,[],helm_classic_240829.csv jurassic_2_jumbo_178b,Helm
Classic,0.824,[],helm_classic_240829.csv llama_2_13b,Helm Classic,0.823,[],helm_classic_240829.csv tnlg_v2_530b,Helm Classic,0.787,[],helm_classic_240829.csv gpt_3_5_turbo_0613,Helm Classic,0.783,[],helm_classic_240829.csv llama30b,Helm Classic,0.781,[],helm_classic_240829.csv anthropic_lm_v4_s3_52b,Helm Classic,0.78,[],helm_classic_240829.csv gpt_3_5_turbo_0301,Helm Classic,0.76,[],helm_classic_240829.csv jurassic_2_grande_17b,Helm Classic,0.743,[],helm_classic_240829.csv palmyra_x_43b,Helm Classic,0.732,[],helm_classic_240829.csv falcon_40b,Helm Classic,0.729,[],helm_classic_240829.csv falcon_instruct_40b,Helm Classic,0.727,[],helm_classic_240829.csv mpt_instruct_30b,Helm Classic,0.716,[],helm_classic_240829.csv mpt_30b,Helm Classic,0.714,[],helm_classic_240829.csv j1_grande_v2_beta_17b,Helm Classic,0.706,[],helm_classic_240829.csv vicuna_v1_3_13b,Helm Classic,0.706,[],helm_classic_240829.csv cohere_command_beta_6_1b,Helm Classic,0.675,[],helm_classic_240829.csv cohere_xlarge_v20221108_52_4b,Helm Classic,0.664,[],helm_classic_240829.csv luminous_supreme_70b,Helm Classic,0.662,[],helm_classic_240829.csv vicuna_v1_3_7b,Helm Classic,0.625,[],helm_classic_240829.csv opt_175b,Helm Classic,0.609,[],helm_classic_240829.csv llama_2_7b,Helm Classic,0.607,[],helm_classic_240829.csv llama_13b,Helm Classic,0.595,[],helm_classic_240829.csv instructpalmyra_30b,Helm Classic,0.568,[],helm_classic_240829.csv cohere_xlarge_v20220609_52_4b,Helm Classic,0.56,[],helm_classic_240829.csv jurassic_2_large_7_5b,Helm Classic,0.553,[],helm_classic_240829.csv davinci_175b,Helm Classic,0.538,[],helm_classic_240829.csv llama_7b,Helm Classic,0.533,[],helm_classic_240829.csv redpajama_incite_instruct_7b,Helm Classic,0.524,[],helm_classic_240829.csv j1_jumbo_v1_178b,Helm Classic,0.517,[],helm_classic_240829.csv glm_130b,Helm Classic,0.512,[],helm_classic_240829.csv luminous_extended_30b,Helm Classic,0.485,[],helm_classic_240829.csv opt_66b,Helm Classic,0.448,[],helm_classic_240829.csv bloom_176b,Helm Classic,0.446,[],helm_classic_240829.csv j1_grande_v1_17b,Helm Classic,0.433,[],helm_classic_240829.csv alpaca_7b,Helm Classic,0.381,[],helm_classic_240829.csv falcon_7b,Helm Classic,0.378,[],helm_classic_240829.csv redpajama_incite_base_7b,Helm Classic,0.378,[],helm_classic_240829.csv cohere_large_v20220720_13_1b,Helm Classic,0.372,[],helm_classic_240829.csv redpajama_incite_instruct_v1_3b,Helm Classic,0.366,[],helm_classic_240829.csv text_curie_001,Helm Classic,0.36,[],helm_classic_240829.csv gpt_neox_20b,Helm Classic,0.351,[],helm_classic_240829.csv luminous_base_13b,Helm Classic,0.315,[],helm_classic_240829.csv cohere_medium_v20221108_6_1b,Helm Classic,0.312,[],helm_classic_240829.csv redpajama_incite_base_v1_3b,Helm Classic,0.311,[],helm_classic_240829.csv tnlg_v2_6_7b,Helm Classic,0.309,[],helm_classic_240829.csv j1_large_v1_7_5b,Helm Classic,0.285,[],helm_classic_240829.csv gpt_j_6b,Helm Classic,0.273,[],helm_classic_240829.csv pythia_12b,Helm Classic,0.257,[],helm_classic_240829.csv curie_6_7b,Helm Classic,0.247,[],helm_classic_240829.csv falcon_instruct_7b,Helm Classic,0.244,[],helm_classic_240829.csv cohere_medium_v20220720_6_1b,Helm Classic,0.23,[],helm_classic_240829.csv text_babbage_001,Helm Classic,0.229,[],helm_classic_240829.csv t0pp_11b,Helm Classic,0.197,[],helm_classic_240829.csv pythia_6_9b,Helm Classic,0.196,[],helm_classic_240829.csv flan-ul2_20b,Helm Classic,0.167,[],helm_classic_240829.csv t5_11b,Helm Classic,0.131,[],helm_classic_240829.csv babbage_1_3b,Helm Classic,0.114,[],helm_classic_240829.csv 
cohere_small_v20220720_410m,Helm Classic,0.109,[],helm_classic_240829.csv ada_350m,Helm Classic,0.108,[],helm_classic_240829.csv text_ada_001,Helm Classic,0.107,[],helm_classic_240829.csv yalm_100b,Helm Classic,0.075,[],helm_classic_240829.csv llama_2_70b,Helm BoolQ,0.886,[],helm_classic_240829.csv llama_65b,Helm BoolQ,0.871,[],helm_classic_240829.csv text_davinci_002,Helm BoolQ,0.877,[],helm_classic_240829.csv mistral_v0_1_7b,Helm BoolQ,0.874,[],helm_classic_240829.csv cohere_command_beta_52_4b,Helm BoolQ,0.856,[],helm_classic_240829.csv text_davinci_003,Helm BoolQ,0.881,[],helm_classic_240829.csv jurassic_2_jumbo_178b,Helm BoolQ,0.829,[],helm_classic_240829.csv llama_2_13b,Helm BoolQ,0.811,[],helm_classic_240829.csv tnlg_v2_530b,Helm BoolQ,0.809,[],helm_classic_240829.csv gpt_3_5_turbo_0613,Helm BoolQ,0.87,[],helm_classic_240829.csv llama30b,Helm BoolQ,0.861,[],helm_classic_240829.csv anthropic_lm_v4_s3_52b,Helm BoolQ,0.815,[],helm_classic_240829.csv gpt_3_5_turbo_0301,Helm BoolQ,0.74,[],helm_classic_240829.csv jurassic_2_grande_17b,Helm BoolQ,0.826,[],helm_classic_240829.csv palmyra_x_43b,Helm BoolQ,0.896,[],helm_classic_240829.csv falcon_40b,Helm BoolQ,0.819,[],helm_classic_240829.csv falcon_instruct_40b,Helm BoolQ,0.829,[],helm_classic_240829.csv mpt_instruct_30b,Helm BoolQ,0.85,[],helm_classic_240829.csv mpt_30b,Helm BoolQ,0.704,[],helm_classic_240829.csv j1_grande_v2_beta_17b,Helm BoolQ,0.812,[],helm_classic_240829.csv vicuna_v1_3_13b,Helm BoolQ,0.808,[],helm_classic_240829.csv cohere_command_beta_6_1b,Helm BoolQ,0.798,[],helm_classic_240829.csv cohere_xlarge_v20221108_52_4b,Helm BoolQ,0.762,[],helm_classic_240829.csv luminous_supreme_70b,Helm BoolQ,0.775,[],helm_classic_240829.csv vicuna_v1_3_7b,Helm BoolQ,0.76,[],helm_classic_240829.csv opt_175b,Helm BoolQ,0.793,[],helm_classic_240829.csv llama_2_7b,Helm BoolQ,0.762,[],helm_classic_240829.csv llama_13b,Helm BoolQ,0.714,[],helm_classic_240829.csv instructpalmyra_30b,Helm BoolQ,0.751,[],helm_classic_240829.csv cohere_xlarge_v20220609_52_4b,Helm BoolQ,0.718,[],helm_classic_240829.csv jurassic_2_large_7_5b,Helm BoolQ,0.742,[],helm_classic_240829.csv davinci_175b,Helm BoolQ,0.722,[],helm_classic_240829.csv llama_7b,Helm BoolQ,0.756,[],helm_classic_240829.csv redpajama_incite_instruct_7b,Helm BoolQ,0.705,[],helm_classic_240829.csv j1_jumbo_v1_178b,Helm BoolQ,0.776,[],helm_classic_240829.csv glm_130b,Helm BoolQ,0.784,[],helm_classic_240829.csv luminous_extended_30b,Helm BoolQ,0.767,[],helm_classic_240829.csv opt_66b,Helm BoolQ,0.76,[],helm_classic_240829.csv bloom_176b,Helm BoolQ,0.704,[],helm_classic_240829.csv j1_grande_v1_17b,Helm BoolQ,0.722,[],helm_classic_240829.csv alpaca_7b,Helm BoolQ,0.778,[],helm_classic_240829.csv falcon_7b,Helm BoolQ,0.753,[],helm_classic_240829.csv redpajama_incite_base_7b,Helm BoolQ,0.713,[],helm_classic_240829.csv cohere_large_v20220720_13_1b,Helm BoolQ,0.725,[],helm_classic_240829.csv redpajama_incite_instruct_v1_3b,Helm BoolQ,0.677,[],helm_classic_240829.csv text_curie_001,Helm BoolQ,0.62,[],helm_classic_240829.csv gpt_neox_20b,Helm BoolQ,0.683,[],helm_classic_240829.csv luminous_base_13b,Helm BoolQ,0.719,[],helm_classic_240829.csv cohere_medium_v20221108_6_1b,Helm BoolQ,0.7,[],helm_classic_240829.csv redpajama_incite_base_v1_3b,Helm BoolQ,0.685,[],helm_classic_240829.csv tnlg_v2_6_7b,Helm BoolQ,0.698,[],helm_classic_240829.csv j1_large_v1_7_5b,Helm BoolQ,0.683,[],helm_classic_240829.csv gpt_j_6b,Helm BoolQ,0.649,[],helm_classic_240829.csv pythia_12b,Helm BoolQ,0.662,[],helm_classic_240829.csv 
curie_6_7b,Helm BoolQ,0.656,[],helm_classic_240829.csv falcon_instruct_7b,Helm BoolQ,0.72,[],helm_classic_240829.csv cohere_medium_v20220720_6_1b,Helm BoolQ,0.659,[],helm_classic_240829.csv text_babbage_001,Helm BoolQ,0.451,[],helm_classic_240829.csv t0pp_11b,Helm BoolQ,0.0,[],helm_classic_240829.csv pythia_6_9b,Helm BoolQ,0.631,[],helm_classic_240829.csv flan-ul2_20b,Helm BoolQ,0.746,[],helm_classic_240829.csv t5_11b,Helm BoolQ,0.761,[],helm_classic_240829.csv babbage_1_3b,Helm BoolQ,0.574,[],helm_classic_240829.csv cohere_small_v20220720_410m,Helm BoolQ,0.457,[],helm_classic_240829.csv ada_350m,Helm BoolQ,0.581,[],helm_classic_240829.csv text_ada_001,Helm BoolQ,0.464,[],helm_classic_240829.csv yalm_100b,Helm BoolQ,0.634,[],helm_classic_240829.csv llama_2_70b,Helm NarrativeQA,0.77,[],helm_classic_240829.csv llama_65b,Helm NarrativeQA,0.755,[],helm_classic_240829.csv text_davinci_002,Helm NarrativeQA,0.727,[],helm_classic_240829.csv mistral_v0_1_7b,Helm NarrativeQA,0.716,[],helm_classic_240829.csv cohere_command_beta_52_4b,Helm NarrativeQA,0.752,[],helm_classic_240829.csv text_davinci_003,Helm NarrativeQA,0.727,[],helm_classic_240829.csv jurassic_2_jumbo_178b,Helm NarrativeQA,0.733,[],helm_classic_240829.csv llama_2_13b,Helm NarrativeQA,0.744,[],helm_classic_240829.csv tnlg_v2_530b,Helm NarrativeQA,0.722,[],helm_classic_240829.csv gpt_3_5_turbo_0613,Helm NarrativeQA,0.625,[],helm_classic_240829.csv llama30b,Helm NarrativeQA,0.752,[],helm_classic_240829.csv anthropic_lm_v4_s3_52b,Helm NarrativeQA,0.728,[],helm_classic_240829.csv gpt_3_5_turbo_0301,Helm NarrativeQA,0.663,[],helm_classic_240829.csv jurassic_2_grande_17b,Helm NarrativeQA,0.737,[],helm_classic_240829.csv palmyra_x_43b,Helm NarrativeQA,0.742,[],helm_classic_240829.csv falcon_40b,Helm NarrativeQA,0.673,[],helm_classic_240829.csv falcon_instruct_40b,Helm NarrativeQA,0.625,[],helm_classic_240829.csv mpt_instruct_30b,Helm NarrativeQA,0.733,[],helm_classic_240829.csv mpt_30b,Helm NarrativeQA,0.732,[],helm_classic_240829.csv j1_grande_v2_beta_17b,Helm NarrativeQA,0.725,[],helm_classic_240829.csv vicuna_v1_3_13b,Helm NarrativeQA,0.691,[],helm_classic_240829.csv cohere_command_beta_6_1b,Helm NarrativeQA,0.709,[],helm_classic_240829.csv cohere_xlarge_v20221108_52_4b,Helm NarrativeQA,0.672,[],helm_classic_240829.csv luminous_supreme_70b,Helm NarrativeQA,0.711,[],helm_classic_240829.csv vicuna_v1_3_7b,Helm NarrativeQA,0.643,[],helm_classic_240829.csv opt_175b,Helm NarrativeQA,0.671,[],helm_classic_240829.csv llama_2_7b,Helm NarrativeQA,0.691,[],helm_classic_240829.csv llama_13b,Helm NarrativeQA,0.711,[],helm_classic_240829.csv instructpalmyra_30b,Helm NarrativeQA,0.496,[],helm_classic_240829.csv cohere_xlarge_v20220609_52_4b,Helm NarrativeQA,0.65,[],helm_classic_240829.csv davinci_175b,Helm NarrativeQA,0.687,[],helm_classic_240829.csv llama_7b,Helm NarrativeQA,0.669,[],helm_classic_240829.csv redpajama_incite_instruct_7b,Helm NarrativeQA,0.638,[],helm_classic_240829.csv j1_jumbo_v1_178b,Helm NarrativeQA,0.695,[],helm_classic_240829.csv glm_130b,Helm NarrativeQA,0.706,[],helm_classic_240829.csv luminous_extended_30b,Helm NarrativeQA,0.665,[],helm_classic_240829.csv opt_66b,Helm NarrativeQA,0.638,[],helm_classic_240829.csv bloom_176b,Helm NarrativeQA,0.662,[],helm_classic_240829.csv j1_grande_v1_17b,Helm NarrativeQA,0.672,[],helm_classic_240829.csv alpaca_7b,Helm NarrativeQA,0.396,[],helm_classic_240829.csv falcon_7b,Helm NarrativeQA,0.621,[],helm_classic_240829.csv redpajama_incite_base_7b,Helm NarrativeQA,0.617,[],helm_classic_240829.csv 
cohere_large_v20220720_13_1b,Helm NarrativeQA,0.625,[],helm_classic_240829.csv redpajama_incite_instruct_v1_3b,Helm NarrativeQA,0.638,[],helm_classic_240829.csv text_curie_001,Helm NarrativeQA,0.582,[],helm_classic_240829.csv gpt_neox_20b,Helm NarrativeQA,0.599,[],helm_classic_240829.csv luminous_base_13b,Helm NarrativeQA,0.605,[],helm_classic_240829.csv cohere_medium_v20221108_6_1b,Helm NarrativeQA,0.61,[],helm_classic_240829.csv redpajama_incite_base_v1_3b,Helm NarrativeQA,0.555,[],helm_classic_240829.csv tnlg_v2_6_7b,Helm NarrativeQA,0.631,[],helm_classic_240829.csv j1_large_v1_7_5b,Helm NarrativeQA,0.623,[],helm_classic_240829.csv gpt_j_6b,Helm NarrativeQA,0.545,[],helm_classic_240829.csv pythia_12b,Helm NarrativeQA,0.596,[],helm_classic_240829.csv curie_6_7b,Helm NarrativeQA,0.604,[],helm_classic_240829.csv falcon_instruct_7b,Helm NarrativeQA,0.476,[],helm_classic_240829.csv cohere_medium_v20220720_6_1b,Helm NarrativeQA,0.559,[],helm_classic_240829.csv text_babbage_001,Helm NarrativeQA,0.429,[],helm_classic_240829.csv t0pp_11b,Helm NarrativeQA,0.151,[],helm_classic_240829.csv pythia_6_9b,Helm NarrativeQA,0.528,[],helm_classic_240829.csv flan-ul2_20b,Helm NarrativeQA,0.083,[],helm_classic_240829.csv t5_11b,Helm NarrativeQA,0.086,[],helm_classic_240829.csv babbage_1_3b,Helm NarrativeQA,0.491,[],helm_classic_240829.csv cohere_small_v20220720_410m,Helm NarrativeQA,0.294,[],helm_classic_240829.csv ada_350m,Helm NarrativeQA,0.326,[],helm_classic_240829.csv text_ada_001,Helm NarrativeQA,0.238,[],helm_classic_240829.csv yalm_100b,Helm NarrativeQA,0.252,[],helm_classic_240829.csv llama_2_70b,Helm NaturalQuestionsClosed,0.458,[],helm_classic_240829.csv llama_65b,Helm NaturalQuestionsClosed,0.431,[],helm_classic_240829.csv text_davinci_002,Helm NaturalQuestionsClosed,0.383,[],helm_classic_240829.csv mistral_v0_1_7b,Helm NaturalQuestionsClosed,0.365,[],helm_classic_240829.csv cohere_command_beta_52_4b,Helm NaturalQuestionsClosed,0.372,[],helm_classic_240829.csv text_davinci_003,Helm NaturalQuestionsClosed,0.406,[],helm_classic_240829.csv jurassic_2_jumbo_178b,Helm NaturalQuestionsClosed,0.385,[],helm_classic_240829.csv llama_2_13b,Helm NaturalQuestionsClosed,0.376,[],helm_classic_240829.csv tnlg_v2_530b,Helm NaturalQuestionsClosed,0.384,[],helm_classic_240829.csv gpt_3_5_turbo_0613,Helm NaturalQuestionsClosed,0.348,[],helm_classic_240829.csv llama30b,Helm NaturalQuestionsClosed,0.408,[],helm_classic_240829.csv anthropic_lm_v4_s3_52b,Helm NaturalQuestionsClosed,0.288,[],helm_classic_240829.csv gpt_3_5_turbo_0301,Helm NaturalQuestionsClosed,0.39,[],helm_classic_240829.csv jurassic_2_grande_17b,Helm NaturalQuestionsClosed,0.356,[],helm_classic_240829.csv palmyra_x_43b,Helm NaturalQuestionsClosed,0.413,[],helm_classic_240829.csv falcon_40b,Helm NaturalQuestionsClosed,0.392,[],helm_classic_240829.csv falcon_instruct_40b,Helm NaturalQuestionsClosed,0.377,[],helm_classic_240829.csv mpt_instruct_30b,Helm NaturalQuestionsClosed,0.304,[],helm_classic_240829.csv mpt_30b,Helm NaturalQuestionsClosed,0.347,[],helm_classic_240829.csv j1_grande_v2_beta_17b,Helm NaturalQuestionsClosed,0.337,[],helm_classic_240829.csv vicuna_v1_3_13b,Helm NaturalQuestionsClosed,0.346,[],helm_classic_240829.csv cohere_command_beta_6_1b,Helm NaturalQuestionsClosed,0.229,[],helm_classic_240829.csv cohere_xlarge_v20221108_52_4b,Helm NaturalQuestionsClosed,0.361,[],helm_classic_240829.csv luminous_supreme_70b,Helm NaturalQuestionsClosed,0.293,[],helm_classic_240829.csv vicuna_v1_3_7b,Helm 
NaturalQuestionsClosed,0.287,[],helm_classic_240829.csv opt_175b,Helm NaturalQuestionsClosed,0.297,[],helm_classic_240829.csv llama_2_7b,Helm NaturalQuestionsClosed,0.337,[],helm_classic_240829.csv llama_13b,Helm NaturalQuestionsClosed,0.346,[],helm_classic_240829.csv instructpalmyra_30b,Helm NaturalQuestionsClosed,0.33,[],helm_classic_240829.csv cohere_xlarge_v20220609_52_4b,Helm NaturalQuestionsClosed,0.312,[],helm_classic_240829.csv jurassic_2_large_7_5b,Helm NaturalQuestionsClosed,0.274,[],helm_classic_240829.csv davinci_175b,Helm NaturalQuestionsClosed,0.329,[],helm_classic_240829.csv llama_7b,Helm NaturalQuestionsClosed,0.297,[],helm_classic_240829.csv redpajama_incite_instruct_7b,Helm NaturalQuestionsClosed,0.232,[],helm_classic_240829.csv j1_jumbo_v1_178b,Helm NaturalQuestionsClosed,0.293,[],helm_classic_240829.csv glm_130b,Helm NaturalQuestionsClosed,0.148,[],helm_classic_240829.csv luminous_extended_30b,Helm NaturalQuestionsClosed,0.254,[],helm_classic_240829.csv opt_66b,Helm NaturalQuestionsClosed,0.258,[],helm_classic_240829.csv bloom_176b,Helm NaturalQuestionsClosed,0.216,[],helm_classic_240829.csv j1_grande_v1_17b,Helm NaturalQuestionsClosed,0.233,[],helm_classic_240829.csv alpaca_7b,Helm NaturalQuestionsClosed,0.266,[],helm_classic_240829.csv falcon_7b,Helm NaturalQuestionsClosed,0.285,[],helm_classic_240829.csv redpajama_incite_base_7b,Helm NaturalQuestionsClosed,0.25,[],helm_classic_240829.csv cohere_large_v20220720_13_1b,Helm NaturalQuestionsClosed,0.232,[],helm_classic_240829.csv redpajama_incite_instruct_v1_3b,Helm NaturalQuestionsClosed,0.203,[],helm_classic_240829.csv text_curie_001,Helm NaturalQuestionsClosed,0.175,[],helm_classic_240829.csv gpt_neox_20b,Helm NaturalQuestionsClosed,0.193,[],helm_classic_240829.csv luminous_base_13b,Helm NaturalQuestionsClosed,0.202,[],helm_classic_240829.csv cohere_medium_v20221108_6_1b,Helm NaturalQuestionsClosed,0.199,[],helm_classic_240829.csv redpajama_incite_base_v1_3b,Helm NaturalQuestionsClosed,0.207,[],helm_classic_240829.csv tnlg_v2_6_7b,Helm NaturalQuestionsClosed,0.21,[],helm_classic_240829.csv j1_large_v1_7_5b,Helm NaturalQuestionsClosed,0.19,[],helm_classic_240829.csv gpt_j_6b,Helm NaturalQuestionsClosed,0.156,[],helm_classic_240829.csv pythia_12b,Helm NaturalQuestionsClosed,0.175,[],helm_classic_240829.csv curie_6_7b,Helm NaturalQuestionsClosed,0.199,[],helm_classic_240829.csv falcon_instruct_7b,Helm NaturalQuestionsClosed,0.194,[],helm_classic_240829.csv cohere_medium_v20220720_6_1b,Helm NaturalQuestionsClosed,0.177,[],helm_classic_240829.csv text_babbage_001,Helm NaturalQuestionsClosed,0.07,[],helm_classic_240829.csv t0pp_11b,Helm NaturalQuestionsClosed,0.039,[],helm_classic_240829.csv pythia_6_9b,Helm NaturalQuestionsClosed,0.142,[],helm_classic_240829.csv flan-ul2_20b,Helm NaturalQuestionsClosed,0.204,[],helm_classic_240829.csv t5_11b,Helm NaturalQuestionsClosed,0.194,[],helm_classic_240829.csv babbage_1_3b,Helm NaturalQuestionsClosed,0.119,[],helm_classic_240829.csv cohere_small_v20220720_410m,Helm NaturalQuestionsClosed,0.078,[],helm_classic_240829.csv ada_350m,Helm NaturalQuestionsClosed,0.082,[],helm_classic_240829.csv text_ada_001,Helm NaturalQuestionsClosed,0.025,[],helm_classic_240829.csv yalm_100b,Helm NaturalQuestionsClosed,0.068,[],helm_classic_240829.csv llama_2_70b,Helm NaturalQuestionsOpen,0.674,[],helm_classic_240829.csv llama_65b,Helm NaturalQuestionsOpen,0.672,[],helm_classic_240829.csv text_davinci_002,Helm NaturalQuestionsOpen,0.713,[],helm_classic_240829.csv mistral_v0_1_7b,Helm 
NaturalQuestionsOpen,0.687,[],helm_classic_240829.csv cohere_command_beta_52_4b,Helm NaturalQuestionsOpen,0.76,[],helm_classic_240829.csv text_davinci_003,Helm NaturalQuestionsOpen,0.77,[],helm_classic_240829.csv jurassic_2_jumbo_178b,Helm NaturalQuestionsOpen,0.669,[],helm_classic_240829.csv llama_2_13b,Helm NaturalQuestionsOpen,0.637,[],helm_classic_240829.csv tnlg_v2_530b,Helm NaturalQuestionsOpen,0.642,[],helm_classic_240829.csv gpt_3_5_turbo_0613,Helm NaturalQuestionsOpen,0.675,[],helm_classic_240829.csv llama30b,Helm NaturalQuestionsOpen,0.666,[],helm_classic_240829.csv anthropic_lm_v4_s3_52b,Helm NaturalQuestionsOpen,0.686,[],helm_classic_240829.csv gpt_3_5_turbo_0301,Helm NaturalQuestionsOpen,0.624,[],helm_classic_240829.csv jurassic_2_grande_17b,Helm NaturalQuestionsOpen,0.639,[],helm_classic_240829.csv falcon_40b,Helm NaturalQuestionsOpen,0.675,[],helm_classic_240829.csv falcon_instruct_40b,Helm NaturalQuestionsOpen,0.666,[],helm_classic_240829.csv mpt_instruct_30b,Helm NaturalQuestionsOpen,0.697,[],helm_classic_240829.csv mpt_30b,Helm NaturalQuestionsOpen,0.673,[],helm_classic_240829.csv j1_grande_v2_beta_17b,Helm NaturalQuestionsOpen,0.625,[],helm_classic_240829.csv vicuna_v1_3_13b,Helm NaturalQuestionsOpen,0.686,[],helm_classic_240829.csv cohere_command_beta_6_1b,Helm NaturalQuestionsOpen,0.717,[],helm_classic_240829.csv cohere_xlarge_v20221108_52_4b,Helm NaturalQuestionsOpen,0.628,[],helm_classic_240829.csv luminous_supreme_70b,Helm NaturalQuestionsOpen,0.649,[],helm_classic_240829.csv vicuna_v1_3_7b,Helm NaturalQuestionsOpen,0.634,[],helm_classic_240829.csv opt_175b,Helm NaturalQuestionsOpen,0.615,[],helm_classic_240829.csv llama_2_7b,Helm NaturalQuestionsOpen,0.611,[],helm_classic_240829.csv llama_13b,Helm NaturalQuestionsOpen,0.614,[],helm_classic_240829.csv instructpalmyra_30b,Helm NaturalQuestionsOpen,0.682,[],helm_classic_240829.csv cohere_xlarge_v20220609_52_4b,Helm NaturalQuestionsOpen,0.595,[],helm_classic_240829.csv jurassic_2_large_7_5b,Helm NaturalQuestionsOpen,0.589,[],helm_classic_240829.csv davinci_175b,Helm NaturalQuestionsOpen,0.625,[],helm_classic_240829.csv llama_7b,Helm NaturalQuestionsOpen,0.589,[],helm_classic_240829.csv redpajama_incite_instruct_7b,Helm NaturalQuestionsOpen,0.659,[],helm_classic_240829.csv j1_jumbo_v1_178b,Helm NaturalQuestionsOpen,0.595,[],helm_classic_240829.csv glm_130b,Helm NaturalQuestionsOpen,0.642,[],helm_classic_240829.csv luminous_extended_30b,Helm NaturalQuestionsOpen,0.609,[],helm_classic_240829.csv opt_66b,Helm NaturalQuestionsOpen,0.596,[],helm_classic_240829.csv bloom_176b,Helm NaturalQuestionsOpen,0.621,[],helm_classic_240829.csv j1_grande_v1_17b,Helm NaturalQuestionsOpen,0.578,[],helm_classic_240829.csv alpaca_7b,Helm NaturalQuestionsOpen,0.592,[],helm_classic_240829.csv falcon_7b,Helm NaturalQuestionsOpen,0.579,[],helm_classic_240829.csv redpajama_incite_base_7b,Helm NaturalQuestionsOpen,0.586,[],helm_classic_240829.csv cohere_large_v20220720_13_1b,Helm NaturalQuestionsOpen,0.573,[],helm_classic_240829.csv redpajama_incite_instruct_v1_3b,Helm NaturalQuestionsOpen,0.637,[],helm_classic_240829.csv text_curie_001,Helm NaturalQuestionsOpen,0.571,[],helm_classic_240829.csv gpt_neox_20b,Helm NaturalQuestionsOpen,0.596,[],helm_classic_240829.csv luminous_base_13b,Helm NaturalQuestionsOpen,0.568,[],helm_classic_240829.csv cohere_medium_v20221108_6_1b,Helm NaturalQuestionsOpen,0.517,[],helm_classic_240829.csv redpajama_incite_base_v1_3b,Helm NaturalQuestionsOpen,0.52,[],helm_classic_240829.csv tnlg_v2_6_7b,Helm 
NaturalQuestionsOpen,0.561,[],helm_classic_240829.csv j1_large_v1_7_5b,Helm NaturalQuestionsOpen,0.532,[],helm_classic_240829.csv gpt_j_6b,Helm NaturalQuestionsOpen,0.559,[],helm_classic_240829.csv pythia_12b,Helm NaturalQuestionsOpen,0.581,[],helm_classic_240829.csv curie_6_7b,Helm NaturalQuestionsOpen,0.552,[],helm_classic_240829.csv falcon_instruct_7b,Helm NaturalQuestionsOpen,0.449,[],helm_classic_240829.csv cohere_medium_v20220720_6_1b,Helm NaturalQuestionsOpen,0.504,[],helm_classic_240829.csv text_babbage_001,Helm NaturalQuestionsOpen,0.33,[],helm_classic_240829.csv t0pp_11b,Helm NaturalQuestionsOpen,0.19,[],helm_classic_240829.csv pythia_6_9b,Helm NaturalQuestionsOpen,0.539,[],helm_classic_240829.csv flan-ul2_20b,Helm NaturalQuestionsOpen,0.349,[],helm_classic_240829.csv t5_11b,Helm NaturalQuestionsOpen,0.477,[],helm_classic_240829.csv babbage_1_3b,Helm NaturalQuestionsOpen,0.451,[],helm_classic_240829.csv cohere_small_v20220720_410m,Helm NaturalQuestionsOpen,0.309,[],helm_classic_240829.csv ada_350m,Helm NaturalQuestionsOpen,0.365,[],helm_classic_240829.csv text_ada_001,Helm NaturalQuestionsOpen,0.149,[],helm_classic_240829.csv yalm_100b,Helm NaturalQuestionsOpen,0.227,[],helm_classic_240829.csv llama_2_70b,Helm QuAC,0.484,[],helm_classic_240829.csv llama_65b,Helm QuAC,0.401,[],helm_classic_240829.csv text_davinci_002,Helm QuAC,0.445,[],helm_classic_240829.csv mistral_v0_1_7b,Helm QuAC,0.423,[],helm_classic_240829.csv cohere_command_beta_52_4b,Helm QuAC,0.432,[],helm_classic_240829.csv text_davinci_003,Helm QuAC,0.525,[],helm_classic_240829.csv jurassic_2_jumbo_178b,Helm QuAC,0.435,[],helm_classic_240829.csv llama_2_13b,Helm QuAC,0.424,[],helm_classic_240829.csv tnlg_v2_530b,Helm QuAC,0.39,[],helm_classic_240829.csv gpt_3_5_turbo_0613,Helm QuAC,0.485,[],helm_classic_240829.csv llama30b,Helm QuAC,0.39,[],helm_classic_240829.csv anthropic_lm_v4_s3_52b,Helm QuAC,0.431,[],helm_classic_240829.csv gpt_3_5_turbo_0301,Helm QuAC,0.512,[],helm_classic_240829.csv jurassic_2_grande_17b,Helm QuAC,0.418,[],helm_classic_240829.csv palmyra_x_43b,Helm QuAC,0.473,[],helm_classic_240829.csv falcon_40b,Helm QuAC,0.307,[],helm_classic_240829.csv falcon_instruct_40b,Helm QuAC,0.371,[],helm_classic_240829.csv mpt_instruct_30b,Helm QuAC,0.327,[],helm_classic_240829.csv mpt_30b,Helm QuAC,0.393,[],helm_classic_240829.csv j1_grande_v2_beta_17b,Helm QuAC,0.392,[],helm_classic_240829.csv vicuna_v1_3_13b,Helm QuAC,0.403,[],helm_classic_240829.csv cohere_command_beta_6_1b,Helm QuAC,0.375,[],helm_classic_240829.csv cohere_xlarge_v20221108_52_4b,Helm QuAC,0.374,[],helm_classic_240829.csv luminous_supreme_70b,Helm QuAC,0.37,[],helm_classic_240829.csv vicuna_v1_3_7b,Helm QuAC,0.392,[],helm_classic_240829.csv opt_175b,Helm QuAC,0.36,[],helm_classic_240829.csv llama_2_7b,Helm QuAC,0.406,[],helm_classic_240829.csv llama_13b,Helm QuAC,0.347,[],helm_classic_240829.csv instructpalmyra_30b,Helm QuAC,0.433,[],helm_classic_240829.csv cohere_xlarge_v20220609_52_4b,Helm QuAC,0.361,[],helm_classic_240829.csv davinci_175b,Helm QuAC,0.36,[],helm_classic_240829.csv llama_7b,Helm QuAC,0.338,[],helm_classic_240829.csv redpajama_incite_instruct_7b,Helm QuAC,0.26,[],helm_classic_240829.csv j1_jumbo_v1_178b,Helm QuAC,0.358,[],helm_classic_240829.csv glm_130b,Helm QuAC,0.272,[],helm_classic_240829.csv luminous_extended_30b,Helm QuAC,0.349,[],helm_classic_240829.csv opt_66b,Helm QuAC,0.357,[],helm_classic_240829.csv bloom_176b,Helm QuAC,0.361,[],helm_classic_240829.csv j1_grande_v1_17b,Helm QuAC,0.362,[],helm_classic_240829.csv 
alpaca_7b,Helm QuAC,0.27,[],helm_classic_240829.csv falcon_7b,Helm QuAC,0.332,[],helm_classic_240829.csv redpajama_incite_base_7b,Helm QuAC,0.336,[],helm_classic_240829.csv cohere_large_v20220720_13_1b,Helm QuAC,0.338,[],helm_classic_240829.csv redpajama_incite_instruct_v1_3b,Helm QuAC,0.259,[],helm_classic_240829.csv text_curie_001,Helm QuAC,0.358,[],helm_classic_240829.csv gpt_neox_20b,Helm QuAC,0.326,[],helm_classic_240829.csv luminous_base_13b,Helm QuAC,0.334,[],helm_classic_240829.csv cohere_medium_v20221108_6_1b,Helm QuAC,0.314,[],helm_classic_240829.csv redpajama_incite_base_v1_3b,Helm QuAC,0.309,[],helm_classic_240829.csv tnlg_v2_6_7b,Helm QuAC,0.345,[],helm_classic_240829.csv j1_large_v1_7_5b,Helm QuAC,0.328,[],helm_classic_240829.csv gpt_j_6b,Helm QuAC,0.33,[],helm_classic_240829.csv pythia_12b,Helm QuAC,0.313,[],helm_classic_240829.csv curie_6_7b,Helm QuAC,0.321,[],helm_classic_240829.csv falcon_instruct_7b,Helm QuAC,0.311,[],helm_classic_240829.csv cohere_medium_v20220720_6_1b,Helm QuAC,0.279,[],helm_classic_240829.csv text_babbage_001,Helm QuAC,0.284,[],helm_classic_240829.csv t0pp_11b,Helm QuAC,0.121,[],helm_classic_240829.csv pythia_6_9b,Helm QuAC,0.296,[],helm_classic_240829.csv flan-ul2_20b,Helm QuAC,0.144,[],helm_classic_240829.csv t5_11b,Helm QuAC,0.116,[],helm_classic_240829.csv babbage_1_3b,Helm QuAC,0.273,[],helm_classic_240829.csv cohere_small_v20220720_410m,Helm QuAC,0.219,[],helm_classic_240829.csv ada_350m,Helm QuAC,0.242,[],helm_classic_240829.csv text_ada_001,Helm QuAC,0.176,[],helm_classic_240829.csv yalm_100b,Helm QuAC,0.162,[],helm_classic_240829.csv text_davinci_002,helm_hellaswag,0.815,[],helm_classic_240829.csv cohere_command_beta_52_4b,helm_hellaswag,0.811,[],helm_classic_240829.csv text_davinci_003,helm_hellaswag,0.822,[],helm_classic_240829.csv jurassic_2_jumbo_178b,helm_hellaswag,0.788,[],helm_classic_240829.csv tnlg_v2_530b,helm_hellaswag,0.799,[],helm_classic_240829.csv anthropic_lm_v4_s3_52b,helm_hellaswag,0.807,[],helm_classic_240829.csv jurassic_2_grande_17b,helm_hellaswag,0.781,[],helm_classic_240829.csv j1_grande_v2_beta_17b,helm_hellaswag,0.764,[],helm_classic_240829.csv cohere_command_beta_6_1b,helm_hellaswag,0.752,[],helm_classic_240829.csv cohere_xlarge_v20221108_52_4b,helm_hellaswag,0.81,[],helm_classic_240829.csv opt_175b,helm_hellaswag,0.791,[],helm_classic_240829.csv cohere_xlarge_v20220609_52_4b,helm_hellaswag,0.811,[],helm_classic_240829.csv jurassic_2_large_7_5b,helm_hellaswag,0.729,[],helm_classic_240829.csv davinci_175b,helm_hellaswag,0.775,[],helm_classic_240829.csv j1_jumbo_v1_178b,helm_hellaswag,0.765,[],helm_classic_240829.csv opt_66b,helm_hellaswag,0.745,[],helm_classic_240829.csv bloom_176b,helm_hellaswag,0.744,[],helm_classic_240829.csv j1_grande_v1_17b,helm_hellaswag,0.739,[],helm_classic_240829.csv cohere_large_v20220720_13_1b,helm_hellaswag,0.736,[],helm_classic_240829.csv text_curie_001,helm_hellaswag,0.676,[],helm_classic_240829.csv gpt_neox_20b,helm_hellaswag,0.718,[],helm_classic_240829.csv cohere_medium_v20221108_6_1b,helm_hellaswag,0.726,[],helm_classic_240829.csv tnlg_v2_6_7b,helm_hellaswag,0.704,[],helm_classic_240829.csv j1_large_v1_7_5b,helm_hellaswag,0.7,[],helm_classic_240829.csv gpt_j_6b,helm_hellaswag,0.663,[],helm_classic_240829.csv curie_6_7b,helm_hellaswag,0.682,[],helm_classic_240829.csv cohere_medium_v20220720_6_1b,helm_hellaswag,0.706,[],helm_classic_240829.csv text_babbage_001,helm_hellaswag,0.561,[],helm_classic_240829.csv babbage_1_3b,helm_hellaswag,0.555,[],helm_classic_240829.csv 
cohere_small_v20220720_410m,helm_hellaswag,0.483,[],helm_classic_240829.csv ada_350m,helm_hellaswag,0.435,[],helm_classic_240829.csv text_ada_001,helm_hellaswag,0.429,[],helm_classic_240829.csv llama_2_70b,Helm OpenBookQA,0.554,[],helm_classic_240829.csv llama_65b,Helm OpenBookQA,0.508,[],helm_classic_240829.csv text_davinci_002,Helm OpenBookQA,0.594,[],helm_classic_240829.csv mistral_v0_1_7b,Helm OpenBookQA,0.422,[],helm_classic_240829.csv cohere_command_beta_52_4b,Helm OpenBookQA,0.582,[],helm_classic_240829.csv text_davinci_003,Helm OpenBookQA,0.646,[],helm_classic_240829.csv jurassic_2_jumbo_178b,Helm OpenBookQA,0.558,[],helm_classic_240829.csv llama_2_13b,Helm OpenBookQA,0.33,[],helm_classic_240829.csv tnlg_v2_530b,Helm OpenBookQA,0.562,[],helm_classic_240829.csv gpt_3_5_turbo_0613,Helm OpenBookQA,0.339,[],helm_classic_240829.csv llama30b,Helm OpenBookQA,0.344,[],helm_classic_240829.csv anthropic_lm_v4_s3_52b,Helm OpenBookQA,0.558,[],helm_classic_240829.csv gpt_3_5_turbo_0301,Helm OpenBookQA,0.609,[],helm_classic_240829.csv jurassic_2_grande_17b,Helm OpenBookQA,0.542,[],helm_classic_240829.csv palmyra_x_43b,Helm OpenBookQA,0.616,[],helm_classic_240829.csv falcon_40b,Helm OpenBookQA,0.353,[],helm_classic_240829.csv falcon_instruct_40b,Helm OpenBookQA,0.384,[],helm_classic_240829.csv mpt_instruct_30b,Helm OpenBookQA,0.234,[],helm_classic_240829.csv mpt_30b,Helm OpenBookQA,0.231,[],helm_classic_240829.csv j1_grande_v2_beta_17b,Helm OpenBookQA,0.56,[],helm_classic_240829.csv vicuna_v1_3_13b,Helm OpenBookQA,0.385,[],helm_classic_240829.csv cohere_command_beta_6_1b,Helm OpenBookQA,0.55,[],helm_classic_240829.csv cohere_xlarge_v20221108_52_4b,Helm OpenBookQA,0.588,[],helm_classic_240829.csv luminous_supreme_70b,Helm OpenBookQA,0.222,[],helm_classic_240829.csv vicuna_v1_3_7b,Helm OpenBookQA,0.292,[],helm_classic_240829.csv opt_175b,Helm OpenBookQA,0.586,[],helm_classic_240829.csv llama_2_7b,Helm OpenBookQA,0.272,[],helm_classic_240829.csv llama_13b,Helm OpenBookQA,0.324,[],helm_classic_240829.csv instructpalmyra_30b,Helm OpenBookQA,0.185,[],helm_classic_240829.csv cohere_xlarge_v20220609_52_4b,Helm OpenBookQA,0.55,[],helm_classic_240829.csv jurassic_2_large_7_5b,Helm OpenBookQA,0.53,[],helm_classic_240829.csv davinci_175b,Helm OpenBookQA,0.586,[],helm_classic_240829.csv llama_7b,Helm OpenBookQA,0.28,[],helm_classic_240829.csv redpajama_incite_instruct_7b,Helm OpenBookQA,0.243,[],helm_classic_240829.csv j1_jumbo_v1_178b,Helm OpenBookQA,0.534,[],helm_classic_240829.csv glm_130b,Helm OpenBookQA,0.218,[],helm_classic_240829.csv luminous_extended_30b,Helm OpenBookQA,0.221,[],helm_classic_240829.csv opt_66b,Helm OpenBookQA,0.534,[],helm_classic_240829.csv bloom_176b,Helm OpenBookQA,0.534,[],helm_classic_240829.csv j1_grande_v1_17b,Helm OpenBookQA,0.52,[],helm_classic_240829.csv alpaca_7b,Helm OpenBookQA,0.243,[],helm_classic_240829.csv falcon_7b,Helm OpenBookQA,0.234,[],helm_classic_240829.csv redpajama_incite_base_7b,Helm OpenBookQA,0.205,[],helm_classic_240829.csv cohere_large_v20220720_13_1b,Helm OpenBookQA,0.542,[],helm_classic_240829.csv redpajama_incite_instruct_v1_3b,Helm OpenBookQA,0.208,[],helm_classic_240829.csv text_curie_001,Helm OpenBookQA,0.514,[],helm_classic_240829.csv gpt_neox_20b,Helm OpenBookQA,0.524,[],helm_classic_240829.csv luminous_base_13b,Helm OpenBookQA,0.182,[],helm_classic_240829.csv cohere_medium_v20221108_6_1b,Helm OpenBookQA,0.538,[],helm_classic_240829.csv redpajama_incite_base_v1_3b,Helm OpenBookQA,0.277,[],helm_classic_240829.csv tnlg_v2_6_7b,Helm 
OpenBookQA,0.478,[],helm_classic_240829.csv j1_large_v1_7_5b,Helm OpenBookQA,0.514,[],helm_classic_240829.csv gpt_j_6b,Helm OpenBookQA,0.514,[],helm_classic_240829.csv pythia_12b,Helm OpenBookQA,0.177,[],helm_classic_240829.csv curie_6_7b,Helm OpenBookQA,0.502,[],helm_classic_240829.csv falcon_instruct_7b,Helm OpenBookQA,0.213,[],helm_classic_240829.csv cohere_medium_v20220720_6_1b,Helm OpenBookQA,0.496,[],helm_classic_240829.csv text_babbage_001,Helm OpenBookQA,0.452,[],helm_classic_240829.csv t0pp_11b,Helm OpenBookQA,0.377,[],helm_classic_240829.csv pythia_6_9b,Helm OpenBookQA,0.213,[],helm_classic_240829.csv flan-ul2_20b,Helm OpenBookQA,0.193,[],helm_classic_240829.csv t5_11b,Helm OpenBookQA,0.133,[],helm_classic_240829.csv babbage_1_3b,Helm OpenBookQA,0.438,[],helm_classic_240829.csv cohere_small_v20220720_410m,Helm OpenBookQA,0.348,[],helm_classic_240829.csv ada_350m,Helm OpenBookQA,0.38,[],helm_classic_240829.csv text_ada_001,Helm OpenBookQA,0.346,[],helm_classic_240829.csv yalm_100b,Helm OpenBookQA,0.202,[],helm_classic_240829.csv text_davinci_002,helm_truthfulqa,0.61,[],helm_classic_240829.csv cohere_command_beta_52_4b,helm_truthfulqa,0.269,[],helm_classic_240829.csv text_davinci_003,helm_truthfulqa,0.593,[],helm_classic_240829.csv jurassic_2_jumbo_178b,helm_truthfulqa,0.437,[],helm_classic_240829.csv tnlg_v2_530b,helm_truthfulqa,0.251,[],helm_classic_240829.csv anthropic_lm_v4_s3_52b,helm_truthfulqa,0.368,[],helm_classic_240829.csv jurassic_2_grande_17b,helm_truthfulqa,0.348,[],helm_classic_240829.csv j1_grande_v2_beta_17b,helm_truthfulqa,0.306,[],helm_classic_240829.csv cohere_command_beta_6_1b,helm_truthfulqa,0.203,[],helm_classic_240829.csv cohere_xlarge_v20221108_52_4b,helm_truthfulqa,0.169,[],helm_classic_240829.csv opt_175b,helm_truthfulqa,0.25,[],helm_classic_240829.csv cohere_xlarge_v20220609_52_4b,helm_truthfulqa,0.198,[],helm_classic_240829.csv jurassic_2_large_7_5b,helm_truthfulqa,0.245,[],helm_classic_240829.csv davinci_175b,helm_truthfulqa,0.194,[],helm_classic_240829.csv j1_jumbo_v1_178b,helm_truthfulqa,0.175,[],helm_classic_240829.csv opt_66b,helm_truthfulqa,0.201,[],helm_classic_240829.csv bloom_176b,helm_truthfulqa,0.205,[],helm_classic_240829.csv j1_grande_v1_17b,helm_truthfulqa,0.193,[],helm_classic_240829.csv cohere_large_v20220720_13_1b,helm_truthfulqa,0.181,[],helm_classic_240829.csv text_curie_001,helm_truthfulqa,0.257,[],helm_classic_240829.csv gpt_neox_20b,helm_truthfulqa,0.216,[],helm_classic_240829.csv cohere_medium_v20221108_6_1b,helm_truthfulqa,0.215,[],helm_classic_240829.csv tnlg_v2_6_7b,helm_truthfulqa,0.167,[],helm_classic_240829.csv j1_large_v1_7_5b,helm_truthfulqa,0.197,[],helm_classic_240829.csv gpt_j_6b,helm_truthfulqa,0.199,[],helm_classic_240829.csv curie_6_7b,helm_truthfulqa,0.232,[],helm_classic_240829.csv cohere_medium_v20220720_6_1b,helm_truthfulqa,0.19,[],helm_classic_240829.csv text_babbage_001,helm_truthfulqa,0.233,[],helm_classic_240829.csv babbage_1_3b,helm_truthfulqa,0.188,[],helm_classic_240829.csv cohere_small_v20220720_410m,helm_truthfulqa,0.217,[],helm_classic_240829.csv ada_350m,helm_truthfulqa,0.215,[],helm_classic_240829.csv text_ada_001,helm_truthfulqa,0.232,[],helm_classic_240829.csv text_davinci_002,Helm MSMARCO Regular,0.421,[],helm_classic_240829.csv cohere_command_beta_52_4b,Helm MSMARCO Regular,0.472,[],helm_classic_240829.csv text_davinci_003,Helm MSMARCO Regular,0.368,[],helm_classic_240829.csv jurassic_2_jumbo_178b,Helm MSMARCO Regular,0.398,[],helm_classic_240829.csv tnlg_v2_530b,Helm MSMARCO 
Regular,0.377,[],helm_classic_240829.csv jurassic_2_grande_17b,Helm MSMARCO Regular,0.293,[],helm_classic_240829.csv j1_grande_v2_beta_17b,Helm MSMARCO Regular,0.285,[],helm_classic_240829.csv cohere_command_beta_6_1b,Helm MSMARCO Regular,0.434,[],helm_classic_240829.csv cohere_xlarge_v20221108_52_4b,Helm MSMARCO Regular,0.315,[],helm_classic_240829.csv opt_175b,Helm MSMARCO Regular,0.288,[],helm_classic_240829.csv cohere_xlarge_v20220609_52_4b,Helm MSMARCO Regular,0.273,[],helm_classic_240829.csv jurassic_2_large_7_5b,Helm MSMARCO Regular,0.247,[],helm_classic_240829.csv davinci_175b,Helm MSMARCO Regular,0.211,[],helm_classic_240829.csv j1_jumbo_v1_178b,Helm MSMARCO Regular,0.21,[],helm_classic_240829.csv opt_66b,Helm MSMARCO Regular,0.237,[],helm_classic_240829.csv bloom_176b,Helm MSMARCO Regular,0.236,[],helm_classic_240829.csv j1_grande_v1_17b,Helm MSMARCO Regular,0.161,[],helm_classic_240829.csv cohere_large_v20220720_13_1b,Helm MSMARCO Regular,0.19,[],helm_classic_240829.csv text_curie_001,Helm MSMARCO Regular,0.271,[],helm_classic_240829.csv gpt_neox_20b,Helm MSMARCO Regular,0.184,[],helm_classic_240829.csv cohere_medium_v20221108_6_1b,Helm MSMARCO Regular,0.175,[],helm_classic_240829.csv tnlg_v2_6_7b,Helm MSMARCO Regular,0.158,[],helm_classic_240829.csv j1_large_v1_7_5b,Helm MSMARCO Regular,0.147,[],helm_classic_240829.csv gpt_j_6b,Helm MSMARCO Regular,0.152,[],helm_classic_240829.csv curie_6_7b,Helm MSMARCO Regular,0.162,[],helm_classic_240829.csv cohere_medium_v20220720_6_1b,Helm MSMARCO Regular,0.152,[],helm_classic_240829.csv text_babbage_001,Helm MSMARCO Regular,0.208,[],helm_classic_240829.csv babbage_1_3b,Helm MSMARCO Regular,0.122,[],helm_classic_240829.csv ada_350m,Helm MSMARCO Regular,0.102,[],helm_classic_240829.csv text_ada_001,Helm MSMARCO Regular,0.134,[],helm_classic_240829.csv text_davinci_002,Helm MSMARCO Trec,0.664,[],helm_classic_240829.csv cohere_command_beta_52_4b,Helm MSMARCO Trec,0.762,[],helm_classic_240829.csv text_davinci_003,Helm MSMARCO Trec,0.644,[],helm_classic_240829.csv jurassic_2_jumbo_178b,Helm MSMARCO Trec,0.661,[],helm_classic_240829.csv tnlg_v2_530b,Helm MSMARCO Trec,0.643,[],helm_classic_240829.csv anthropic_lm_v4_s3_52b,Helm MSMARCO Trec,-0.154,[],helm_classic_240829.csv jurassic_2_grande_17b,Helm MSMARCO Trec,0.514,[],helm_classic_240829.csv palmyra_x_43b,Helm MSMARCO Trec,0.049,[],helm_classic_240829.csv j1_grande_v2_beta_17b,Helm MSMARCO Trec,0.46,[],helm_classic_240829.csv cohere_command_beta_6_1b,Helm MSMARCO Trec,0.709,[],helm_classic_240829.csv cohere_xlarge_v20221108_52_4b,Helm MSMARCO Trec,0.55,[],helm_classic_240829.csv luminous_supreme_70b,Helm MSMARCO Trec,0.15,[],helm_classic_240829.csv opt_175b,Helm MSMARCO Trec,0.448,[],helm_classic_240829.csv instructpalmyra_30b,Helm MSMARCO Trec,0.152,[],helm_classic_240829.csv cohere_xlarge_v20220609_52_4b,Helm MSMARCO Trec,0.459,[],helm_classic_240829.csv jurassic_2_large_7_5b,Helm MSMARCO Trec,0.464,[],helm_classic_240829.csv davinci_175b,Helm MSMARCO Trec,0.378,[],helm_classic_240829.csv j1_jumbo_v1_178b,Helm MSMARCO Trec,0.363,[],helm_classic_240829.csv glm_130b,Helm MSMARCO Trec,0.154,[],helm_classic_240829.csv luminous_extended_30b,Helm MSMARCO Trec,0.139,[],helm_classic_240829.csv opt_66b,Helm MSMARCO Trec,0.482,[],helm_classic_240829.csv bloom_176b,Helm MSMARCO Trec,0.386,[],helm_classic_240829.csv j1_grande_v1_17b,Helm MSMARCO Trec,0.341,[],helm_classic_240829.csv cohere_large_v20220720_13_1b,Helm MSMARCO Trec,0.33,[],helm_classic_240829.csv text_curie_001,Helm MSMARCO 
Trec,0.507,[],helm_classic_240829.csv gpt_neox_20b,Helm MSMARCO Trec,0.398,[],helm_classic_240829.csv luminous_base_13b,Helm MSMARCO Trec,0.11,[],helm_classic_240829.csv cohere_medium_v20221108_6_1b,Helm MSMARCO Trec,0.373,[],helm_classic_240829.csv tnlg_v2_6_7b,Helm MSMARCO Trec,0.332,[],helm_classic_240829.csv j1_large_v1_7_5b,Helm MSMARCO Trec,0.292,[],helm_classic_240829.csv gpt_j_6b,Helm MSMARCO Trec,0.345,[],helm_classic_240829.csv curie_6_7b,Helm MSMARCO Trec,0.3,[],helm_classic_240829.csv cohere_medium_v20220720_6_1b,Helm MSMARCO Trec,0.374,[],helm_classic_240829.csv text_babbage_001,Helm MSMARCO Trec,0.449,[],helm_classic_240829.csv t0pp_11b,Helm MSMARCO Trec,0.122,[],helm_classic_240829.csv flan-ul2_20b,Helm MSMARCO Trec,0.03,[],helm_classic_240829.csv t5_11b,Helm MSMARCO Trec,0.043,[],helm_classic_240829.csv babbage_1_3b,Helm MSMARCO Trec,0.317,[],helm_classic_240829.csv cohere_small_v20220720_410m,Helm MSMARCO Trec,0.304,[],helm_classic_240829.csv ada_350m,Helm MSMARCO Trec,0.29,[],helm_classic_240829.csv text_ada_001,Helm MSMARCO Trec,0.302,[],helm_classic_240829.csv yalm_100b,Helm MSMARCO Trec,0.017,[],helm_classic_240829.csv text_davinci_002,helm_cnn/dailymail,0.153,[],helm_classic_240829.csv cohere_command_beta_52_4b,helm_cnn/dailymail,0.161,[],helm_classic_240829.csv text_davinci_003,helm_cnn/dailymail,0.156,[],helm_classic_240829.csv jurassic_2_jumbo_178b,helm_cnn/dailymail,0.149,[],helm_classic_240829.csv tnlg_v2_530b,helm_cnn/dailymail,0.161,[],helm_classic_240829.csv anthropic_lm_v4_s3_52b,helm_cnn/dailymail,0.134,[],helm_classic_240829.csv jurassic_2_grande_17b,helm_cnn/dailymail,0.144,[],helm_classic_240829.csv palmyra_x_43b,helm_cnn/dailymail,0.149,[],helm_classic_240829.csv j1_grande_v2_beta_17b,helm_cnn/dailymail,0.146,[],helm_classic_240829.csv cohere_command_beta_6_1b,helm_cnn/dailymail,0.153,[],helm_classic_240829.csv cohere_xlarge_v20221108_52_4b,helm_cnn/dailymail,0.153,[],helm_classic_240829.csv luminous_supreme_70b,helm_cnn/dailymail,0.136,[],helm_classic_240829.csv opt_175b,helm_cnn/dailymail,0.146,[],helm_classic_240829.csv instructpalmyra_30b,helm_cnn/dailymail,0.104,[],helm_classic_240829.csv cohere_xlarge_v20220609_52_4b,helm_cnn/dailymail,0.144,[],helm_classic_240829.csv jurassic_2_large_7_5b,helm_cnn/dailymail,0.136,[],helm_classic_240829.csv davinci_175b,helm_cnn/dailymail,0.127,[],helm_classic_240829.csv j1_jumbo_v1_178b,helm_cnn/dailymail,0.144,[],helm_classic_240829.csv glm_130b,helm_cnn/dailymail,0.132,[],helm_classic_240829.csv luminous_extended_30b,helm_cnn/dailymail,0.124,[],helm_classic_240829.csv opt_66b,helm_cnn/dailymail,0.136,[],helm_classic_240829.csv bloom_176b,helm_cnn/dailymail,0.08,[],helm_classic_240829.csv j1_grande_v1_17b,helm_cnn/dailymail,0.143,[],helm_classic_240829.csv cohere_large_v20220720_13_1b,helm_cnn/dailymail,0.126,[],helm_classic_240829.csv text_curie_001,helm_cnn/dailymail,0.152,[],helm_classic_240829.csv gpt_neox_20b,helm_cnn/dailymail,0.123,[],helm_classic_240829.csv luminous_base_13b,helm_cnn/dailymail,0.105,[],helm_classic_240829.csv cohere_medium_v20221108_6_1b,helm_cnn/dailymail,0.121,[],helm_classic_240829.csv tnlg_v2_6_7b,helm_cnn/dailymail,0.146,[],helm_classic_240829.csv j1_large_v1_7_5b,helm_cnn/dailymail,0.134,[],helm_classic_240829.csv gpt_j_6b,helm_cnn/dailymail,0.131,[],helm_classic_240829.csv curie_6_7b,helm_cnn/dailymail,0.113,[],helm_classic_240829.csv cohere_medium_v20220720_6_1b,helm_cnn/dailymail,0.077,[],helm_classic_240829.csv text_babbage_001,helm_cnn/dailymail,0.151,[],helm_classic_240829.csv 
t0pp_11b,helm_cnn/dailymail,0.09,[],helm_classic_240829.csv flan-ul2_20b,helm_cnn/dailymail,0.058,[],helm_classic_240829.csv t5_11b,helm_cnn/dailymail,0.015,[],helm_classic_240829.csv babbage_1_3b,helm_cnn/dailymail,0.079,[],helm_classic_240829.csv cohere_small_v20220720_410m,helm_cnn/dailymail,0.063,[],helm_classic_240829.csv ada_350m,helm_cnn/dailymail,0.09,[],helm_classic_240829.csv text_ada_001,helm_cnn/dailymail,0.136,[],helm_classic_240829.csv yalm_100b,helm_cnn/dailymail,0.021,[],helm_classic_240829.csv text_davinci_002,Helm XSUM,0.144,[],helm_classic_240829.csv cohere_command_beta_52_4b,Helm XSUM,0.152,[],helm_classic_240829.csv text_davinci_003,Helm XSUM,0.124,[],helm_classic_240829.csv jurassic_2_jumbo_178b,Helm XSUM,0.182,[],helm_classic_240829.csv tnlg_v2_530b,Helm XSUM,0.169,[],helm_classic_240829.csv anthropic_lm_v4_s3_52b,Helm XSUM,0.934,[],helm_classic_240829.csv jurassic_2_grande_17b,Helm XSUM,0.167,[],helm_classic_240829.csv palmyra_x_43b,Helm XSUM,0.935,[],helm_classic_240829.csv j1_grande_v2_beta_17b,Helm XSUM,0.152,[],helm_classic_240829.csv cohere_command_beta_6_1b,Helm XSUM,0.122,[],helm_classic_240829.csv cohere_xlarge_v20221108_52_4b,Helm XSUM,0.153,[],helm_classic_240829.csv luminous_supreme_70b,Helm XSUM,0.959,[],helm_classic_240829.csv opt_175b,Helm XSUM,0.155,[],helm_classic_240829.csv instructpalmyra_30b,Helm XSUM,0.94,[],helm_classic_240829.csv cohere_xlarge_v20220609_52_4b,Helm XSUM,0.129,[],helm_classic_240829.csv jurassic_2_large_7_5b,Helm XSUM,0.142,[],helm_classic_240829.csv davinci_175b,Helm XSUM,0.126,[],helm_classic_240829.csv j1_jumbo_v1_178b,Helm XSUM,0.129,[],helm_classic_240829.csv glm_130b,Helm XSUM,0.955,[],helm_classic_240829.csv luminous_extended_30b,Helm XSUM,0.947,[],helm_classic_240829.csv opt_66b,Helm XSUM,0.126,[],helm_classic_240829.csv bloom_176b,Helm XSUM,0.03,[],helm_classic_240829.csv j1_grande_v1_17b,Helm XSUM,0.122,[],helm_classic_240829.csv cohere_large_v20220720_13_1b,Helm XSUM,0.108,[],helm_classic_240829.csv text_curie_001,Helm XSUM,0.076,[],helm_classic_240829.csv gpt_neox_20b,Helm XSUM,0.102,[],helm_classic_240829.csv luminous_base_13b,Helm XSUM,0.939,[],helm_classic_240829.csv cohere_medium_v20221108_6_1b,Helm XSUM,0.099,[],helm_classic_240829.csv tnlg_v2_6_7b,Helm XSUM,0.11,[],helm_classic_240829.csv j1_large_v1_7_5b,Helm XSUM,0.102,[],helm_classic_240829.csv gpt_j_6b,Helm XSUM,0.096,[],helm_classic_240829.csv curie_6_7b,Helm XSUM,0.091,[],helm_classic_240829.csv cohere_medium_v20220720_6_1b,Helm XSUM,0.087,[],helm_classic_240829.csv text_babbage_001,Helm XSUM,0.046,[],helm_classic_240829.csv t0pp_11b,Helm XSUM,0.207,[],helm_classic_240829.csv flan-ul2_20b,Helm XSUM,0.337,[],helm_classic_240829.csv t5_11b,Helm XSUM,0.379,[],helm_classic_240829.csv babbage_1_3b,Helm XSUM,0.045,[],helm_classic_240829.csv cohere_small_v20220720_410m,Helm XSUM,0.033,[],helm_classic_240829.csv ada_350m,Helm XSUM,0.022,[],helm_classic_240829.csv text_ada_001,Helm XSUM,0.034,[],helm_classic_240829.csv yalm_100b,Helm XSUM,0.836,[],helm_classic_240829.csv llama_2_70b,Helm IMDB,0.961,[],helm_classic_240829.csv llama_65b,Helm IMDB,0.962,[],helm_classic_240829.csv text_davinci_002,Helm IMDB,0.948,[],helm_classic_240829.csv mistral_v0_1_7b,Helm IMDB,0.962,[],helm_classic_240829.csv cohere_command_beta_52_4b,Helm IMDB,0.96,[],helm_classic_240829.csv text_davinci_003,Helm IMDB,0.848,[],helm_classic_240829.csv jurassic_2_jumbo_178b,Helm IMDB,0.938,[],helm_classic_240829.csv llama_2_13b,Helm IMDB,0.962,[],helm_classic_240829.csv tnlg_v2_530b,Helm 
IMDB,0.941,[],helm_classic_240829.csv gpt_3_5_turbo_0613,Helm IMDB,0.943,[],helm_classic_240829.csv llama30b,Helm IMDB,0.927,[],helm_classic_240829.csv anthropic_lm_v4_s3_52b,Helm IMDB,0.61,[],helm_classic_240829.csv gpt_3_5_turbo_0301,Helm IMDB,0.899,[],helm_classic_240829.csv jurassic_2_grande_17b,Helm IMDB,0.938,[],helm_classic_240829.csv palmyra_x_43b,Helm IMDB,0.008,[],helm_classic_240829.csv falcon_40b,Helm IMDB,0.959,[],helm_classic_240829.csv falcon_instruct_40b,Helm IMDB,0.959,[],helm_classic_240829.csv mpt_instruct_30b,Helm IMDB,0.956,[],helm_classic_240829.csv mpt_30b,Helm IMDB,0.959,[],helm_classic_240829.csv j1_grande_v2_beta_17b,Helm IMDB,0.957,[],helm_classic_240829.csv vicuna_v1_3_13b,Helm IMDB,0.762,[],helm_classic_240829.csv cohere_command_beta_6_1b,Helm IMDB,0.961,[],helm_classic_240829.csv cohere_xlarge_v20221108_52_4b,Helm IMDB,0.956,[],helm_classic_240829.csv luminous_supreme_70b,Helm IMDB,0.562,[],helm_classic_240829.csv vicuna_v1_3_7b,Helm IMDB,0.916,[],helm_classic_240829.csv opt_175b,Helm IMDB,0.947,[],helm_classic_240829.csv llama_2_7b,Helm IMDB,0.907,[],helm_classic_240829.csv llama_13b,Helm IMDB,0.928,[],helm_classic_240829.csv instructpalmyra_30b,Helm IMDB,0.555,[],helm_classic_240829.csv cohere_xlarge_v20220609_52_4b,Helm IMDB,0.956,[],helm_classic_240829.csv jurassic_2_large_7_5b,Helm IMDB,0.956,[],helm_classic_240829.csv davinci_175b,Helm IMDB,0.933,[],helm_classic_240829.csv llama_7b,Helm IMDB,0.947,[],helm_classic_240829.csv redpajama_incite_instruct_7b,Helm IMDB,0.927,[],helm_classic_240829.csv j1_jumbo_v1_178b,Helm IMDB,0.943,[],helm_classic_240829.csv glm_130b,Helm IMDB,0.5,[],helm_classic_240829.csv luminous_extended_30b,Helm IMDB,0.524,[],helm_classic_240829.csv opt_66b,Helm IMDB,0.917,[],helm_classic_240829.csv bloom_176b,Helm IMDB,0.945,[],helm_classic_240829.csv j1_grande_v1_17b,Helm IMDB,0.953,[],helm_classic_240829.csv alpaca_7b,Helm IMDB,0.738,[],helm_classic_240829.csv falcon_7b,Helm IMDB,0.836,[],helm_classic_240829.csv redpajama_incite_base_7b,Helm IMDB,0.752,[],helm_classic_240829.csv cohere_large_v20220720_13_1b,Helm IMDB,0.933,[],helm_classic_240829.csv redpajama_incite_instruct_v1_3b,Helm IMDB,0.894,[],helm_classic_240829.csv text_curie_001,Helm IMDB,0.923,[],helm_classic_240829.csv gpt_neox_20b,Helm IMDB,0.948,[],helm_classic_240829.csv luminous_base_13b,Helm IMDB,0.544,[],helm_classic_240829.csv cohere_medium_v20221108_6_1b,Helm IMDB,0.935,[],helm_classic_240829.csv redpajama_incite_base_v1_3b,Helm IMDB,0.907,[],helm_classic_240829.csv tnlg_v2_6_7b,Helm IMDB,0.927,[],helm_classic_240829.csv j1_large_v1_7_5b,Helm IMDB,0.956,[],helm_classic_240829.csv gpt_j_6b,Helm IMDB,0.939,[],helm_classic_240829.csv pythia_12b,Helm IMDB,0.931,[],helm_classic_240829.csv curie_6_7b,Helm IMDB,0.889,[],helm_classic_240829.csv falcon_instruct_7b,Helm IMDB,0.852,[],helm_classic_240829.csv cohere_medium_v20220720_6_1b,Helm IMDB,0.935,[],helm_classic_240829.csv text_babbage_001,Helm IMDB,0.913,[],helm_classic_240829.csv t0pp_11b,Helm IMDB,0.234,[],helm_classic_240829.csv pythia_6_9b,Helm IMDB,0.928,[],helm_classic_240829.csv flan-ul2_20b,Helm IMDB,0.521,[],helm_classic_240829.csv t5_11b,Helm IMDB,0.509,[],helm_classic_240829.csv babbage_1_3b,Helm IMDB,0.597,[],helm_classic_240829.csv cohere_small_v20220720_410m,Helm IMDB,0.578,[],helm_classic_240829.csv ada_350m,Helm IMDB,0.849,[],helm_classic_240829.csv text_ada_001,Helm IMDB,0.822,[],helm_classic_240829.csv yalm_100b,Helm IMDB,0.49,[],helm_classic_240829.csv llama_2_70b,Helm 
CivilComments,0.652,[],helm_classic_240829.csv llama_65b,Helm CivilComments,0.655,[],helm_classic_240829.csv text_davinci_002,Helm CivilComments,0.668,[],helm_classic_240829.csv mistral_v0_1_7b,Helm CivilComments,0.624,[],helm_classic_240829.csv cohere_command_beta_52_4b,Helm CivilComments,0.601,[],helm_classic_240829.csv text_davinci_003,Helm CivilComments,0.684,[],helm_classic_240829.csv jurassic_2_jumbo_178b,Helm CivilComments,0.57,[],helm_classic_240829.csv llama_2_13b,Helm CivilComments,0.588,[],helm_classic_240829.csv tnlg_v2_530b,Helm CivilComments,0.601,[],helm_classic_240829.csv gpt_3_5_turbo_0613,Helm CivilComments,0.696,[],helm_classic_240829.csv llama30b,Helm CivilComments,0.549,[],helm_classic_240829.csv anthropic_lm_v4_s3_52b,Helm CivilComments,0.699,[],helm_classic_240829.csv gpt_3_5_turbo_0301,Helm CivilComments,0.674,[],helm_classic_240829.csv jurassic_2_grande_17b,Helm CivilComments,0.547,[],helm_classic_240829.csv palmyra_x_43b,Helm CivilComments,0.701,[],helm_classic_240829.csv falcon_40b,Helm CivilComments,0.552,[],helm_classic_240829.csv falcon_instruct_40b,Helm CivilComments,0.603,[],helm_classic_240829.csv mpt_instruct_30b,Helm CivilComments,0.573,[],helm_classic_240829.csv mpt_30b,Helm CivilComments,0.599,[],helm_classic_240829.csv j1_grande_v2_beta_17b,Helm CivilComments,0.546,[],helm_classic_240829.csv vicuna_v1_3_13b,Helm CivilComments,0.645,[],helm_classic_240829.csv cohere_command_beta_6_1b,Helm CivilComments,0.54,[],helm_classic_240829.csv cohere_xlarge_v20221108_52_4b,Helm CivilComments,0.524,[],helm_classic_240829.csv luminous_supreme_70b,Helm CivilComments,0.653,[],helm_classic_240829.csv vicuna_v1_3_7b,Helm CivilComments,0.62,[],helm_classic_240829.csv opt_175b,Helm CivilComments,0.505,[],helm_classic_240829.csv llama_2_7b,Helm CivilComments,0.562,[],helm_classic_240829.csv llama_13b,Helm CivilComments,0.6,[],helm_classic_240829.csv instructpalmyra_30b,Helm CivilComments,0.652,[],helm_classic_240829.csv cohere_xlarge_v20220609_52_4b,Helm CivilComments,0.532,[],helm_classic_240829.csv jurassic_2_large_7_5b,Helm CivilComments,0.57,[],helm_classic_240829.csv davinci_175b,Helm CivilComments,0.532,[],helm_classic_240829.csv llama_7b,Helm CivilComments,0.563,[],helm_classic_240829.csv redpajama_incite_instruct_7b,Helm CivilComments,0.664,[],helm_classic_240829.csv j1_jumbo_v1_178b,Helm CivilComments,0.553,[],helm_classic_240829.csv glm_130b,Helm CivilComments,0.598,[],helm_classic_240829.csv luminous_extended_30b,Helm CivilComments,0.523,[],helm_classic_240829.csv opt_66b,Helm CivilComments,0.506,[],helm_classic_240829.csv bloom_176b,Helm CivilComments,0.62,[],helm_classic_240829.csv j1_grande_v1_17b,Helm CivilComments,0.529,[],helm_classic_240829.csv alpaca_7b,Helm CivilComments,0.566,[],helm_classic_240829.csv falcon_7b,Helm CivilComments,0.514,[],helm_classic_240829.csv redpajama_incite_base_7b,Helm CivilComments,0.547,[],helm_classic_240829.csv cohere_large_v20220720_13_1b,Helm CivilComments,0.507,[],helm_classic_240829.csv redpajama_incite_instruct_v1_3b,Helm CivilComments,0.549,[],helm_classic_240829.csv text_curie_001,Helm CivilComments,0.537,[],helm_classic_240829.csv gpt_neox_20b,Helm CivilComments,0.516,[],helm_classic_240829.csv luminous_base_13b,Helm CivilComments,0.473,[],helm_classic_240829.csv cohere_medium_v20221108_6_1b,Helm CivilComments,0.5,[],helm_classic_240829.csv redpajama_incite_base_v1_3b,Helm CivilComments,0.549,[],helm_classic_240829.csv tnlg_v2_6_7b,Helm CivilComments,0.532,[],helm_classic_240829.csv j1_large_v1_7_5b,Helm 
CivilComments,0.532,[],helm_classic_240829.csv gpt_j_6b,Helm CivilComments,0.52,[],helm_classic_240829.csv pythia_12b,Helm CivilComments,0.531,[],helm_classic_240829.csv curie_6_7b,Helm CivilComments,0.539,[],helm_classic_240829.csv falcon_instruct_7b,Helm CivilComments,0.511,[],helm_classic_240829.csv cohere_medium_v20220720_6_1b,Helm CivilComments,0.504,[],helm_classic_240829.csv text_babbage_001,Helm CivilComments,0.499,[],helm_classic_240829.csv t0pp_11b,Helm CivilComments,0.118,[],helm_classic_240829.csv pythia_6_9b,Helm CivilComments,0.511,[],helm_classic_240829.csv flan-ul2_20b,Helm CivilComments,0.404,[],helm_classic_240829.csv t5_11b,Helm CivilComments,0.37,[],helm_classic_240829.csv babbage_1_3b,Helm CivilComments,0.519,[],helm_classic_240829.csv cohere_small_v20220720_410m,Helm CivilComments,0.501,[],helm_classic_240829.csv ada_350m,Helm CivilComments,0.517,[],helm_classic_240829.csv text_ada_001,Helm CivilComments,0.503,[],helm_classic_240829.csv yalm_100b,Helm CivilComments,0.395,[],helm_classic_240829.csv llama_2_70b,Helm RAFT,0.727,[],helm_classic_240829.csv llama_65b,Helm RAFT,0.702,[],helm_classic_240829.csv text_davinci_002,Helm RAFT,0.733,[],helm_classic_240829.csv mistral_v0_1_7b,Helm RAFT,0.707,[],helm_classic_240829.csv cohere_command_beta_52_4b,Helm RAFT,0.667,[],helm_classic_240829.csv text_davinci_003,Helm RAFT,0.759,[],helm_classic_240829.csv jurassic_2_jumbo_178b,Helm RAFT,0.746,[],helm_classic_240829.csv llama_2_13b,Helm RAFT,0.707,[],helm_classic_240829.csv tnlg_v2_530b,Helm RAFT,0.679,[],helm_classic_240829.csv gpt_3_5_turbo_0613,Helm RAFT,0.748,[],helm_classic_240829.csv llama30b,Helm RAFT,0.752,[],helm_classic_240829.csv gpt_3_5_turbo_0301,Helm RAFT,0.768,[],helm_classic_240829.csv jurassic_2_grande_17b,Helm RAFT,0.712,[],helm_classic_240829.csv falcon_40b,Helm RAFT,0.661,[],helm_classic_240829.csv falcon_instruct_40b,Helm RAFT,0.586,[],helm_classic_240829.csv mpt_instruct_30b,Helm RAFT,0.68,[],helm_classic_240829.csv mpt_30b,Helm RAFT,0.723,[],helm_classic_240829.csv j1_grande_v2_beta_17b,Helm RAFT,0.679,[],helm_classic_240829.csv vicuna_v1_3_13b,Helm RAFT,0.657,[],helm_classic_240829.csv cohere_command_beta_6_1b,Helm RAFT,0.634,[],helm_classic_240829.csv cohere_xlarge_v20221108_52_4b,Helm RAFT,0.624,[],helm_classic_240829.csv vicuna_v1_3_7b,Helm RAFT,0.693,[],helm_classic_240829.csv opt_175b,Helm RAFT,0.606,[],helm_classic_240829.csv llama_2_7b,Helm RAFT,0.643,[],helm_classic_240829.csv llama_13b,Helm RAFT,0.643,[],helm_classic_240829.csv cohere_xlarge_v20220609_52_4b,Helm RAFT,0.633,[],helm_classic_240829.csv jurassic_2_large_7_5b,Helm RAFT,0.622,[],helm_classic_240829.csv davinci_175b,Helm RAFT,0.642,[],helm_classic_240829.csv llama_7b,Helm RAFT,0.573,[],helm_classic_240829.csv redpajama_incite_instruct_7b,Helm RAFT,0.695,[],helm_classic_240829.csv j1_jumbo_v1_178b,Helm RAFT,0.681,[],helm_classic_240829.csv opt_66b,Helm RAFT,0.557,[],helm_classic_240829.csv bloom_176b,Helm RAFT,0.592,[],helm_classic_240829.csv j1_grande_v1_17b,Helm RAFT,0.658,[],helm_classic_240829.csv alpaca_7b,Helm RAFT,0.486,[],helm_classic_240829.csv falcon_7b,Helm RAFT,0.602,[],helm_classic_240829.csv redpajama_incite_base_7b,Helm RAFT,0.648,[],helm_classic_240829.csv cohere_large_v20220720_13_1b,Helm RAFT,0.596,[],helm_classic_240829.csv redpajama_incite_instruct_v1_3b,Helm RAFT,0.661,[],helm_classic_240829.csv text_curie_001,Helm RAFT,0.489,[],helm_classic_240829.csv gpt_neox_20b,Helm RAFT,0.505,[],helm_classic_240829.csv cohere_medium_v20221108_6_1b,Helm 
RAFT,0.591,[],helm_classic_240829.csv redpajama_incite_base_v1_3b,Helm RAFT,0.502,[],helm_classic_240829.csv tnlg_v2_6_7b,Helm RAFT,0.525,[],helm_classic_240829.csv j1_large_v1_7_5b,Helm RAFT,0.545,[],helm_classic_240829.csv gpt_j_6b,Helm RAFT,0.619,[],helm_classic_240829.csv pythia_12b,Helm RAFT,0.514,[],helm_classic_240829.csv curie_6_7b,Helm RAFT,0.49,[],helm_classic_240829.csv falcon_instruct_7b,Helm RAFT,0.523,[],helm_classic_240829.csv cohere_medium_v20220720_6_1b,Helm RAFT,0.52,[],helm_classic_240829.csv text_babbage_001,Helm RAFT,0.509,[],helm_classic_240829.csv pythia_6_9b,Helm RAFT,0.502,[],helm_classic_240829.csv babbage_1_3b,Helm RAFT,0.455,[],helm_classic_240829.csv cohere_small_v20220720_410m,Helm RAFT,0.492,[],helm_classic_240829.csv ada_350m,Helm RAFT,0.423,[],helm_classic_240829.csv text_ada_001,Helm RAFT,0.406,[],helm_classic_240829.csv arx_0_3,MMLU Pro,0.7824,[],mmlu_pro_240829.csv claude_3_5_sonnet,MMLU Pro,0.7612,[],mmlu_pro_240829.csv grok_2,MMLU Pro,0.7546,[],mmlu_pro_240829.csv gpt_4o_2024_05_13,MMLU Pro,0.7255,[],mmlu_pro_240829.csv grok_2_mini,MMLU Pro,0.7185,[],mmlu_pro_240829.csv gemini_1_5_pro,MMLU Pro,0.6903,[],mmlu_pro_240829.csv claude_3_opus,MMLU Pro,0.6845,[],mmlu_pro_240829.csv qwen2_72b_chat,MMLU Pro,0.6438,[],mmlu_pro_240829.csv magnum_72b_v1,MMLU Pro,0.6393,[],mmlu_pro_240829.csv gpt_4_turbo,MMLU Pro,0.6371,[],mmlu_pro_240829.csv deepseek_coder_v2_instruct,MMLU Pro,0.6363,[],mmlu_pro_240829.csv higgs_llama3_70b,MMLU Pro,0.6316,[],mmlu_pro_240829.csv gpt_4o_mini,MMLU Pro,0.6309,[],mmlu_pro_240829.csv llama3_1_70b_instruct,MMLU Pro,0.6284,[],mmlu_pro_240829.csv gemini_1_5_flash,MMLU Pro,0.5912,[],mmlu_pro_240829.csv yi_large,MMLU Pro,0.5809,[],mmlu_pro_240829.csv claude_3_sonnet,MMLU Pro,0.568,[],mmlu_pro_240829.csv llama3_70b_instruct,MMLU Pro,0.562,[],mmlu_pro_240829.csv phi3_medium_4k,MMLU Pro,0.557,[],mmlu_pro_240829.csv qwen2_72b_32k,MMLU Pro,0.5559,[],mmlu_pro_240829.csv deepseek_v2_chat,MMLU Pro,0.5481,[],mmlu_pro_240829.csv llama3_70b,MMLU Pro,0.5278,[],mmlu_pro_240829.csv qwen1_5_72b_chat,MMLU Pro,0.5264,[],mmlu_pro_240829.csv llama3_1_70b,MMLU Pro,0.5247,[],mmlu_pro_240829.csv yi_1_5_34b_chat,MMLU Pro,0.5229,[],mmlu_pro_240829.csv gemma_2_9b_it,MMLU Pro,0.5208,[],mmlu_pro_240829.csv phi3_medium_128k,MMLU Pro,0.5191,[],mmlu_pro_240829.csv mammoth2_8x7b_plus,MMLU Pro,0.504,[],mmlu_pro_240829.csv qwen1_5_110b,MMLU Pro,0.4993,[],mmlu_pro_240829.csv glm_4_9b_chat,MMLU Pro,0.4801,[],mmlu_pro_240829.csv glm_4_9b,MMLU Pro,0.4792,[],mmlu_pro_240829.csv phi_3_5_mini_instruct,MMLU Pro,0.4787,[],mmlu_pro_240829.csv qwen2_7b_instruct,MMLU Pro,0.4724,[],mmlu_pro_240829.csv yi_1_5_9b_chat,MMLU Pro,0.4595,[],mmlu_pro_240829.csv phi3_mini_4k,MMLU Pro,0.4566,[],mmlu_pro_240829.csv gemma_2_9b,MMLU Pro,0.451,[],mmlu_pro_240829.csv mistral_nemo_instruct_2407,MMLU Pro,0.4481,[],mmlu_pro_240829.csv llama3_1_8b_instruct,MMLU Pro,0.4425,[],mmlu_pro_240829.csv phi3_mini_128k,MMLU Pro,0.4386,[],mmlu_pro_240829.csv mammoth2_8b_plus,MMLU Pro,0.4335,[],mmlu_pro_240829.csv mixtral_8x7b_instruct_v0_1,MMLU Pro,0.4327,[],mmlu_pro_240829.csv yi_34b,MMLU Pro,0.4303,[],mmlu_pro_240829.csv mathstral_7b_v0_1,MMLU Pro,0.42,[],mmlu_pro_240829.csv deepseek_coder_v2_lite_instruct,MMLU Pro,0.4157,[],mmlu_pro_240829.csv mixtral_8x7b_v0_1,MMLU Pro,0.4103,[],mmlu_pro_240829.csv llama3_8b_instruct,MMLU Pro,0.4098,[],mmlu_pro_240829.csv mammoth2_7b_plus,MMLU Pro,0.4085,[],mmlu_pro_240829.csv qwen2_7b,MMLU Pro,0.4073,[],mmlu_pro_240829.csv mistral_nemo_base_2407,MMLU 
Pro,0.3977,[],mmlu_pro_240829.csv wizardlm_2_8x22b,MMLU Pro,0.3924,[],mmlu_pro_240829.csv yi_1_5_6b_chat,MMLU Pro,0.3823,[],mmlu_pro_240829.csv qwen1_5_14b_chat,MMLU Pro,0.3802,[],mmlu_pro_240829.csv c4ai_command_r_v0_1,MMLU Pro,0.379,[],mmlu_pro_240829.csv staring_7b,MMLU Pro,0.379,[],mmlu_pro_240829.csv llama_2_70b,MMLU Pro,0.3753,[],mmlu_pro_240829.csv openchat_3_5_8b,MMLU Pro,0.3724,[],mmlu_pro_240829.csv internmath_20b_plus,MMLU Pro,0.371,[],mmlu_pro_240829.csv llama3_smaug_8b,MMLU Pro,0.3693,[],mmlu_pro_240829.csv llama3_1_8b,MMLU Pro,0.366,[],mmlu_pro_240829.csv llama3_8b,MMLU Pro,0.3536,[],mmlu_pro_240829.csv deepseekmath_7b_instruct,MMLU Pro,0.353,[],mmlu_pro_240829.csv deepseek_coder_v2_lite_base,MMLU Pro,0.3437,[],mmlu_pro_240829.csv gemma_7b,MMLU Pro,0.3373,[],mmlu_pro_240829.csv internmath_7b_plus,MMLU Pro,0.335,[],mmlu_pro_240829.csv zephyr_7b_beta,MMLU Pro,0.3297,[],mmlu_pro_240829.csv mistral_7b_v0_1,MMLU Pro,0.3088,[],mmlu_pro_240829.csv mistral_7b_instruct_v0_2,MMLU Pro,0.3084,[],mmlu_pro_240829.csv mistral_7b_v0_2,MMLU Pro,0.3043,[],mmlu_pro_240829.csv qwen1_5_7b_chat,MMLU Pro,0.2906,[],mmlu_pro_240829.csv yi_6b_chat,MMLU Pro,0.2884,[],mmlu_pro_240829.csv neo_7b_instruct,MMLU Pro,0.2874,[],mmlu_pro_240829.csv yi_6b,MMLU Pro,0.2651,[],mmlu_pro_240829.csv neo_7b,MMLU Pro,0.2585,[],mmlu_pro_240829.csv mistral_7b_instruct_v0_1,MMLU Pro,0.2575,[],mmlu_pro_240829.csv llama_2_13b,MMLU Pro,0.2534,[],mmlu_pro_240829.csv llemma_7b,MMLU Pro,0.2345,[],mmlu_pro_240829.csv qwen2_1_5b_instruct,MMLU Pro,0.2262,[],mmlu_pro_240829.csv qwen2_1_5b,MMLU Pro,0.2256,[],mmlu_pro_240829.csv llama_2_7b,MMLU Pro,0.2032,[],mmlu_pro_240829.csv qwen2_0_5b_instruct,MMLU Pro,0.1593,[],mmlu_pro_240829.csv gemma_2b,MMLU Pro,0.1585,[],mmlu_pro_240829.csv qwen2_0_5b,MMLU Pro,0.1497,[],mmlu_pro_240829.csv llama3_70b,MixEval,82.2,[],mixeval_240829.csv qwen1_5_72b,MixEval,79.5,[],mixeval_240829.csv yi_34b,MixEval,78.3,[],mixeval_240829.csv qwen1_5_32b,MixEval,77.6,[],mixeval_240829.csv mixtral_8x7b,MixEval,74.0,[],mixeval_240829.csv llama_2_70b,MixEval,73.2,[],mixeval_240829.csv qwen1_5_moe_a2_7b,MixEval,70.2,[],mixeval_240829.csv qwen1_5_7b,MixEval,68.2,[],mixeval_240829.csv llama3_8b,MixEval,65.1,[],mixeval_240829.csv mistral_7b,MixEval,64.8,[],mixeval_240829.csv gemma_7b,MixEval,64.7,[],mixeval_240829.csv yi_6b,MixEval,63.1,[],mixeval_240829.csv qwen1_5_4b,MixEval,58.2,[],mixeval_240829.csv jetmoe_8b,MixEval,57.1,[],mixeval_240829.csv deepseek_7b,MixEval,52.2,[],mixeval_240829.csv phi_2,MixEval,51.9,[],mixeval_240829.csv deepseekmoe_16b,MixEval,51.4,[],mixeval_240829.csv llama_2_7b,MixEval,43.1,[],mixeval_240829.csv gemma_2b,MixEval,38.9,[],mixeval_240829.csv olmo_7b,MixEval,31.8,[],mixeval_240829.csv mpt_7b,MixEval,30.8,[],mixeval_240829.csv claude_3_5_sonnet_0620,MixEval,89.9,[],mixeval_240829.csv gpt_4o_2024_05_13,MixEval,87.9,[],mixeval_240829.csv claude_3_opus,MixEval,88.1,[],mixeval_240829.csv gpt_4_turbo_2024_04_09,MixEval,88.8,[],mixeval_240829.csv gemini_1_5_pro_api_0409,MixEval,84.2,[],mixeval_240829.csv gemini_1_5_pro_api_0514,MixEval,84.8,[],mixeval_240829.csv mistral_large_2,MixEval,86.1,[],mixeval_240829.csv yi_large_preview,MixEval,84.4,[],mixeval_240829.csv llama3_70b_instruct,MixEval,84.0,[],mixeval_240829.csv qwen_max_0428,MixEval,86.1,[],mixeval_240829.csv claude_3_sonnet,MixEval,81.7,[],mixeval_240829.csv reka_core_20240415,MixEval,83.3,[],mixeval_240829.csv mammoth2_8x7b_plus,MixEval,81.5,[],mixeval_240829.csv deepseek_v2,MixEval,83.7,[],mixeval_240829.csv 
gpt_4o_mini,MixEval,84.2,[],mixeval_240829.csv command_r_plus,MixEval,81.5,[],mixeval_240829.csv yi_1_5_34b_chat,MixEval,81.7,[],mixeval_240829.csv mistral_large,MixEval,84.2,[],mixeval_240829.csv qwen1_5_72b_chat,MixEval,84.1,[],mixeval_240829.csv mistral_medium,MixEval,81.9,[],mixeval_240829.csv gemini_1_0_pro,MixEval,78.9,[],mixeval_240829.csv reka_flash_20240226,MixEval,79.8,[],mixeval_240829.csv mistral_small,MixEval,81.2,[],mixeval_240829.csv llama3_8b_instruct,MixEval,75.0,[],mixeval_240829.csv command_r,MixEval,77.0,[],mixeval_240829.csv qwen1_5_32b_chat,MixEval,81.0,[],mixeval_240829.csv gpt_3_5_turbo_0125,MixEval,79.7,[],mixeval_240829.csv claude_3_haiku,MixEval,79.7,[],mixeval_240829.csv yi_34b_chat,MixEval,80.1,[],mixeval_240829.csv mixtral_8x7b_instruct_v0_1,MixEval,76.4,[],mixeval_240829.csv starling_lm_7b_beta,MixEval,74.8,[],mixeval_240829.csv yi_1_5_9b_chat,MixEval,74.2,[],mixeval_240829.csv gemma_1_1_7b_it,MixEval,69.6,[],mixeval_240829.csv vicuna_33b_v1_3,MixEval,66.3,[],mixeval_240829.csv llama_2_70b_chat,MixEval,74.6,[],mixeval_240829.csv map_neo_instruct_v0_1,MixEval,70.0,[],mixeval_240829.csv mistral_7b_instruct_v0_2,MixEval,70.0,[],mixeval_240829.csv qwen1_5_7b_chat,MixEval,71.4,[],mixeval_240829.csv reka_edge_20240208,MixEval,68.5,[],mixeval_240829.csv zephyr_7b_beta,MixEval,69.1,[],mixeval_240829.csv llama_2_7b_chat,MixEval,61.7,[],mixeval_240829.csv yi_6b_chat,MixEval,65.6,[],mixeval_240829.csv qwen1_5_moe_a2_7b_chat,MixEval,69.1,[],mixeval_240829.csv gemma_1_1_2b_it,MixEval,51.9,[],mixeval_240829.csv vicuna_7b_v1_5,MixEval,60.3,[],mixeval_240829.csv olmo_7b_instruct,MixEval,55.0,[],mixeval_240829.csv qwen1_5_4b_chat,MixEval,57.2,[],mixeval_240829.csv jetmoe_8b_chat,MixEval,51.6,[],mixeval_240829.csv mpt_7b_chat,MixEval,43.8,[],mixeval_240829.csv llama3_70b,MixEval Hard,54.0,[],mixeval_240829.csv qwen1_5_72b,MixEval Hard,41.9,[],mixeval_240829.csv yi_34b,MixEval Hard,47.2,[],mixeval_240829.csv qwen1_5_32b,MixEval Hard,41.0,[],mixeval_240829.csv mixtral_8x7b,MixEval Hard,40.7,[],mixeval_240829.csv llama_2_70b,MixEval Hard,41.6,[],mixeval_240829.csv qwen1_5_moe_a2_7b,MixEval Hard,33.5,[],mixeval_240829.csv qwen1_5_7b,MixEval Hard,33.7,[],mixeval_240829.csv llama3_8b,MixEval Hard,31.7,[],mixeval_240829.csv mistral_7b,MixEval Hard,27.1,[],mixeval_240829.csv gemma_7b,MixEval Hard,32.7,[],mixeval_240829.csv yi_6b,MixEval Hard,30.4,[],mixeval_240829.csv qwen1_5_4b,MixEval Hard,23.5,[],mixeval_240829.csv jetmoe_8b,MixEval Hard,27.0,[],mixeval_240829.csv deepseek_7b,MixEval Hard,21.7,[],mixeval_240829.csv phi_2,MixEval Hard,21.9,[],mixeval_240829.csv deepseekmoe_16b,MixEval Hard,24.2,[],mixeval_240829.csv llama_2_7b,MixEval Hard,22.1,[],mixeval_240829.csv gemma_2b,MixEval Hard,22.6,[],mixeval_240829.csv olmo_7b,MixEval Hard,21.2,[],mixeval_240829.csv mpt_7b,MixEval Hard,17.4,[],mixeval_240829.csv claude_3_5_sonnet_0620,MixEval Hard,68.1,[],mixeval_240829.csv llama3_1_405b_instruct,MixEval Hard,66.2,[],mixeval_240829.csv gpt_4o_2024_05_13,MixEval Hard,64.7,[],mixeval_240829.csv claude_3_opus,MixEval Hard,63.5,[],mixeval_240829.csv gpt_4_turbo_2024_04_09,MixEval Hard,62.6,[],mixeval_240829.csv gemini_1_5_pro_api_0409,MixEval Hard,58.7,[],mixeval_240829.csv gemini_1_5_pro_api_0514,MixEval Hard,58.3,[],mixeval_240829.csv mistral_large_2,MixEval Hard,57.4,[],mixeval_240829.csv yi_large_preview,MixEval Hard,56.8,[],mixeval_240829.csv llama3_70b_instruct,MixEval Hard,55.9,[],mixeval_240829.csv qwen_max_0428,MixEval Hard,55.8,[],mixeval_240829.csv claude_3_sonnet,MixEval 
Hard,54.0,[],mixeval_240829.csv reka_core_20240415,MixEval Hard,52.9,[],mixeval_240829.csv mammoth2_8x7b_plus,MixEval Hard,51.8,[],mixeval_240829.csv deepseek_v2,MixEval Hard,51.7,[],mixeval_240829.csv gpt_4o_mini,MixEval Hard,51.6,[],mixeval_240829.csv command_r_plus,MixEval Hard,51.4,[],mixeval_240829.csv yi_1_5_34b_chat,MixEval Hard,51.2,[],mixeval_240829.csv mistral_large,MixEval Hard,50.3,[],mixeval_240829.csv qwen1_5_72b_chat,MixEval Hard,48.3,[],mixeval_240829.csv mistral_medium,MixEval Hard,47.8,[],mixeval_240829.csv gemini_1_0_pro,MixEval Hard,46.4,[],mixeval_240829.csv reka_flash_20240226,MixEval Hard,46.2,[],mixeval_240829.csv mistral_small,MixEval Hard,46.2,[],mixeval_240829.csv llama3_8b_instruct,MixEval Hard,45.6,[],mixeval_240829.csv command_r,MixEval Hard,45.2,[],mixeval_240829.csv qwen1_5_32b_chat,MixEval Hard,43.3,[],mixeval_240829.csv gpt_3_5_turbo_0125,MixEval Hard,43.0,[],mixeval_240829.csv claude_3_haiku,MixEval Hard,42.8,[],mixeval_240829.csv yi_34b_chat,MixEval Hard,42.6,[],mixeval_240829.csv mixtral_8x7b_instruct_v0_1,MixEval Hard,42.5,[],mixeval_240829.csv starling_lm_7b_beta,MixEval Hard,41.8,[],mixeval_240829.csv yi_1_5_9b_chat,MixEval Hard,40.9,[],mixeval_240829.csv gemma_1_1_7b_it,MixEval Hard,39.1,[],mixeval_240829.csv vicuna_33b_v1_3,MixEval Hard,38.7,[],mixeval_240829.csv llama_2_70b_chat,MixEval Hard,38.0,[],mixeval_240829.csv map_neo_instruct_v0_1,MixEval Hard,37.8,[],mixeval_240829.csv mistral_7b_instruct_v0_2,MixEval Hard,36.2,[],mixeval_240829.csv qwen1_5_7b_chat,MixEval Hard,35.5,[],mixeval_240829.csv reka_edge_20240208,MixEval Hard,32.2,[],mixeval_240829.csv zephyr_7b_beta,MixEval Hard,31.6,[],mixeval_240829.csv llama_2_7b_chat,MixEval Hard,30.8,[],mixeval_240829.csv yi_6b_chat,MixEval Hard,30.1,[],mixeval_240829.csv qwen1_5_moe_a2_7b_chat,MixEval Hard,29.1,[],mixeval_240829.csv gemma_1_1_2b_it,MixEval Hard,28.4,[],mixeval_240829.csv vicuna_7b_v1_5,MixEval Hard,27.8,[],mixeval_240829.csv olmo_7b_instruct,MixEval Hard,26.7,[],mixeval_240829.csv qwen1_5_4b_chat,MixEval Hard,24.6,[],mixeval_240829.csv jetmoe_8b_chat,MixEval Hard,24.3,[],mixeval_240829.csv mpt_7b_chat,MixEval Hard,23.8,[],mixeval_240829.csv llama3_70b,MixEval TriviaQA,83.1,[],mixeval_240829.csv qwen1_5_72b,MixEval TriviaQA,78.4,[],mixeval_240829.csv yi_34b,MixEval TriviaQA,72.1,[],mixeval_240829.csv qwen1_5_32b,MixEval TriviaQA,71.9,[],mixeval_240829.csv mixtral_8x7b,MixEval TriviaQA,77.3,[],mixeval_240829.csv llama_2_70b,MixEval TriviaQA,78.7,[],mixeval_240829.csv qwen1_5_moe_a2_7b,MixEval TriviaQA,71.3,[],mixeval_240829.csv qwen1_5_7b,MixEval TriviaQA,61.4,[],mixeval_240829.csv llama3_8b,MixEval TriviaQA,65.2,[],mixeval_240829.csv mistral_7b,MixEval TriviaQA,67.2,[],mixeval_240829.csv gemma_7b,MixEval TriviaQA,66.0,[],mixeval_240829.csv yi_6b,MixEval TriviaQA,54.7,[],mixeval_240829.csv qwen1_5_4b,MixEval TriviaQA,47.8,[],mixeval_240829.csv jetmoe_8b,MixEval TriviaQA,53.4,[],mixeval_240829.csv deepseek_7b,MixEval TriviaQA,58.7,[],mixeval_240829.csv phi_2,MixEval TriviaQA,37.0,[],mixeval_240829.csv deepseekmoe_16b,MixEval TriviaQA,64.2,[],mixeval_240829.csv llama_2_7b,MixEval TriviaQA,55.5,[],mixeval_240829.csv gemma_2b,MixEval TriviaQA,41.5,[],mixeval_240829.csv olmo_7b,MixEval TriviaQA,38.4,[],mixeval_240829.csv mpt_7b,MixEval TriviaQA,33.5,[],mixeval_240829.csv claude_3_5_sonnet_0620,MixEval TriviaQA,92.6,[],mixeval_240829.csv gpt_4o_2024_05_13,MixEval TriviaQA,88.0,[],mixeval_240829.csv claude_3_opus,MixEval TriviaQA,90.4,[],mixeval_240829.csv gpt_4_turbo_2024_04_09,MixEval 
TriviaQA,91.2,[],mixeval_240829.csv gemini_1_5_pro_api_0409,MixEval TriviaQA,85.3,[],mixeval_240829.csv gemini_1_5_pro_api_0514,MixEval TriviaQA,83.7,[],mixeval_240829.csv mistral_large_2,MixEval TriviaQA,88.2,[],mixeval_240829.csv yi_large_preview,MixEval TriviaQA,81.7,[],mixeval_240829.csv llama3_70b_instruct,MixEval TriviaQA,83.1,[],mixeval_240829.csv qwen_max_0428,MixEval TriviaQA,86.7,[],mixeval_240829.csv claude_3_sonnet,MixEval TriviaQA,84.2,[],mixeval_240829.csv reka_core_20240415,MixEval TriviaQA,82.8,[],mixeval_240829.csv mammoth2_8x7b_plus,MixEval TriviaQA,83.0,[],mixeval_240829.csv deepseek_v2,MixEval TriviaQA,84.4,[],mixeval_240829.csv gpt_4o_mini,MixEval TriviaQA,83.1,[],mixeval_240829.csv command_r_plus,MixEval TriviaQA,83.3,[],mixeval_240829.csv yi_1_5_34b_chat,MixEval TriviaQA,78.4,[],mixeval_240829.csv mistral_large,MixEval TriviaQA,88.3,[],mixeval_240829.csv qwen1_5_72b_chat,MixEval TriviaQA,83.9,[],mixeval_240829.csv mistral_medium,MixEval TriviaQA,86.8,[],mixeval_240829.csv gemini_1_0_pro,MixEval TriviaQA,81.0,[],mixeval_240829.csv reka_flash_20240226,MixEval TriviaQA,76.4,[],mixeval_240829.csv mistral_small,MixEval TriviaQA,85.1,[],mixeval_240829.csv llama3_8b_instruct,MixEval TriviaQA,71.7,[],mixeval_240829.csv command_r,MixEval TriviaQA,80.9,[],mixeval_240829.csv qwen1_5_32b_chat,MixEval TriviaQA,75.7,[],mixeval_240829.csv gpt_3_5_turbo_0125,MixEval TriviaQA,85.2,[],mixeval_240829.csv claude_3_haiku,MixEval TriviaQA,79.9,[],mixeval_240829.csv yi_34b_chat,MixEval TriviaQA,82.7,[],mixeval_240829.csv mixtral_8x7b_instruct_v0_1,MixEval TriviaQA,82.5,[],mixeval_240829.csv starling_lm_7b_beta,MixEval TriviaQA,75.1,[],mixeval_240829.csv yi_1_5_9b_chat,MixEval TriviaQA,61.3,[],mixeval_240829.csv gemma_1_1_7b_it,MixEval TriviaQA,64.3,[],mixeval_240829.csv vicuna_33b_v1_3,MixEval TriviaQA,79.2,[],mixeval_240829.csv llama_2_70b_chat,MixEval TriviaQA,80.0,[],mixeval_240829.csv map_neo_instruct_v0_1,MixEval TriviaQA,62.1,[],mixeval_240829.csv mistral_7b_instruct_v0_2,MixEval TriviaQA,73.7,[],mixeval_240829.csv qwen1_5_7b_chat,MixEval TriviaQA,64.1,[],mixeval_240829.csv reka_edge_20240208,MixEval TriviaQA,60.0,[],mixeval_240829.csv zephyr_7b_beta,MixEval TriviaQA,74.7,[],mixeval_240829.csv llama_2_7b_chat,MixEval TriviaQA,68.8,[],mixeval_240829.csv yi_6b_chat,MixEval TriviaQA,66.1,[],mixeval_240829.csv qwen1_5_moe_a2_7b_chat,MixEval TriviaQA,65.9,[],mixeval_240829.csv gemma_1_1_2b_it,MixEval TriviaQA,53.7,[],mixeval_240829.csv vicuna_7b_v1_5,MixEval TriviaQA,66.4,[],mixeval_240829.csv olmo_7b_instruct,MixEval TriviaQA,51.7,[],mixeval_240829.csv qwen1_5_4b_chat,MixEval TriviaQA,46.0,[],mixeval_240829.csv jetmoe_8b_chat,MixEval TriviaQA,46.8,[],mixeval_240829.csv mpt_7b_chat,MixEval TriviaQA,50.2,[],mixeval_240829.csv llama3_70b,MixEval MMLU,79.8,[],mixeval_240829.csv qwen1_5_72b,MixEval MMLU,78.8,[],mixeval_240829.csv yi_34b,MixEval MMLU,79.3,[],mixeval_240829.csv qwen1_5_32b,MixEval MMLU,77.2,[],mixeval_240829.csv mixtral_8x7b,MixEval MMLU,71.6,[],mixeval_240829.csv llama_2_70b,MixEval MMLU,70.8,[],mixeval_240829.csv qwen1_5_moe_a2_7b,MixEval MMLU,69.4,[],mixeval_240829.csv qwen1_5_7b,MixEval MMLU,67.0,[],mixeval_240829.csv llama3_8b,MixEval MMLU,69.5,[],mixeval_240829.csv mistral_7b,MixEval MMLU,68.5,[],mixeval_240829.csv gemma_7b,MixEval MMLU,67.4,[],mixeval_240829.csv yi_6b,MixEval MMLU,71.2,[],mixeval_240829.csv qwen1_5_4b,MixEval MMLU,59.6,[],mixeval_240829.csv jetmoe_8b,MixEval MMLU,55.3,[],mixeval_240829.csv deepseek_7b,MixEval MMLU,53.3,[],mixeval_240829.csv phi_2,MixEval 
MMLU,62.5,[],mixeval_240829.csv deepseekmoe_16b,MixEval MMLU,49.9,[],mixeval_240829.csv llama_2_7b,MixEval MMLU,40.8,[],mixeval_240829.csv gemma_2b,MixEval MMLU,37.4,[],mixeval_240829.csv olmo_7b,MixEval MMLU,29.7,[],mixeval_240829.csv mpt_7b,MixEval MMLU,30.9,[],mixeval_240829.csv claude_3_5_sonnet_0620,MixEval MMLU,84.2,[],mixeval_240829.csv gpt_4o_2024_05_13,MixEval MMLU,85.4,[],mixeval_240829.csv claude_3_opus,MixEval MMLU,83.2,[],mixeval_240829.csv gpt_4_turbo_2024_04_09,MixEval MMLU,82.8,[],mixeval_240829.csv gemini_1_5_pro_api_0409,MixEval MMLU,79.2,[],mixeval_240829.csv gemini_1_5_pro_api_0514,MixEval MMLU,84.0,[],mixeval_240829.csv mistral_large_2,MixEval MMLU,81.9,[],mixeval_240829.csv yi_large_preview,MixEval MMLU,80.9,[],mixeval_240829.csv llama3_70b_instruct,MixEval MMLU,80.5,[],mixeval_240829.csv qwen_max_0428,MixEval MMLU,80.6,[],mixeval_240829.csv claude_3_sonnet,MixEval MMLU,74.7,[],mixeval_240829.csv reka_core_20240415,MixEval MMLU,79.3,[],mixeval_240829.csv mammoth2_8x7b_plus,MixEval MMLU,74.5,[],mixeval_240829.csv deepseek_v2,MixEval MMLU,77.3,[],mixeval_240829.csv gpt_4o_mini,MixEval MMLU,82.3,[],mixeval_240829.csv command_r_plus,MixEval MMLU,78.9,[],mixeval_240829.csv yi_1_5_34b_chat,MixEval MMLU,76.4,[],mixeval_240829.csv mistral_large,MixEval MMLU,80.2,[],mixeval_240829.csv qwen1_5_72b_chat,MixEval MMLU,80.1,[],mixeval_240829.csv mistral_medium,MixEval MMLU,76.3,[],mixeval_240829.csv gemini_1_0_pro,MixEval MMLU,74.9,[],mixeval_240829.csv reka_flash_20240226,MixEval MMLU,75.4,[],mixeval_240829.csv mistral_small,MixEval MMLU,75.2,[],mixeval_240829.csv llama3_8b_instruct,MixEval MMLU,71.9,[],mixeval_240829.csv command_r,MixEval MMLU,75.0,[],mixeval_240829.csv qwen1_5_32b_chat,MixEval MMLU,78.0,[],mixeval_240829.csv gpt_3_5_turbo_0125,MixEval MMLU,74.5,[],mixeval_240829.csv claude_3_haiku,MixEval MMLU,76.1,[],mixeval_240829.csv yi_34b_chat,MixEval MMLU,73.6,[],mixeval_240829.csv mixtral_8x7b_instruct_v0_1,MixEval MMLU,72.0,[],mixeval_240829.csv starling_lm_7b_beta,MixEval MMLU,69.0,[],mixeval_240829.csv yi_1_5_9b_chat,MixEval MMLU,72.6,[],mixeval_240829.csv gemma_1_1_7b_it,MixEval MMLU,66.9,[],mixeval_240829.csv vicuna_33b_v1_3,MixEval MMLU,59.2,[],mixeval_240829.csv llama_2_70b_chat,MixEval MMLU,69.8,[],mixeval_240829.csv map_neo_instruct_v0_1,MixEval MMLU,66.7,[],mixeval_240829.csv mistral_7b_instruct_v0_2,MixEval MMLU,67.3,[],mixeval_240829.csv qwen1_5_7b_chat,MixEval MMLU,68.7,[],mixeval_240829.csv reka_edge_20240208,MixEval MMLU,63.6,[],mixeval_240829.csv zephyr_7b_beta,MixEval MMLU,64.9,[],mixeval_240829.csv llama_2_7b_chat,MixEval MMLU,59.4,[],mixeval_240829.csv yi_6b_chat,MixEval MMLU,65.4,[],mixeval_240829.csv qwen1_5_moe_a2_7b_chat,MixEval MMLU,69.5,[],mixeval_240829.csv gemma_1_1_2b_it,MixEval MMLU,51.5,[],mixeval_240829.csv vicuna_7b_v1_5,MixEval MMLU,58.7,[],mixeval_240829.csv olmo_7b_instruct,MixEval MMLU,57.1,[],mixeval_240829.csv qwen1_5_4b_chat,MixEval MMLU,61.4,[],mixeval_240829.csv jetmoe_8b_chat,MixEval MMLU,58.5,[],mixeval_240829.csv mpt_7b_chat,MixEval MMLU,37.8,[],mixeval_240829.csv llama3_70b,MixEval DROP,81.5,[],mixeval_240829.csv qwen1_5_72b,MixEval DROP,64.5,[],mixeval_240829.csv yi_34b,MixEval DROP,78.2,[],mixeval_240829.csv qwen1_5_32b,MixEval DROP,68.7,[],mixeval_240829.csv mixtral_8x7b,MixEval DROP,69.8,[],mixeval_240829.csv llama_2_70b,MixEval DROP,73.2,[],mixeval_240829.csv qwen1_5_moe_a2_7b,MixEval DROP,59.9,[],mixeval_240829.csv qwen1_5_7b,MixEval DROP,63.6,[],mixeval_240829.csv llama3_8b,MixEval DROP,63.8,[],mixeval_240829.csv 
mistral_7b,MixEval DROP,61.3,[],mixeval_240829.csv gemma_7b,MixEval DROP,63.8,[],mixeval_240829.csv yi_6b,MixEval DROP,51.4,[],mixeval_240829.csv qwen1_5_4b,MixEval DROP,51.0,[],mixeval_240829.csv jetmoe_8b,MixEval DROP,44.1,[],mixeval_240829.csv deepseek_7b,MixEval DROP,43.5,[],mixeval_240829.csv phi_2,MixEval DROP,50.4,[],mixeval_240829.csv deepseekmoe_16b,MixEval DROP,41.1,[],mixeval_240829.csv llama_2_7b,MixEval DROP,37.6,[],mixeval_240829.csv gemma_2b,MixEval DROP,32.6,[],mixeval_240829.csv olmo_7b,MixEval DROP,24.0,[],mixeval_240829.csv mpt_7b,MixEval DROP,26.8,[],mixeval_240829.csv claude_3_5_sonnet_0620,MixEval DROP,93.7,[],mixeval_240829.csv gpt_4o_2024_05_13,MixEval DROP,87.9,[],mixeval_240829.csv claude_3_opus,MixEval DROP,91.5,[],mixeval_240829.csv gpt_4_turbo_2024_04_09,MixEval DROP,91.0,[],mixeval_240829.csv gemini_1_5_pro_api_0409,MixEval DROP,84.2,[],mixeval_240829.csv gemini_1_5_pro_api_0514,MixEval DROP,82.5,[],mixeval_240829.csv mistral_large_2,MixEval DROP,89.3,[],mixeval_240829.csv yi_large_preview,MixEval DROP,87.0,[],mixeval_240829.csv llama3_70b_instruct,MixEval DROP,90.1,[],mixeval_240829.csv qwen_max_0428,MixEval DROP,85.4,[],mixeval_240829.csv claude_3_sonnet,MixEval DROP,87.7,[],mixeval_240829.csv reka_core_20240415,MixEval DROP,88.1,[],mixeval_240829.csv mammoth2_8x7b_plus,MixEval DROP,85.7,[],mixeval_240829.csv deepseek_v2,MixEval DROP,85.3,[],mixeval_240829.csv gpt_4o_mini,MixEval DROP,87.7,[],mixeval_240829.csv command_r_plus,MixEval DROP,80.4,[],mixeval_240829.csv yi_1_5_34b_chat,MixEval DROP,87.0,[],mixeval_240829.csv mistral_large,MixEval DROP,88.6,[],mixeval_240829.csv qwen1_5_72b_chat,MixEval DROP,85.1,[],mixeval_240829.csv mistral_medium,MixEval DROP,83.2,[],mixeval_240829.csv gemini_1_0_pro,MixEval DROP,82.6,[],mixeval_240829.csv reka_flash_20240226,MixEval DROP,86.7,[],mixeval_240829.csv mistral_small,MixEval DROP,86.1,[],mixeval_240829.csv llama3_8b_instruct,MixEval DROP,86.4,[],mixeval_240829.csv command_r,MixEval DROP,72.0,[],mixeval_240829.csv qwen1_5_32b_chat,MixEval DROP,82.9,[],mixeval_240829.csv gpt_3_5_turbo_0125,MixEval DROP,84.8,[],mixeval_240829.csv claude_3_haiku,MixEval DROP,85.0,[],mixeval_240829.csv yi_34b_chat,MixEval DROP,86.1,[],mixeval_240829.csv mixtral_8x7b_instruct_v0_1,MixEval DROP,79.5,[],mixeval_240829.csv starling_lm_7b_beta,MixEval DROP,86.4,[],mixeval_240829.csv yi_1_5_9b_chat,MixEval DROP,83.9,[],mixeval_240829.csv gemma_1_1_7b_it,MixEval DROP,80.6,[],mixeval_240829.csv vicuna_33b_v1_3,MixEval DROP,71.4,[],mixeval_240829.csv llama_2_70b_chat,MixEval DROP,79.8,[],mixeval_240829.csv map_neo_instruct_v0_1,MixEval DROP,75.5,[],mixeval_240829.csv mistral_7b_instruct_v0_2,MixEval DROP,72.8,[],mixeval_240829.csv qwen1_5_7b_chat,MixEval DROP,76.4,[],mixeval_240829.csv reka_edge_20240208,MixEval DROP,80.0,[],mixeval_240829.csv zephyr_7b_beta,MixEval DROP,77.3,[],mixeval_240829.csv llama_2_7b_chat,MixEval DROP,69.3,[],mixeval_240829.csv yi_6b_chat,MixEval DROP,70.5,[],mixeval_240829.csv qwen1_5_moe_a2_7b_chat,MixEval DROP,64.6,[],mixeval_240829.csv gemma_1_1_2b_it,MixEval DROP,59.8,[],mixeval_240829.csv vicuna_7b_v1_5,MixEval DROP,68.3,[],mixeval_240829.csv olmo_7b_instruct,MixEval DROP,53.1,[],mixeval_240829.csv qwen1_5_4b_chat,MixEval DROP,57.2,[],mixeval_240829.csv jetmoe_8b_chat,MixEval DROP,27.0,[],mixeval_240829.csv mpt_7b_chat,MixEval DROP,50.0,[],mixeval_240829.csv llama3_70b,MixEval HellaSwag,90.9,[],mixeval_240829.csv qwen1_5_72b,MixEval HellaSwag,91.9,[],mixeval_240829.csv yi_34b,MixEval 
HellaSwag,98.0,[],mixeval_240829.csv qwen1_5_32b,MixEval HellaSwag,93.3,[],mixeval_240829.csv mixtral_8x7b,MixEval HellaSwag,73.7,[],mixeval_240829.csv llama_2_70b,MixEval HellaSwag,63.0,[],mixeval_240829.csv qwen1_5_moe_a2_7b,MixEval HellaSwag,80.1,[],mixeval_240829.csv qwen1_5_7b,MixEval HellaSwag,83.8,[],mixeval_240829.csv llama3_8b,MixEval HellaSwag,51.5,[],mixeval_240829.csv mistral_7b,MixEval HellaSwag,54.5,[],mixeval_240829.csv gemma_7b,MixEval HellaSwag,36.0,[],mixeval_240829.csv yi_6b,MixEval HellaSwag,77.4,[],mixeval_240829.csv qwen1_5_4b,MixEval HellaSwag,65.7,[],mixeval_240829.csv jetmoe_8b,MixEval HellaSwag,89.2,[],mixeval_240829.csv deepseek_7b,MixEval HellaSwag,35.0,[],mixeval_240829.csv phi_2,MixEval HellaSwag,20.2,[],mixeval_240829.csv deepseekmoe_16b,MixEval HellaSwag,28.6,[],mixeval_240829.csv llama_2_7b,MixEval HellaSwag,24.9,[],mixeval_240829.csv gemma_2b,MixEval HellaSwag,33.3,[],mixeval_240829.csv olmo_7b,MixEval HellaSwag,26.9,[],mixeval_240829.csv mpt_7b,MixEval HellaSwag,19.2,[],mixeval_240829.csv claude_3_5_sonnet_0620,MixEval HellaSwag,94.6,[],mixeval_240829.csv gpt_4o_2024_05_13,MixEval HellaSwag,94.3,[],mixeval_240829.csv claude_3_opus,MixEval HellaSwag,93.3,[],mixeval_240829.csv gpt_4_turbo_2024_04_09,MixEval HellaSwag,92.6,[],mixeval_240829.csv gemini_1_5_pro_api_0409,MixEval HellaSwag,89.2,[],mixeval_240829.csv gemini_1_5_pro_api_0514,MixEval HellaSwag,91.2,[],mixeval_240829.csv mistral_large_2,MixEval HellaSwag,80.1,[],mixeval_240829.csv yi_large_preview,MixEval HellaSwag,92.6,[],mixeval_240829.csv llama3_70b_instruct,MixEval HellaSwag,81.8,[],mixeval_240829.csv qwen_max_0428,MixEval HellaSwag,93.6,[],mixeval_240829.csv claude_3_sonnet,MixEval HellaSwag,85.9,[],mixeval_240829.csv reka_core_20240415,MixEval HellaSwag,88.6,[],mixeval_240829.csv mammoth2_8x7b_plus,MixEval HellaSwag,82.2,[],mixeval_240829.csv deepseek_v2,MixEval HellaSwag,88.2,[],mixeval_240829.csv gpt_4o_mini,MixEval HellaSwag,83.8,[],mixeval_240829.csv command_r_plus,MixEval HellaSwag,83.5,[],mixeval_240829.csv yi_1_5_34b_chat,MixEval HellaSwag,90.2,[],mixeval_240829.csv mistral_large,MixEval HellaSwag,65.0,[],mixeval_240829.csv qwen1_5_72b_chat,MixEval HellaSwag,87.9,[],mixeval_240829.csv mistral_medium,MixEval HellaSwag,72.4,[],mixeval_240829.csv gemini_1_0_pro,MixEval HellaSwag,74.7,[],mixeval_240829.csv reka_flash_20240226,MixEval HellaSwag,90.6,[],mixeval_240829.csv mistral_small,MixEval HellaSwag,73.4,[],mixeval_240829.csv llama3_8b_instruct,MixEval HellaSwag,65.7,[],mixeval_240829.csv command_r,MixEval HellaSwag,75.8,[],mixeval_240829.csv qwen1_5_32b_chat,MixEval HellaSwag,85.9,[],mixeval_240829.csv gpt_3_5_turbo_0125,MixEval HellaSwag,63.0,[],mixeval_240829.csv claude_3_haiku,MixEval HellaSwag,75.8,[],mixeval_240829.csv yi_34b_chat,MixEval HellaSwag,86.9,[],mixeval_240829.csv mixtral_8x7b_instruct_v0_1,MixEval HellaSwag,54.2,[],mixeval_240829.csv starling_lm_7b_beta,MixEval HellaSwag,48.5,[],mixeval_240829.csv yi_1_5_9b_chat,MixEval HellaSwag,86.5,[],mixeval_240829.csv gemma_1_1_7b_it,MixEval HellaSwag,66.3,[],mixeval_240829.csv vicuna_33b_v1_3,MixEval HellaSwag,30.3,[],mixeval_240829.csv llama_2_70b_chat,MixEval HellaSwag,67.3,[],mixeval_240829.csv map_neo_instruct_v0_1,MixEval HellaSwag,74.4,[],mixeval_240829.csv mistral_7b_instruct_v0_2,MixEval HellaSwag,54.2,[],mixeval_240829.csv qwen1_5_7b_chat,MixEval HellaSwag,76.1,[],mixeval_240829.csv reka_edge_20240208,MixEval HellaSwag,74.7,[],mixeval_240829.csv zephyr_7b_beta,MixEval HellaSwag,39.1,[],mixeval_240829.csv 
llama_2_7b_chat,MixEval HellaSwag,35.7,[],mixeval_240829.csv yi_6b_chat,MixEval HellaSwag,52.5,[],mixeval_240829.csv qwen1_5_moe_a2_7b_chat,MixEval HellaSwag,72.7,[],mixeval_240829.csv gemma_1_1_2b_it,MixEval HellaSwag,26.6,[],mixeval_240829.csv vicuna_7b_v1_5,MixEval HellaSwag,24.9,[],mixeval_240829.csv olmo_7b_instruct,MixEval HellaSwag,55.9,[],mixeval_240829.csv qwen1_5_4b_chat,MixEval HellaSwag,54.9,[],mixeval_240829.csv jetmoe_8b_chat,MixEval HellaSwag,86.2,[],mixeval_240829.csv mpt_7b_chat,MixEval HellaSwag,25.6,[],mixeval_240829.csv llama3_70b,MixEval CommonsenseQA,85.4,[],mixeval_240829.csv qwen1_5_72b,MixEval CommonsenseQA,87.3,[],mixeval_240829.csv yi_34b,MixEval CommonsenseQA,81.1,[],mixeval_240829.csv qwen1_5_32b,MixEval CommonsenseQA,89.2,[],mixeval_240829.csv mixtral_8x7b,MixEval CommonsenseQA,77.4,[],mixeval_240829.csv llama_2_70b,MixEval CommonsenseQA,77.4,[],mixeval_240829.csv qwen1_5_moe_a2_7b,MixEval CommonsenseQA,80.2,[],mixeval_240829.csv qwen1_5_7b,MixEval CommonsenseQA,84.4,[],mixeval_240829.csv llama3_8b,MixEval CommonsenseQA,69.8,[],mixeval_240829.csv mistral_7b,MixEval CommonsenseQA,67.9,[],mixeval_240829.csv gemma_7b,MixEval CommonsenseQA,68.4,[],mixeval_240829.csv yi_6b,MixEval CommonsenseQA,76.4,[],mixeval_240829.csv qwen1_5_4b,MixEval CommonsenseQA,79.2,[],mixeval_240829.csv jetmoe_8b,MixEval CommonsenseQA,60.4,[],mixeval_240829.csv deepseek_7b,MixEval CommonsenseQA,51.4,[],mixeval_240829.csv phi_2,MixEval CommonsenseQA,68.9,[],mixeval_240829.csv deepseekmoe_16b,MixEval CommonsenseQA,48.6,[],mixeval_240829.csv llama_2_7b,MixEval CommonsenseQA,30.7,[],mixeval_240829.csv gemma_2b,MixEval CommonsenseQA,31.6,[],mixeval_240829.csv olmo_7b,MixEval CommonsenseQA,25.5,[],mixeval_240829.csv mpt_7b,MixEval CommonsenseQA,28.8,[],mixeval_240829.csv claude_3_5_sonnet_0620,MixEval CommonsenseQA,85.4,[],mixeval_240829.csv gpt_4o_2024_05_13,MixEval CommonsenseQA,86.8,[],mixeval_240829.csv claude_3_opus,MixEval CommonsenseQA,87.7,[],mixeval_240829.csv gpt_4_turbo_2024_04_09,MixEval CommonsenseQA,85.4,[],mixeval_240829.csv gemini_1_5_pro_api_0409,MixEval CommonsenseQA,84.4,[],mixeval_240829.csv gemini_1_5_pro_api_0514,MixEval CommonsenseQA,82.5,[],mixeval_240829.csv mistral_large_2,MixEval CommonsenseQA,81.6,[],mixeval_240829.csv yi_large_preview,MixEval CommonsenseQA,90.1,[],mixeval_240829.csv llama3_70b_instruct,MixEval CommonsenseQA,83.0,[],mixeval_240829.csv qwen_max_0428,MixEval CommonsenseQA,88.2,[],mixeval_240829.csv claude_3_sonnet,MixEval CommonsenseQA,82.5,[],mixeval_240829.csv reka_core_20240415,MixEval CommonsenseQA,81.6,[],mixeval_240829.csv mammoth2_8x7b_plus,MixEval CommonsenseQA,82.5,[],mixeval_240829.csv deepseek_v2,MixEval CommonsenseQA,84.0,[],mixeval_240829.csv gpt_4o_mini,MixEval CommonsenseQA,84.9,[],mixeval_240829.csv command_r_plus,MixEval CommonsenseQA,82.1,[],mixeval_240829.csv yi_1_5_34b_chat,MixEval CommonsenseQA,86.8,[],mixeval_240829.csv mistral_large,MixEval CommonsenseQA,83.5,[],mixeval_240829.csv qwen1_5_72b_chat,MixEval CommonsenseQA,86.3,[],mixeval_240829.csv mistral_medium,MixEval CommonsenseQA,82.5,[],mixeval_240829.csv gemini_1_0_pro,MixEval CommonsenseQA,80.2,[],mixeval_240829.csv reka_flash_20240226,MixEval CommonsenseQA,80.7,[],mixeval_240829.csv mistral_small,MixEval CommonsenseQA,77.8,[],mixeval_240829.csv llama3_8b_instruct,MixEval CommonsenseQA,78.3,[],mixeval_240829.csv command_r,MixEval CommonsenseQA,77.4,[],mixeval_240829.csv qwen1_5_32b_chat,MixEval CommonsenseQA,88.2,[],mixeval_240829.csv gpt_3_5_turbo_0125,MixEval 
CommonsenseQA,81.6,[],mixeval_240829.csv claude_3_haiku,MixEval CommonsenseQA,78.8,[],mixeval_240829.csv yi_34b_chat,MixEval CommonsenseQA,78.8,[],mixeval_240829.csv mixtral_8x7b_instruct_v0_1,MixEval CommonsenseQA,77.4,[],mixeval_240829.csv starling_lm_7b_beta,MixEval CommonsenseQA,84.9,[],mixeval_240829.csv yi_1_5_9b_chat,MixEval CommonsenseQA,82.5,[],mixeval_240829.csv gemma_1_1_7b_it,MixEval CommonsenseQA,73.6,[],mixeval_240829.csv vicuna_33b_v1_3,MixEval CommonsenseQA,61.8,[],mixeval_240829.csv llama_2_70b_chat,MixEval CommonsenseQA,74.1,[],mixeval_240829.csv map_neo_instruct_v0_1,MixEval CommonsenseQA,82.1,[],mixeval_240829.csv mistral_7b_instruct_v0_2,MixEval CommonsenseQA,66.0,[],mixeval_240829.csv qwen1_5_7b_chat,MixEval CommonsenseQA,82.1,[],mixeval_240829.csv reka_edge_20240208,MixEval CommonsenseQA,80.7,[],mixeval_240829.csv zephyr_7b_beta,MixEval CommonsenseQA,69.3,[],mixeval_240829.csv llama_2_7b_chat,MixEval CommonsenseQA,61.3,[],mixeval_240829.csv yi_6b_chat,MixEval CommonsenseQA,69.8,[],mixeval_240829.csv qwen1_5_moe_a2_7b_chat,MixEval CommonsenseQA,81.1,[],mixeval_240829.csv gemma_1_1_2b_it,MixEval CommonsenseQA,57.1,[],mixeval_240829.csv vicuna_7b_v1_5,MixEval CommonsenseQA,62.7,[],mixeval_240829.csv olmo_7b_instruct,MixEval CommonsenseQA,64.6,[],mixeval_240829.csv qwen1_5_4b_chat,MixEval CommonsenseQA,74.1,[],mixeval_240829.csv jetmoe_8b_chat,MixEval CommonsenseQA,68.4,[],mixeval_240829.csv mpt_7b_chat,MixEval CommonsenseQA,36.3,[],mixeval_240829.csv llama3_70b,MixEval TriviaQA Hard,59.1,[],mixeval_240829.csv qwen1_5_72b,MixEval TriviaQA Hard,41.4,[],mixeval_240829.csv yi_34b,MixEval TriviaQA Hard,39.4,[],mixeval_240829.csv qwen1_5_32b,MixEval TriviaQA Hard,28.0,[],mixeval_240829.csv mixtral_8x7b,MixEval TriviaQA Hard,44.1,[],mixeval_240829.csv llama_2_70b,MixEval TriviaQA Hard,53.8,[],mixeval_240829.csv qwen1_5_moe_a2_7b,MixEval TriviaQA Hard,36.0,[],mixeval_240829.csv qwen1_5_7b,MixEval TriviaQA Hard,31.6,[],mixeval_240829.csv llama3_8b,MixEval TriviaQA Hard,22.6,[],mixeval_240829.csv mistral_7b,MixEval TriviaQA Hard,24.2,[],mixeval_240829.csv gemma_7b,MixEval TriviaQA Hard,31.1,[],mixeval_240829.csv yi_6b,MixEval TriviaQA Hard,17.0,[],mixeval_240829.csv qwen1_5_4b,MixEval TriviaQA Hard,14.0,[],mixeval_240829.csv jetmoe_8b,MixEval TriviaQA Hard,22.8,[],mixeval_240829.csv deepseek_7b,MixEval TriviaQA Hard,21.4,[],mixeval_240829.csv phi_2,MixEval TriviaQA Hard,7.3,[],mixeval_240829.csv deepseekmoe_16b,MixEval TriviaQA Hard,24.9,[],mixeval_240829.csv llama_2_7b,MixEval TriviaQA Hard,19.5,[],mixeval_240829.csv gemma_2b,MixEval TriviaQA Hard,12.1,[],mixeval_240829.csv olmo_7b,MixEval TriviaQA Hard,16.0,[],mixeval_240829.csv mpt_7b,MixEval TriviaQA Hard,6.6,[],mixeval_240829.csv claude_3_5_sonnet_0620,MixEval TriviaQA Hard,73.3,[],mixeval_240829.csv llama3_1_405b_instruct,MixEval TriviaQA Hard,72.0,[],mixeval_240829.csv gpt_4o_2024_05_13,MixEval TriviaQA Hard,70.3,[],mixeval_240829.csv claude_3_opus,MixEval TriviaQA Hard,71.4,[],mixeval_240829.csv gpt_4_turbo_2024_04_09,MixEval TriviaQA Hard,73.1,[],mixeval_240829.csv gemini_1_5_pro_api_0409,MixEval TriviaQA Hard,67.8,[],mixeval_240829.csv gemini_1_5_pro_api_0514,MixEval TriviaQA Hard,59.4,[],mixeval_240829.csv mistral_large_2,MixEval TriviaQA Hard,64.8,[],mixeval_240829.csv yi_large_preview,MixEval TriviaQA Hard,55.4,[],mixeval_240829.csv llama3_70b_instruct,MixEval TriviaQA Hard,60.5,[],mixeval_240829.csv qwen_max_0428,MixEval TriviaQA Hard,61.5,[],mixeval_240829.csv claude_3_sonnet,MixEval TriviaQA 
Hard,59.1,[],mixeval_240829.csv reka_core_20240415,MixEval TriviaQA Hard,51.6,[],mixeval_240829.csv mammoth2_8x7b_plus,MixEval TriviaQA Hard,52.9,[],mixeval_240829.csv deepseek_v2,MixEval TriviaQA Hard,51.7,[],mixeval_240829.csv gpt_4o_mini,MixEval TriviaQA Hard,45.3,[],mixeval_240829.csv command_r_plus,MixEval TriviaQA Hard,57.5,[],mixeval_240829.csv yi_1_5_34b_chat,MixEval TriviaQA Hard,44.4,[],mixeval_240829.csv mistral_large,MixEval TriviaQA Hard,55.5,[],mixeval_240829.csv qwen1_5_72b_chat,MixEval TriviaQA Hard,49.9,[],mixeval_240829.csv mistral_medium,MixEval TriviaQA Hard,59.8,[],mixeval_240829.csv gemini_1_0_pro,MixEval TriviaQA Hard,58.2,[],mixeval_240829.csv reka_flash_20240226,MixEval TriviaQA Hard,42.9,[],mixeval_240829.csv mistral_small,MixEval TriviaQA Hard,56.0,[],mixeval_240829.csv llama3_8b_instruct,MixEval TriviaQA Hard,40.2,[],mixeval_240829.csv command_r,MixEval TriviaQA Hard,57.0,[],mixeval_240829.csv qwen1_5_32b_chat,MixEval TriviaQA Hard,39.1,[],mixeval_240829.csv gpt_3_5_turbo_0125,MixEval TriviaQA Hard,46.4,[],mixeval_240829.csv claude_3_haiku,MixEval TriviaQA Hard,42.4,[],mixeval_240829.csv yi_34b_chat,MixEval TriviaQA Hard,41.5,[],mixeval_240829.csv mixtral_8x7b_instruct_v0_1,MixEval TriviaQA Hard,48.5,[],mixeval_240829.csv starling_lm_7b_beta,MixEval TriviaQA Hard,33.4,[],mixeval_240829.csv yi_1_5_9b_chat,MixEval TriviaQA Hard,23.3,[],mixeval_240829.csv gemma_1_1_7b_it,MixEval TriviaQA Hard,30.3,[],mixeval_240829.csv vicuna_33b_v1_3,MixEval TriviaQA Hard,42.5,[],mixeval_240829.csv llama_2_70b_chat,MixEval TriviaQA Hard,42.2,[],mixeval_240829.csv map_neo_instruct_v0_1,MixEval TriviaQA Hard,26.5,[],mixeval_240829.csv mistral_7b_instruct_v0_2,MixEval TriviaQA Hard,33.5,[],mixeval_240829.csv qwen1_5_7b_chat,MixEval TriviaQA Hard,29.0,[],mixeval_240829.csv reka_edge_20240208,MixEval TriviaQA Hard,18.6,[],mixeval_240829.csv zephyr_7b_beta,MixEval TriviaQA Hard,30.2,[],mixeval_240829.csv llama_2_7b_chat,MixEval TriviaQA Hard,24.8,[],mixeval_240829.csv yi_6b_chat,MixEval TriviaQA Hard,18.9,[],mixeval_240829.csv qwen1_5_moe_a2_7b_chat,MixEval TriviaQA Hard,21.9,[],mixeval_240829.csv gemma_1_1_2b_it,MixEval TriviaQA Hard,31.9,[],mixeval_240829.csv vicuna_7b_v1_5,MixEval TriviaQA Hard,25.9,[],mixeval_240829.csv olmo_7b_instruct,MixEval TriviaQA Hard,24.7,[],mixeval_240829.csv qwen1_5_4b_chat,MixEval TriviaQA Hard,16.5,[],mixeval_240829.csv jetmoe_8b_chat,MixEval TriviaQA Hard,19.2,[],mixeval_240829.csv mpt_7b_chat,MixEval TriviaQA Hard,17.5,[],mixeval_240829.csv llama3_70b,MixEval MMLU Hard,39.8,[],mixeval_240829.csv qwen1_5_72b,MixEval MMLU Hard,42.4,[],mixeval_240829.csv yi_34b,MixEval MMLU Hard,42.4,[],mixeval_240829.csv qwen1_5_32b,MixEval MMLU Hard,37.2,[],mixeval_240829.csv mixtral_8x7b,MixEval MMLU Hard,34.6,[],mixeval_240829.csv llama_2_70b,MixEval MMLU Hard,29.0,[],mixeval_240829.csv qwen1_5_moe_a2_7b,MixEval MMLU Hard,30.7,[],mixeval_240829.csv qwen1_5_7b,MixEval MMLU Hard,28.6,[],mixeval_240829.csv llama3_8b,MixEval MMLU Hard,38.5,[],mixeval_240829.csv mistral_7b,MixEval MMLU Hard,27.7,[],mixeval_240829.csv gemma_7b,MixEval MMLU Hard,28.1,[],mixeval_240829.csv yi_6b,MixEval MMLU Hard,37.2,[],mixeval_240829.csv qwen1_5_4b,MixEval MMLU Hard,22.9,[],mixeval_240829.csv jetmoe_8b,MixEval MMLU Hard,27.3,[],mixeval_240829.csv deepseek_7b,MixEval MMLU Hard,26.4,[],mixeval_240829.csv phi_2,MixEval MMLU Hard,29.0,[],mixeval_240829.csv deepseekmoe_16b,MixEval MMLU Hard,30.7,[],mixeval_240829.csv llama_2_7b,MixEval MMLU Hard,24.7,[],mixeval_240829.csv gemma_2b,MixEval MMLU 
Hard,27.3,[],mixeval_240829.csv olmo_7b,MixEval MMLU Hard,25.1,[],mixeval_240829.csv mpt_7b,MixEval MMLU Hard,24.2,[],mixeval_240829.csv claude_3_5_sonnet_0620,MixEval MMLU Hard,58.4,[],mixeval_240829.csv llama3_1_405b_instruct,MixEval MMLU Hard,57.1,[],mixeval_240829.csv gpt_4o_2024_05_13,MixEval MMLU Hard,57.1,[],mixeval_240829.csv claude_3_opus,MixEval MMLU Hard,55.0,[],mixeval_240829.csv gpt_4_turbo_2024_04_09,MixEval MMLU Hard,45.5,[],mixeval_240829.csv gemini_1_5_pro_api_0409,MixEval MMLU Hard,44.6,[],mixeval_240829.csv gemini_1_5_pro_api_0514,MixEval MMLU Hard,54.5,[],mixeval_240829.csv mistral_large_2,MixEval MMLU Hard,42.9,[],mixeval_240829.csv yi_large_preview,MixEval MMLU Hard,48.5,[],mixeval_240829.csv llama3_70b_instruct,MixEval MMLU Hard,46.3,[],mixeval_240829.csv qwen_max_0428,MixEval MMLU Hard,41.6,[],mixeval_240829.csv claude_3_sonnet,MixEval MMLU Hard,40.7,[],mixeval_240829.csv reka_core_20240415,MixEval MMLU Hard,46.3,[],mixeval_240829.csv mammoth2_8x7b_plus,MixEval MMLU Hard,41.1,[],mixeval_240829.csv deepseek_v2,MixEval MMLU Hard,42.0,[],mixeval_240829.csv gpt_4o_mini,MixEval MMLU Hard,45.0,[],mixeval_240829.csv command_r_plus,MixEval MMLU Hard,42.0,[],mixeval_240829.csv yi_1_5_34b_chat,MixEval MMLU Hard,38.1,[],mixeval_240829.csv mistral_large,MixEval MMLU Hard,42.4,[],mixeval_240829.csv qwen1_5_72b_chat,MixEval MMLU Hard,37.7,[],mixeval_240829.csv mistral_medium,MixEval MMLU Hard,38.5,[],mixeval_240829.csv gemini_1_0_pro,MixEval MMLU Hard,35.5,[],mixeval_240829.csv reka_flash_20240226,MixEval MMLU Hard,34.6,[],mixeval_240829.csv mistral_small,MixEval MMLU Hard,33.8,[],mixeval_240829.csv llama3_8b_instruct,MixEval MMLU Hard,40.7,[],mixeval_240829.csv command_r,MixEval MMLU Hard,39.0,[],mixeval_240829.csv qwen1_5_32b_chat,MixEval MMLU Hard,29.9,[],mixeval_240829.csv gpt_3_5_turbo_0125,MixEval MMLU Hard,35.1,[],mixeval_240829.csv claude_3_haiku,MixEval MMLU Hard,30.7,[],mixeval_240829.csv yi_34b_chat,MixEval MMLU Hard,29.9,[],mixeval_240829.csv mixtral_8x7b_instruct_v0_1,MixEval MMLU Hard,37.2,[],mixeval_240829.csv starling_lm_7b_beta,MixEval MMLU Hard,34.2,[],mixeval_240829.csv yi_1_5_9b_chat,MixEval MMLU Hard,36.8,[],mixeval_240829.csv gemma_1_1_7b_it,MixEval MMLU Hard,39.0,[],mixeval_240829.csv vicuna_33b_v1_3,MixEval MMLU Hard,39.4,[],mixeval_240829.csv llama_2_70b_chat,MixEval MMLU Hard,27.7,[],mixeval_240829.csv map_neo_instruct_v0_1,MixEval MMLU Hard,32.5,[],mixeval_240829.csv mistral_7b_instruct_v0_2,MixEval MMLU Hard,29.4,[],mixeval_240829.csv qwen1_5_7b_chat,MixEval MMLU Hard,29.0,[],mixeval_240829.csv reka_edge_20240208,MixEval MMLU Hard,26.4,[],mixeval_240829.csv zephyr_7b_beta,MixEval MMLU Hard,24.2,[],mixeval_240829.csv llama_2_7b_chat,MixEval MMLU Hard,30.3,[],mixeval_240829.csv yi_6b_chat,MixEval MMLU Hard,26.8,[],mixeval_240829.csv qwen1_5_moe_a2_7b_chat,MixEval MMLU Hard,26.8,[],mixeval_240829.csv gemma_1_1_2b_it,MixEval MMLU Hard,30.3,[],mixeval_240829.csv vicuna_7b_v1_5,MixEval MMLU Hard,23.4,[],mixeval_240829.csv olmo_7b_instruct,MixEval MMLU Hard,27.3,[],mixeval_240829.csv qwen1_5_4b_chat,MixEval MMLU Hard,17.3,[],mixeval_240829.csv jetmoe_8b_chat,MixEval MMLU Hard,25.5,[],mixeval_240829.csv mpt_7b_chat,MixEval MMLU Hard,24.7,[],mixeval_240829.csv llama3_70b,MixEval DROP Hard,59.5,[],mixeval_240829.csv qwen1_5_72b,MixEval DROP Hard,26.2,[],mixeval_240829.csv yi_34b,MixEval DROP Hard,56.5,[],mixeval_240829.csv qwen1_5_32b,MixEval DROP Hard,36.9,[],mixeval_240829.csv mixtral_8x7b,MixEval DROP Hard,42.0,[],mixeval_240829.csv llama_2_70b,MixEval DROP 
Hard,46.1,[],mixeval_240829.csv qwen1_5_moe_a2_7b,MixEval DROP Hard,31.0,[],mixeval_240829.csv qwen1_5_7b,MixEval DROP Hard,29.8,[],mixeval_240829.csv llama3_8b,MixEval DROP Hard,37.1,[],mixeval_240829.csv mistral_7b,MixEval DROP Hard,34.5,[],mixeval_240829.csv gemma_7b,MixEval DROP Hard,31.4,[],mixeval_240829.csv yi_6b,MixEval DROP Hard,19.4,[],mixeval_240829.csv qwen1_5_4b,MixEval DROP Hard,24.7,[],mixeval_240829.csv jetmoe_8b,MixEval DROP Hard,19.2,[],mixeval_240829.csv deepseek_7b,MixEval DROP Hard,21.4,[],mixeval_240829.csv phi_2,MixEval DROP Hard,27.1,[],mixeval_240829.csv deepseekmoe_16b,MixEval DROP Hard,12.2,[],mixeval_240829.csv llama_2_7b,MixEval DROP Hard,14.9,[],mixeval_240829.csv gemma_2b,MixEval DROP Hard,13.2,[],mixeval_240829.csv olmo_7b,MixEval DROP Hard,11.1,[],mixeval_240829.csv mpt_7b,MixEval DROP Hard,9.2,[],mixeval_240829.csv claude_3_5_sonnet_0620,MixEval DROP Hard,80.4,[],mixeval_240829.csv llama3_1_405b_instruct,MixEval DROP Hard,69.2,[],mixeval_240829.csv gpt_4o_2024_05_13,MixEval DROP Hard,67.5,[],mixeval_240829.csv claude_3_opus,MixEval DROP Hard,75.2,[],mixeval_240829.csv gpt_4_turbo_2024_04_09,MixEval DROP Hard,71.0,[],mixeval_240829.csv gemini_1_5_pro_api_0409,MixEval DROP Hard,64.8,[],mixeval_240829.csv gemini_1_5_pro_api_0514,MixEval DROP Hard,55.2,[],mixeval_240829.csv mistral_large_2,MixEval DROP Hard,72.0,[],mixeval_240829.csv yi_large_preview,MixEval DROP Hard,63.1,[],mixeval_240829.csv llama3_70b_instruct,MixEval DROP Hard,74.5,[],mixeval_240829.csv qwen_max_0428,MixEval DROP Hard,53.5,[],mixeval_240829.csv claude_3_sonnet,MixEval DROP Hard,66.9,[],mixeval_240829.csv reka_core_20240415,MixEval DROP Hard,66.6,[],mixeval_240829.csv mammoth2_8x7b_plus,MixEval DROP Hard,65.1,[],mixeval_240829.csv deepseek_v2,MixEval DROP Hard,62.8,[],mixeval_240829.csv gpt_4o_mini,MixEval DROP Hard,68.1,[],mixeval_240829.csv command_r_plus,MixEval DROP Hard,65.0,[],mixeval_240829.csv yi_1_5_34b_chat,MixEval DROP Hard,67.4,[],mixeval_240829.csv mistral_large,MixEval DROP Hard,61.6,[],mixeval_240829.csv qwen1_5_72b_chat,MixEval DROP Hard,56.5,[],mixeval_240829.csv mistral_medium,MixEval DROP Hard,47.1,[],mixeval_240829.csv gemini_1_0_pro,MixEval DROP Hard,54.1,[],mixeval_240829.csv reka_flash_20240226,MixEval DROP Hard,65.0,[],mixeval_240829.csv mistral_small,MixEval DROP Hard,52.6,[],mixeval_240829.csv llama3_8b_instruct,MixEval DROP Hard,67.6,[],mixeval_240829.csv command_r,MixEval DROP Hard,42.0,[],mixeval_240829.csv qwen1_5_32b_chat,MixEval DROP Hard,54.4,[],mixeval_240829.csv gpt_3_5_turbo_0125,MixEval DROP Hard,55.4,[],mixeval_240829.csv claude_3_haiku,MixEval DROP Hard,51.5,[],mixeval_240829.csv yi_34b_chat,MixEval DROP Hard,57.1,[],mixeval_240829.csv mixtral_8x7b_instruct_v0_1,MixEval DROP Hard,47.7,[],mixeval_240829.csv starling_lm_7b_beta,MixEval DROP Hard,62.9,[],mixeval_240829.csv yi_1_5_9b_chat,MixEval DROP Hard,61.3,[],mixeval_240829.csv gemma_1_1_7b_it,MixEval DROP Hard,55.1,[],mixeval_240829.csv vicuna_33b_v1_3,MixEval DROP Hard,36.6,[],mixeval_240829.csv llama_2_70b_chat,MixEval DROP Hard,42.2,[],mixeval_240829.csv map_neo_instruct_v0_1,MixEval DROP Hard,42.4,[],mixeval_240829.csv mistral_7b_instruct_v0_2,MixEval DROP Hard,44.3,[],mixeval_240829.csv qwen1_5_7b_chat,MixEval DROP Hard,50.0,[],mixeval_240829.csv reka_edge_20240208,MixEval DROP Hard,56.9,[],mixeval_240829.csv zephyr_7b_beta,MixEval DROP Hard,45.3,[],mixeval_240829.csv llama_2_7b_chat,MixEval DROP Hard,44.3,[],mixeval_240829.csv yi_6b_chat,MixEval DROP Hard,43.7,[],mixeval_240829.csv 
qwen1_5_moe_a2_7b_chat,MixEval DROP Hard,39.5,[],mixeval_240829.csv gemma_1_1_2b_it,MixEval DROP Hard,27.8,[],mixeval_240829.csv vicuna_7b_v1_5,MixEval DROP Hard,33.2,[],mixeval_240829.csv olmo_7b_instruct,MixEval DROP Hard,22.9,[],mixeval_240829.csv qwen1_5_4b_chat,MixEval DROP Hard,28.6,[],mixeval_240829.csv jetmoe_8b_chat,MixEval DROP Hard,11.5,[],mixeval_240829.csv mpt_7b_chat,MixEval DROP Hard,31.0,[],mixeval_240829.csv gpt4,toolbench,68.8,[],toolbench_240829.csv text_davinci_003,toolbench,67.2,[],toolbench_240829.csv gpt_3_5_turbo,toolbench,56.6,[],toolbench_240829.csv text_curie_001,toolbench,10.6,[],toolbench_240829.csv llama_2_70b,toolbench,61.0,[],toolbench_240829.csv llama_2_13b,toolbench,48.8,[],toolbench_240829.csv llama_2_7b,toolbench,39.5,[],toolbench_240829.csv llama_65b,toolbench,55.6,[],toolbench_240829.csv llama30b,toolbench,49.6,[],toolbench_240829.csv llama_13b,toolbench,36.8,[],toolbench_240829.csv llama_13b_alpaca,toolbench,26.9,[],toolbench_240829.csv codellama_7b,toolbench,48.3,[],toolbench_240829.csv codellama_7b_instruct,toolbench,50.5,[],toolbench_240829.csv codellama_7b_python,toolbench,52.2,[],toolbench_240829.csv codellama_13b,toolbench,56.9,[],toolbench_240829.csv codellama_13b_instruct,toolbench,60.5,[],toolbench_240829.csv codellama_13b_python,toolbench,56.3,[],toolbench_240829.csv codellama34b,toolbench,62.9,[],toolbench_240829.csv codellama34b_instruct,toolbench,64.8,[],toolbench_240829.csv codellama34b_python,toolbench,59.2,[],toolbench_240829.csv starcoder,toolbench,49.7,[],toolbench_240829.csv starcoderbase,toolbench,52.2,[],toolbench_240829.csv codegen_16b_nl,toolbench,28.2,[],toolbench_240829.csv codegen_16b_multi,toolbench,28.8,[],toolbench_240829.csv codegen_16b_mono,toolbench,35.6,[],toolbench_240829.csv bloomz,toolbench,27.8,[],toolbench_240829.csv opt_iml_30b,toolbench,14.1,[],toolbench_240829.csv opt_30b,toolbench,13.4,[],toolbench_240829.csv opt_iml_1_3b,toolbench,7.0,[],toolbench_240829.csv opt_1_3b,toolbench,7.5,[],toolbench_240829.csv neox_20b,toolbench,26.4,[],toolbench_240829.csv gpt_neoxt_chat_base_20b,toolbench,22.6,[],toolbench_240829.csv pythia_12b,toolbench,19.5,[],toolbench_240829.csv dolly_v2_12b,toolbench,5.0,[],toolbench_240829.csv pythia_6_9b,toolbench,19.4,[],toolbench_240829.csv pythia_2_8b,toolbench,18.6,[],toolbench_240829.csv pythia_1_4b,toolbench,15.9,[],toolbench_240829.csv stablelm_base_alpha_7b,toolbench,10.8,[],toolbench_240829.csv stablelm_tuned_alpha_7b,toolbench,9.2,[],toolbench_240829.csv stablelm_base_alpha_3b,toolbench,5.2,[],toolbench_240829.csv stablelm_tuned_alpha_3b,toolbench,6.6,[],toolbench_240829.csv llama30b_toolbench,toolbench,50.2,[],toolbench_240829.csv starcoder_toolbench,toolbench,51.7,[],toolbench_240829.csv codegen_16b_mono_toolbench,toolbench,51.6,[],toolbench_240829.csv 01,AlphacaEval v2lc,18.1,,alphacaeval_v2lc_240829.csv 02,AlphacaEval v2lc,32.7,,alphacaeval_v2lc_240829.csv 06,AlphacaEval v2lc,50.0,,alphacaeval_v2lc_240829.csv 09,AlphacaEval v2lc,58.3,,alphacaeval_v2lc_240829.csv 13,AlphacaEval v2lc,57.5,,alphacaeval_v2lc_240829.csv 14,AlphacaEval v2lc,35.3,,alphacaeval_v2lc_240829.csv 18,AlphacaEval v2lc,50.7,,alphacaeval_v2lc_240829.csv 20,AlphacaEval v2lc,52.4,,alphacaeval_v2lc_240829.csv 29,AlphacaEval v2lc,40.5,,alphacaeval_v2lc_240829.csv airoboros_33b,AlphacaEval v2lc,10.7,,alphacaeval_v2lc_240829.csv airoboros_65b,AlphacaEval v2lc,11.0,,alphacaeval_v2lc_240829.csv aligner_2b+claude_3_opus,AlphacaEval v2lc,41.8,,alphacaeval_v2lc_240829.csv aligner_2b+qwen1_5_72b_chat,AlphacaEval 
v2lc,36.7,,alphacaeval_v2lc_240829.csv alpaca_7b,AlphacaEval v2lc,5.9,,alphacaeval_v2lc_240829.csv alpaca_farm_ppo_human_7b,AlphacaEval v2lc,6.4,,alphacaeval_v2lc_240829.csv alpaca_farm_ppo_sim_gpt_4_7b,AlphacaEval v2lc,7.1,,alphacaeval_v2lc_240829.csv baichuan_13b_chat,AlphacaEval v2lc,2.1,,alphacaeval_v2lc_240829.csv baize_v2_13b,AlphacaEval v2lc,7.0,,alphacaeval_v2lc_240829.csv baize_v2_7b,AlphacaEval v2lc,4.4,,alphacaeval_v2lc_240829.csv blendax_ai_gm_l3_v35,AlphacaEval v2lc,73.4,,alphacaeval_v2lc_240829.csv blendax_ai_gm_l6_vo31,AlphacaEval v2lc,76.9,,alphacaeval_v2lc_240829.csv causallm_14b,AlphacaEval v2lc,15.7,,alphacaeval_v2lc_240829.csv chatglm2_6b,AlphacaEval v2lc,4.4,,alphacaeval_v2lc_240829.csv claude,AlphacaEval v2lc,27.3,,alphacaeval_v2lc_240829.csv claude2_alpaca_13b,AlphacaEval v2lc,11.5,,alphacaeval_v2lc_240829.csv claude_2,AlphacaEval v2lc,28.2,,alphacaeval_v2lc_240829.csv claude_2_1,AlphacaEval v2lc,25.3,,alphacaeval_v2lc_240829.csv claude_instant_1_2,AlphacaEval v2lc,25.6,,alphacaeval_v2lc_240829.csv cohere_command,AlphacaEval v2lc,10.9,,alphacaeval_v2lc_240829.csv conifer_7b_dpo,AlphacaEval v2lc,17.1,,alphacaeval_v2lc_240829.csv contextual_ai_kto_mistral_pairrm,AlphacaEval v2lc,29.7,,alphacaeval_v2lc_240829.csv cut_13b,AlphacaEval v2lc,12.2,,alphacaeval_v2lc_240829.csv davinci001,AlphacaEval v2lc,9.0,,alphacaeval_v2lc_240829.csv dbrx_instruct,AlphacaEval v2lc,25.4,,alphacaeval_v2lc_240829.csv deepseek_llm_67b_chat,AlphacaEval v2lc,17.8,,alphacaeval_v2lc_240829.csv deita_7b_v1_0,AlphacaEval v2lc,16.1,,alphacaeval_v2lc_240829.csv dolphin_2_2_1_mistral_7b,AlphacaEval v2lc,13.1,,alphacaeval_v2lc_240829.csv ein_70b_v0_1,AlphacaEval v2lc,35.0,,alphacaeval_v2lc_240829.csv evo_7b,AlphacaEval v2lc,16.5,,alphacaeval_v2lc_240829.csv evo_v2_7b,AlphacaEval v2lc,23.4,,alphacaeval_v2lc_240829.csv expo_+_internlm2_chat_20b,AlphacaEval v2lc,27.2,,alphacaeval_v2lc_240829.csv expo_+_internlm2_chat_7b,AlphacaEval v2lc,22.7,,alphacaeval_v2lc_240829.csv expo_+_llama3_instruct_8b_simpo,AlphacaEval v2lc,45.8,,alphacaeval_v2lc_240829.csv expo_+_sppo_mistral7b_pairrm,AlphacaEval v2lc,31.8,,alphacaeval_v2lc_240829.csv expo_+_starling_lm_7b_alpha,AlphacaEval v2lc,19.5,,alphacaeval_v2lc_240829.csv expo_+_starling_lm_7b_beta,AlphacaEval v2lc,26.4,,alphacaeval_v2lc_240829.csv expo_+_tulu_2_dpo_13b,AlphacaEval v2lc,17.6,,alphacaeval_v2lc_240829.csv expo_+_tulu_2_dpo_70b,AlphacaEval v2lc,25.7,,alphacaeval_v2lc_240829.csv expo_+_tulu_2_dpo_7b,AlphacaEval v2lc,11.7,,alphacaeval_v2lc_240829.csv expo_+_zephyr_7b_alpha,AlphacaEval v2lc,13.6,,alphacaeval_v2lc_240829.csv expo_+_zephyr_7b_beta,AlphacaEval v2lc,14.0,,alphacaeval_v2lc_240829.csv falcon_40b_instruct,AlphacaEval v2lc,5.6,,alphacaeval_v2lc_240829.csv falcon_7b_instruct,AlphacaEval v2lc,4.0,,alphacaeval_v2lc_240829.csv fsfairx_zephyr_chat_v0_1,AlphacaEval v2lc,34.8,,alphacaeval_v2lc_240829.csv gemini_pro,AlphacaEval v2lc,24.4,,alphacaeval_v2lc_240829.csv gemma_2_9b_it_dpo,AlphacaEval v2lc,67.7,,alphacaeval_v2lc_240829.csv gemma_2_9b_it_simpo,AlphacaEval v2lc,72.4,,alphacaeval_v2lc_240829.csv gemma_2_9b_it_wpo_hb,AlphacaEval v2lc,76.7,,alphacaeval_v2lc_240829.csv gemma_instruct_2b,AlphacaEval v2lc,5.4,,alphacaeval_v2lc_240829.csv gemma_instruct_7b,AlphacaEval v2lc,10.4,,alphacaeval_v2lc_240829.csv ghost_7b_alpha,AlphacaEval v2lc,6.9,,alphacaeval_v2lc_240829.csv ghost_8b_beta_d0x5,AlphacaEval v2lc,23.1,,alphacaeval_v2lc_240829.csv gpt_3_5,AlphacaEval v2lc,17.7,,alphacaeval_v2lc_240829.csv gpt_4,AlphacaEval v2lc,38.1,,alphacaeval_v2lc_240829.csv 
gpt_4_adversarial,AlphacaEval v2lc,12.2,,alphacaeval_v2lc_240829.csv guanaco_13b,AlphacaEval v2lc,3.0,,alphacaeval_v2lc_240829.csv guanaco_33b,AlphacaEval v2lc,5.7,,alphacaeval_v2lc_240829.csv guanaco_65b,AlphacaEval v2lc,8.3,,alphacaeval_v2lc_240829.csv guanaco_7b,AlphacaEval v2lc,2.9,,alphacaeval_v2lc_240829.csv higgs_llama3_70b_v2,AlphacaEval v2lc,56.8,,alphacaeval_v2lc_240829.csv humpback_llama2_70b,AlphacaEval v2lc,16.2,,alphacaeval_v2lc_240829.csv humpback_llama_65b,AlphacaEval v2lc,12.8,,alphacaeval_v2lc_240829.csv infinity_instruct_3m_0613_llama3_70b,AlphacaEval v2lc,31.5,,alphacaeval_v2lc_240829.csv infinity_instruct_3m_0613_mistral_7b,AlphacaEval v2lc,25.5,,alphacaeval_v2lc_240829.csv infinity_instruct_3m_0625_llama3_70b,AlphacaEval v2lc,38.0,,alphacaeval_v2lc_240829.csv infinity_instruct_3m_0625_llama3_8b,AlphacaEval v2lc,27.5,,alphacaeval_v2lc_240829.csv infinity_instruct_3m_0625_mistral_7b,AlphacaEval v2lc,31.4,,alphacaeval_v2lc_240829.csv infinity_instruct_3m_0625_qwen2_7b,AlphacaEval v2lc,21.9,,alphacaeval_v2lc_240829.csv infinity_instruct_3m_0625_yi_1_5_9b,AlphacaEval v2lc,20.5,,alphacaeval_v2lc_240829.csv infinity_instruct_7m_gen_llama3_1_70b,AlphacaEval v2lc,46.1,,alphacaeval_v2lc_240829.csv infinity_instruct_7m_gen_llama3_1_8b,AlphacaEval v2lc,33.9,,alphacaeval_v2lc_240829.csv infinity_instruct_7m_gen_mistral_7b,AlphacaEval v2lc,39.7,,alphacaeval_v2lc_240829.csv internlm2_chat_20b,AlphacaEval v2lc,18.7,,alphacaeval_v2lc_240829.csv jinachat,AlphacaEval v2lc,15.9,,alphacaeval_v2lc_240829.csv llama2_chat_13b,AlphacaEval v2lc,8.4,,alphacaeval_v2lc_240829.csv llama2_chat_70b,AlphacaEval v2lc,14.7,,alphacaeval_v2lc_240829.csv llama2_chat_7b,AlphacaEval v2lc,5.4,,alphacaeval_v2lc_240829.csv llama2_chat_7b_evol70k_neft,AlphacaEval v2lc,7.5,,alphacaeval_v2lc_240829.csv llama33b_oasst_rlhf,AlphacaEval v2lc,8.0,,alphacaeval_v2lc_240829.csv llama33b_oasst_sft,AlphacaEval v2lc,9.9,,alphacaeval_v2lc_240829.csv llama3_1_405b_instruct,AlphacaEval v2lc,39.3,,alphacaeval_v2lc_240829.csv llama3_1_70b_instruct,AlphacaEval v2lc,38.1,,alphacaeval_v2lc_240829.csv llama3_1_8b_instruct,AlphacaEval v2lc,20.9,,alphacaeval_v2lc_240829.csv llama3_70b_instruct,AlphacaEval v2lc,34.4,,alphacaeval_v2lc_240829.csv llama3_8b_instruct,AlphacaEval v2lc,22.9,,alphacaeval_v2lc_240829.csv llama3_instruct_8b_simpo,AlphacaEval v2lc,44.7,,alphacaeval_v2lc_240829.csv llama3_instruct_8b_wpo_hb_v2,AlphacaEval v2lc,53.4,,alphacaeval_v2lc_240829.csv llama3_pbm_nova_70b,AlphacaEval v2lc,62.4,,alphacaeval_v2lc_240829.csv lmcocktail_10_7b_v1,AlphacaEval v2lc,19.0,,alphacaeval_v2lc_240829.csv merlinite_7b_aot,AlphacaEval v2lc,31.7,,alphacaeval_v2lc_240829.csv minichat_1_5_3b,AlphacaEval v2lc,7.7,,alphacaeval_v2lc_240829.csv minichat_3b,AlphacaEval v2lc,5.7,,alphacaeval_v2lc_240829.csv minotaur_13b,AlphacaEval v2lc,11.5,,alphacaeval_v2lc_240829.csv mistral_7b_remax_v0_1,AlphacaEval v2lc,20.6,,alphacaeval_v2lc_240829.csv mistral_7b_v0_2,AlphacaEval v2lc,17.1,,alphacaeval_v2lc_240829.csv mistral_7b_v0_3,AlphacaEval v2lc,20.6,,alphacaeval_v2lc_240829.csv mistral_medium,AlphacaEval v2lc,28.6,,alphacaeval_v2lc_240829.csv mistral_orpo_beta,AlphacaEval v2lc,14.7,,alphacaeval_v2lc_240829.csv mixtral_8x22b_v0_1,AlphacaEval v2lc,30.9,,alphacaeval_v2lc_240829.csv mixtral_8x7b_v0_1,AlphacaEval v2lc,23.7,,alphacaeval_v2lc_240829.csv nanbeige2_16b_chat,AlphacaEval v2lc,40.6,,alphacaeval_v2lc_240829.csv nanbeige2_8b_chat,AlphacaEval v2lc,25.2,,alphacaeval_v2lc_240829.csv nanbeige_plus_chat_v0_1,AlphacaEval 
v2lc,44.5,,alphacaeval_v2lc_240829.csv nous_hermes_13b,AlphacaEval v2lc,9.7,,alphacaeval_v2lc_240829.csv openbudddy_llama2_13b_v11_1,AlphacaEval v2lc,9.2,,alphacaeval_v2lc_240829.csv openbudddy_llama2_70b_v10_1,AlphacaEval v2lc,12.6,,alphacaeval_v2lc_240829.csv openbuddy_falcon_40b_v9,AlphacaEval v2lc,9.0,,alphacaeval_v2lc_240829.csv openbuddy_falcon_7b_v6,AlphacaEval v2lc,4.8,,alphacaeval_v2lc_240829.csv openbuddy_llama30b_v7_1,AlphacaEval v2lc,10.2,,alphacaeval_v2lc_240829.csv openbuddy_llama_65b_v8,AlphacaEval v2lc,12.5,,alphacaeval_v2lc_240829.csv openchat8192_13b,AlphacaEval v2lc,7.9,,alphacaeval_v2lc_240829.csv openchat_13b,AlphacaEval v2lc,8.8,,alphacaeval_v2lc_240829.csv openchat_v2_13b,AlphacaEval v2lc,10.4,,alphacaeval_v2lc_240829.csv openchat_v2_w_13b,AlphacaEval v2lc,12.0,,alphacaeval_v2lc_240829.csv openchat_v3_1_13b,AlphacaEval v2lc,14.5,,alphacaeval_v2lc_240829.csv opencoderplus_15b,AlphacaEval v2lc,8.2,,alphacaeval_v2lc_240829.csv openhermes_2_5_mistral_7b,AlphacaEval v2lc,16.2,,alphacaeval_v2lc_240829.csv openpipe_moa_gpt_4_turbo,AlphacaEval v2lc,68.4,,alphacaeval_v2lc_240829.csv pairrm_0_4b+tulu_2+dpo_13b_best_of_16,AlphacaEval v2lc,17.4,,alphacaeval_v2lc_240829.csv pairrm_0_4b+tulu_2+dpo_70b_best_of_16,AlphacaEval v2lc,21.4,,alphacaeval_v2lc_240829.csv pairrm_0_4b+yi_34b_chat_best_of_16,AlphacaEval v2lc,28.8,,alphacaeval_v2lc_240829.csv pairrm_0_4b+zephyr_7b_beta_best_of_16,AlphacaEval v2lc,15.5,,alphacaeval_v2lc_240829.csv phi_2,AlphacaEval v2lc,4.4,,alphacaeval_v2lc_240829.csv phi_2_dpo,AlphacaEval v2lc,7.8,,alphacaeval_v2lc_240829.csv phi_2_sft,AlphacaEval v2lc,5.9,,alphacaeval_v2lc_240829.csv platolm_7b,AlphacaEval v2lc,10.5,,alphacaeval_v2lc_240829.csv pythia_12b_oasst_sft,AlphacaEval v2lc,3.3,,alphacaeval_v2lc_240829.csv pythia_12b_sft,AlphacaEval v2lc,4.2,,alphacaeval_v2lc_240829.csv qwen1_5_110b_chat,AlphacaEval v2lc,43.9,,alphacaeval_v2lc_240829.csv qwen1_5_14b_chat,AlphacaEval v2lc,23.9,,alphacaeval_v2lc_240829.csv qwen1_5_1_8b_chat,AlphacaEval v2lc,2.6,,alphacaeval_v2lc_240829.csv qwen1_5_72b_chat,AlphacaEval v2lc,36.6,,alphacaeval_v2lc_240829.csv qwen1_5_7b_chat,AlphacaEval v2lc,14.7,,alphacaeval_v2lc_240829.csv qwen2_72b_instruct,AlphacaEval v2lc,38.1,,alphacaeval_v2lc_240829.csv qwen_14b_chat,AlphacaEval v2lc,12.4,,alphacaeval_v2lc_240829.csv rebel_llama3_8b_instruct,AlphacaEval v2lc,31.4,,alphacaeval_v2lc_240829.csv recycled_wizardlm_7b_v1_0,AlphacaEval v2lc,6.9,,alphacaeval_v2lc_240829.csv recycled_wizardlm_7b_v2_0,AlphacaEval v2lc,7.5,,alphacaeval_v2lc_240829.csv samba_coe_v0_1,AlphacaEval v2lc,22.9,,alphacaeval_v2lc_240829.csv samba_coe_v0_2,AlphacaEval v2lc,27.6,,alphacaeval_v2lc_240829.csv samba_coe_v0_2_best_of_16,AlphacaEval v2lc,31.5,,alphacaeval_v2lc_240829.csv shopee_slimmoa_v1,AlphacaEval v2lc,77.5,,alphacaeval_v2lc_240829.csv snorkel_mistral_pairrm_dpo,AlphacaEval v2lc,26.4,,alphacaeval_v2lc_240829.csv snorkel_mistral_pairrm_dpo+best_of_16,AlphacaEval v2lc,30.0,,alphacaeval_v2lc_240829.csv sppo_gemma_2_9b_it_pairrm,AlphacaEval v2lc,54.0,,alphacaeval_v2lc_240829.csv sppo_llama3_instruct_8b_pairrm,AlphacaEval v2lc,38.6,,alphacaeval_v2lc_240829.csv sppo_mistral7b_pairrm,AlphacaEval v2lc,30.5,,alphacaeval_v2lc_240829.csv starling_lm_7b_alpha,AlphacaEval v2lc,14.7,,alphacaeval_v2lc_240829.csv storm_7b,AlphacaEval v2lc,50.5,,alphacaeval_v2lc_240829.csv storm_7b_best_of_64,AlphacaEval v2lc,61.6,,alphacaeval_v2lc_240829.csv tempnet_llama2_chat_13b_v0_1,AlphacaEval v2lc,8.6,,alphacaeval_v2lc_240829.csv tempnet_llama2_chat_70b_v0_1,AlphacaEval 
v2lc,15.8,,alphacaeval_v2lc_240829.csv tempnet_llama2_chat_7b_v0_1,AlphacaEval v2lc,5.7,,alphacaeval_v2lc_240829.csv together_moa,AlphacaEval v2lc,65.4,,alphacaeval_v2lc_240829.csv together_moa_lite,AlphacaEval v2lc,59.1,,alphacaeval_v2lc_240829.csv tulu_2+dpo_13b,AlphacaEval v2lc,11.6,,alphacaeval_v2lc_240829.csv tulu_2+dpo_70b,AlphacaEval v2lc,21.2,,alphacaeval_v2lc_240829.csv tulu_2+dpo_7b,AlphacaEval v2lc,9.2,,alphacaeval_v2lc_240829.csv ultralm_13b,AlphacaEval v2lc,7.1,,alphacaeval_v2lc_240829.csv ultralm_13b_best_of_16,AlphacaEval v2lc,9.9,,alphacaeval_v2lc_240829.csv ultralm_13b_v2_0,AlphacaEval v2lc,9.1,,alphacaeval_v2lc_240829.csv ultralm_13b_v2_0_best_of_16,AlphacaEval v2lc,14.2,,alphacaeval_v2lc_240829.csv vicuna_13b,AlphacaEval v2lc,9.2,,alphacaeval_v2lc_240829.csv vicuna_13b_v1_3,AlphacaEval v2lc,10.8,,alphacaeval_v2lc_240829.csv vicuna_13b_v1_5,AlphacaEval v2lc,10.5,,alphacaeval_v2lc_240829.csv vicuna_13b_v1_5_together,AlphacaEval v2lc,11.7,,alphacaeval_v2lc_240829.csv vicuna_33b_v1_3,AlphacaEval v2lc,17.6,,alphacaeval_v2lc_240829.csv vicuna_7b,AlphacaEval v2lc,6.3,,alphacaeval_v2lc_240829.csv vicuna_7b_v1_3,AlphacaEval v2lc,7.2,,alphacaeval_v2lc_240829.csv vicuna_7b_v1_5,AlphacaEval v2lc,7.6,,alphacaeval_v2lc_240829.csv wizardlm_13b,AlphacaEval v2lc,9.8,,alphacaeval_v2lc_240829.csv wizardlm_13b_v1_1,AlphacaEval v2lc,13.9,,alphacaeval_v2lc_240829.csv wizardlm_13b_v1_2,AlphacaEval v2lc,14.5,,alphacaeval_v2lc_240829.csv wizardlm_70b,AlphacaEval v2lc,17.6,,alphacaeval_v2lc_240829.csv xwinlm_13b_v0_1,AlphacaEval v2lc,17.9,,alphacaeval_v2lc_240829.csv xwinlm_70b_v0_1,AlphacaEval v2lc,24.6,,alphacaeval_v2lc_240829.csv xwinlm_7b_v0_1,AlphacaEval v2lc,10.8,,alphacaeval_v2lc_240829.csv yi_34b_chat,AlphacaEval v2lc,27.2,,alphacaeval_v2lc_240829.csv yi_large_preview,AlphacaEval v2lc,51.9,,alphacaeval_v2lc_240829.csv zephyr_7b_alpha,AlphacaEval v2lc,10.3,,alphacaeval_v2lc_240829.csv zephyr_7b_beta,AlphacaEval v2lc,13.2,,alphacaeval_v2lc_240829.csv claude_3_haiku_20240307,HELM AirBench Security Risks,1.0,[],helm_airbench_240916.csv claude_3_sonnet_20240229,HELM AirBench Security Risks,0.9957894736842106,[],helm_airbench_240916.csv llama3_8b_chat,HELM AirBench Security Risks,0.9368421052631579,[],helm_airbench_240916.csv claude_3_opus_20240229,HELM AirBench Security Risks,0.9368421052631579,[],helm_airbench_240916.csv gemini_1_5_pro_001_safety_default,HELM AirBench Security Risks,0.9031578947368422,[],helm_airbench_240916.csv gemini_1_5_flash_001_safety_default,HELM AirBench Security Risks,0.8747368421052631,[],helm_airbench_240916.csv gpt_3_5_turbo_0613,HELM AirBench Security Risks,0.8610526315789473,[],helm_airbench_240916.csv gpt_4_turbo_2024_04_09,HELM AirBench Security Risks,0.8557894736842105,[],helm_airbench_240916.csv llama3_70b_chat,HELM AirBench Security Risks,0.8389473684210527,[],helm_airbench_240916.csv gpt_3_5_turbo_1106,HELM AirBench Security Risks,0.7157894736842105,[],helm_airbench_240916.csv gpt_4o_2024_05_13,HELM AirBench Security Risks,0.6926315789473685,[],helm_airbench_240916.csv gpt_3_5_turbo_0125,HELM AirBench Security Risks,0.5789473684210527,[],helm_airbench_240916.csv qwen1_5_72b_chat,HELM AirBench Security Risks,0.5284210526315789,[],helm_airbench_240916.csv deepseek_llm_67b_chat,HELM AirBench Security Risks,0.5242105263157895,[],helm_airbench_240916.csv yi_34b_chat,HELM AirBench Security Risks,0.46947368421052627,[],helm_airbench_240916.csv mixtral_8x22b_instruct_v0_1,HELM AirBench Security Risks,0.2989473684210525,[],helm_airbench_240916.csv 
mixtral_8x7b_instruct_v0_1,HELM AirBench Security Risks,0.18736842105263152,[],helm_airbench_240916.csv command_r,HELM AirBench Security Risks,0.18210526315789466,[],helm_airbench_240916.csv command_r_plus,HELM AirBench Security Risks,0.13263157894736843,[],helm_airbench_240916.csv mistral_7b_instruct_v0_3,HELM AirBench Security Risks,0.02421052631578935,[],helm_airbench_240916.csv dbrx_instructruct,HELM AirBench Security Risks,0.0,[],helm_airbench_240916.csv claude_3_haiku_20240307,HELM AirBench Operational Misuses,0.569060773480663,[],helm_airbench_240916.csv claude_3_sonnet_20240229,HELM AirBench Operational Misuses,0.7513812154696133,[],helm_airbench_240916.csv llama3_8b_chat,HELM AirBench Operational Misuses,0.3443830570902394,[],helm_airbench_240916.csv claude_3_opus_20240229,HELM AirBench Operational Misuses,0.7440147329650093,[],helm_airbench_240916.csv gemini_1_5_pro_001_safety_default,HELM AirBench Operational Misuses,1.0,[],helm_airbench_240916.csv gemini_1_5_flash_001_safety_default,HELM AirBench Operational Misuses,0.9392265193370166,[],helm_airbench_240916.csv gpt_3_5_turbo_0613,HELM AirBench Operational Misuses,0.6077348066298341,[],helm_airbench_240916.csv gpt_4_turbo_2024_04_09,HELM AirBench Operational Misuses,0.4511970534069981,[],helm_airbench_240916.csv llama3_70b_chat,HELM AirBench Operational Misuses,0.28545119705340694,[],helm_airbench_240916.csv gpt_3_5_turbo_1106,HELM AirBench Operational Misuses,0.4511970534069981,[],helm_airbench_240916.csv gpt_4o_2024_05_13,HELM AirBench Operational Misuses,0.1252302025782689,[],helm_airbench_240916.csv gpt_3_5_turbo_0125,HELM AirBench Operational Misuses,0.20810313075506437,[],helm_airbench_240916.csv qwen1_5_72b_chat,HELM AirBench Operational Misuses,0.20073664825046034,[],helm_airbench_240916.csv deepseek_llm_67b_chat,HELM AirBench Operational Misuses,0.3167587476979742,[],helm_airbench_240916.csv yi_34b_chat,HELM AirBench Operational Misuses,0.34990791896869256,[],helm_airbench_240916.csv mixtral_8x22b_instruct_v0_1,HELM AirBench Operational Misuses,0.2523020257826887,[],helm_airbench_240916.csv mixtral_8x7b_instruct_v0_1,HELM AirBench Operational Misuses,0.11602209944751385,[],helm_airbench_240916.csv command_r,HELM AirBench Operational Misuses,0.0055248618784528025,[],helm_airbench_240916.csv command_r_plus,HELM AirBench Operational Misuses,0.0,[],helm_airbench_240916.csv mistral_7b_instruct_v0_3,HELM AirBench Operational Misuses,0.07366482504604055,[],helm_airbench_240916.csv dbrx_instructruct,HELM AirBench Operational Misuses,0.012891344383056946,[],helm_airbench_240916.csv claude_3_haiku_20240307,HELM AirBench Violence & Extremism,0.9956204379562044,[],helm_airbench_240916.csv claude_3_sonnet_20240229,HELM AirBench Violence & Extremism,1.0,[],helm_airbench_240916.csv llama3_8b_chat,HELM AirBench Violence & Extremism,0.9153284671532846,[],helm_airbench_240916.csv claude_3_opus_20240229,HELM AirBench Violence & Extremism,0.9547445255474453,[],helm_airbench_240916.csv gemini_1_5_pro_001_safety_default,HELM AirBench Violence & Extremism,0.8583941605839416,[],helm_airbench_240916.csv gemini_1_5_flash_001_safety_default,HELM AirBench Violence & Extremism,0.8058394160583942,[],helm_airbench_240916.csv gpt_3_5_turbo_0613,HELM AirBench Violence & Extremism,0.5635036496350364,[],helm_airbench_240916.csv gpt_4_turbo_2024_04_09,HELM AirBench Violence & Extremism,0.7474452554744525,[],helm_airbench_240916.csv llama3_70b_chat,HELM AirBench Violence & Extremism,0.7153284671532847,[],helm_airbench_240916.csv gpt_3_5_turbo_1106,HELM 
AirBench Violence & Extremism,0.3678832116788322,[],helm_airbench_240916.csv gpt_4o_2024_05_13,HELM AirBench Violence & Extremism,0.45839416058394156,[],helm_airbench_240916.csv gpt_3_5_turbo_0125,HELM AirBench Violence & Extremism,0.2583941605839415,[],helm_airbench_240916.csv qwen1_5_72b_chat,HELM AirBench Violence & Extremism,0.3824817518248176,[],helm_airbench_240916.csv deepseek_llm_67b_chat,HELM AirBench Violence & Extremism,0.43795620437956195,[],helm_airbench_240916.csv yi_34b_chat,HELM AirBench Violence & Extremism,0.4131386861313868,[],helm_airbench_240916.csv mixtral_8x22b_instruct_v0_1,HELM AirBench Violence & Extremism,0.16788321167883213,[],helm_airbench_240916.csv mixtral_8x7b_instruct_v0_1,HELM AirBench Violence & Extremism,0.15766423357664228,[],helm_airbench_240916.csv command_r,HELM AirBench Violence & Extremism,0.09635036496350358,[],helm_airbench_240916.csv command_r_plus,HELM AirBench Violence & Extremism,0.03649635036496357,[],helm_airbench_240916.csv mistral_7b_instruct_v0_3,HELM AirBench Violence & Extremism,0.05109489051094884,[],helm_airbench_240916.csv dbrx_instructruct,HELM AirBench Violence & Extremism,0.0,[],helm_airbench_240916.csv claude_3_haiku_20240307,HELM AirBench Hate/Toxicity,1.0,[],helm_airbench_240916.csv claude_3_sonnet_20240229,HELM AirBench Hate/Toxicity,0.9765100671140939,[],helm_airbench_240916.csv llama3_8b_chat,HELM AirBench Hate/Toxicity,0.7802013422818792,[],helm_airbench_240916.csv claude_3_opus_20240229,HELM AirBench Hate/Toxicity,0.9429530201342282,[],helm_airbench_240916.csv gemini_1_5_pro_001_safety_default,HELM AirBench Hate/Toxicity,0.8691275167785235,[],helm_airbench_240916.csv gemini_1_5_flash_001_safety_default,HELM AirBench Hate/Toxicity,0.8204697986577181,[],helm_airbench_240916.csv gpt_3_5_turbo_0613,HELM AirBench Hate/Toxicity,0.6359060402684563,[],helm_airbench_240916.csv gpt_4_turbo_2024_04_09,HELM AirBench Hate/Toxicity,0.8540268456375839,[],helm_airbench_240916.csv llama3_70b_chat,HELM AirBench Hate/Toxicity,0.5436241610738255,[],helm_airbench_240916.csv gpt_3_5_turbo_1106,HELM AirBench Hate/Toxicity,0.3691275167785235,[],helm_airbench_240916.csv gpt_4o_2024_05_13,HELM AirBench Hate/Toxicity,0.546979865771812,[],helm_airbench_240916.csv gpt_3_5_turbo_0125,HELM AirBench Hate/Toxicity,0.23993288590604023,[],helm_airbench_240916.csv qwen1_5_72b_chat,HELM AirBench Hate/Toxicity,0.4731543624161073,[],helm_airbench_240916.csv deepseek_llm_67b_chat,HELM AirBench Hate/Toxicity,0.4832214765100671,[],helm_airbench_240916.csv yi_34b_chat,HELM AirBench Hate/Toxicity,0.46308724832214765,[],helm_airbench_240916.csv mixtral_8x22b_instruct_v0_1,HELM AirBench Hate/Toxicity,0.3959731543624161,[],helm_airbench_240916.csv mixtral_8x7b_instruct_v0_1,HELM AirBench Hate/Toxicity,0.25,[],helm_airbench_240916.csv command_r,HELM AirBench Hate/Toxicity,0.11241610738255048,[],helm_airbench_240916.csv command_r_plus,HELM AirBench Hate/Toxicity,0.0,[],helm_airbench_240916.csv mistral_7b_instruct_v0_3,HELM AirBench Hate/Toxicity,0.25503355704697983,[],helm_airbench_240916.csv dbrx_instructruct,HELM AirBench Hate/Toxicity,0.04865771812080544,[],helm_airbench_240916.csv claude_3_haiku_20240307,HELM AirBench Sexual Content,0.9357798165137614,[],helm_airbench_240916.csv claude_3_sonnet_20240229,HELM AirBench Sexual Content,1.0,[],helm_airbench_240916.csv llama3_8b_chat,HELM AirBench Sexual Content,0.9559633027522936,[],helm_airbench_240916.csv claude_3_opus_20240229,HELM AirBench Sexual Content,0.7137614678899082,[],helm_airbench_240916.csv 
gemini_1_5_pro_001_safety_default,HELM AirBench Sexual Content,0.8091743119266055,[],helm_airbench_240916.csv gemini_1_5_flash_001_safety_default,HELM AirBench Sexual Content,0.7834862385321101,[],helm_airbench_240916.csv gpt_3_5_turbo_0613,HELM AirBench Sexual Content,0.33027522935779796,[],helm_airbench_240916.csv gpt_4_turbo_2024_04_09,HELM AirBench Sexual Content,0.6440366972477063,[],helm_airbench_240916.csv llama3_70b_chat,HELM AirBench Sexual Content,0.4385321100917431,[],helm_airbench_240916.csv gpt_3_5_turbo_1106,HELM AirBench Sexual Content,0.31192660550458695,[],helm_airbench_240916.csv gpt_4o_2024_05_13,HELM AirBench Sexual Content,0.37614678899082554,[],helm_airbench_240916.csv gpt_3_5_turbo_0125,HELM AirBench Sexual Content,0.11376146788990804,[],helm_airbench_240916.csv qwen1_5_72b_chat,HELM AirBench Sexual Content,0.17247706422018338,[],helm_airbench_240916.csv deepseek_llm_67b_chat,HELM AirBench Sexual Content,0.19633027522935764,[],helm_airbench_240916.csv yi_34b_chat,HELM AirBench Sexual Content,0.2807339449541284,[],helm_airbench_240916.csv mixtral_8x22b_instruct_v0_1,HELM AirBench Sexual Content,0.29357798165137616,[],helm_airbench_240916.csv mixtral_8x7b_instruct_v0_1,HELM AirBench Sexual Content,0.17798165137614663,[],helm_airbench_240916.csv command_r,HELM AirBench Sexual Content,0.03119266055045855,[],helm_airbench_240916.csv command_r_plus,HELM AirBench Sexual Content,0.0,[],helm_airbench_240916.csv mistral_7b_instruct_v0_3,HELM AirBench Sexual Content,0.24220183486238522,[],helm_airbench_240916.csv dbrx_instructruct,HELM AirBench Sexual Content,0.08256880733944938,[],helm_airbench_240916.csv claude_3_haiku_20240307,HELM AirBench Child Harm,0.9759797724399495,[],helm_airbench_240916.csv claude_3_sonnet_20240229,HELM AirBench Child Harm,1.0,[],helm_airbench_240916.csv llama3_8b_chat,HELM AirBench Child Harm,0.8192161820480405,[],helm_airbench_240916.csv claude_3_opus_20240229,HELM AirBench Child Harm,0.9279393173198482,[],helm_airbench_240916.csv gemini_1_5_pro_001_safety_default,HELM AirBench Child Harm,0.8735777496839443,[],helm_airbench_240916.csv gemini_1_5_flash_001_safety_default,HELM AirBench Child Harm,0.806573957016435,[],helm_airbench_240916.csv gpt_3_5_turbo_0613,HELM AirBench Child Harm,0.6257901390644753,[],helm_airbench_240916.csv gpt_4_turbo_2024_04_09,HELM AirBench Child Harm,0.6864728192161821,[],helm_airbench_240916.csv llama3_70b_chat,HELM AirBench Child Harm,0.8305941845764855,[],helm_airbench_240916.csv gpt_3_5_turbo_1106,HELM AirBench Child Harm,0.37294563843236417,[],helm_airbench_240916.csv gpt_4o_2024_05_13,HELM AirBench Child Harm,0.47029077117572693,[],helm_airbench_240916.csv gpt_3_5_turbo_0125,HELM AirBench Child Harm,0.21744627054361576,[],helm_airbench_240916.csv qwen1_5_72b_chat,HELM AirBench Child Harm,0.3008849557522124,[],helm_airbench_240916.csv deepseek_llm_67b_chat,HELM AirBench Child Harm,0.35524652338811635,[],helm_airbench_240916.csv yi_34b_chat,HELM AirBench Child Harm,0.37926675094816686,[],helm_airbench_240916.csv mixtral_8x22b_instruct_v0_1,HELM AirBench Child Harm,0.19848293299620734,[],helm_airbench_240916.csv mixtral_8x7b_instruct_v0_1,HELM AirBench Child Harm,0.09608091024020238,[],helm_airbench_240916.csv command_r,HELM AirBench Child Harm,0.1264222503160557,[],helm_airbench_240916.csv command_r_plus,HELM AirBench Child Harm,0.1327433628318585,[],helm_airbench_240916.csv mistral_7b_instruct_v0_3,HELM AirBench Child Harm,0.0,[],helm_airbench_240916.csv dbrx_instructruct,HELM AirBench Child 
Harm,0.0,[],helm_airbench_240916.csv claude_3_haiku_20240307,HELM AirBench Self Harm,0.9695290858725761,[],helm_airbench_240916.csv claude_3_sonnet_20240229,HELM AirBench Self Harm,1.0,[],helm_airbench_240916.csv llama3_8b_chat,HELM AirBench Self Harm,0.8767313019390581,[],helm_airbench_240916.csv claude_3_opus_20240229,HELM AirBench Self Harm,0.9695290858725761,[],helm_airbench_240916.csv gemini_1_5_pro_001_safety_default,HELM AirBench Self Harm,0.8919667590027701,[],helm_airbench_240916.csv gemini_1_5_flash_001_safety_default,HELM AirBench Self Harm,0.9695290858725761,[],helm_airbench_240916.csv gpt_3_5_turbo_0613,HELM AirBench Self Harm,0.8767313019390581,[],helm_airbench_240916.csv gpt_4_turbo_2024_04_09,HELM AirBench Self Harm,0.7839335180055401,[],helm_airbench_240916.csv llama3_70b_chat,HELM AirBench Self Harm,0.8919667590027701,[],helm_airbench_240916.csv gpt_3_5_turbo_1106,HELM AirBench Self Harm,0.554016620498615,[],helm_airbench_240916.csv gpt_4o_2024_05_13,HELM AirBench Self Harm,0.7382271468144044,[],helm_airbench_240916.csv gpt_3_5_turbo_0125,HELM AirBench Self Harm,0.41551246537396125,[],helm_airbench_240916.csv qwen1_5_72b_chat,HELM AirBench Self Harm,0.5069252077562327,[],helm_airbench_240916.csv deepseek_llm_67b_chat,HELM AirBench Self Harm,0.5235457063711911,[],helm_airbench_240916.csv yi_34b_chat,HELM AirBench Self Harm,0.5997229916897506,[],helm_airbench_240916.csv mixtral_8x22b_instruct_v0_1,HELM AirBench Self Harm,0.554016620498615,[],helm_airbench_240916.csv mixtral_8x7b_instruct_v0_1,HELM AirBench Self Harm,0.2617728531855955,[],helm_airbench_240916.csv command_r,HELM AirBench Self Harm,0.19944598337950137,[],helm_airbench_240916.csv command_r_plus,HELM AirBench Self Harm,0.19944598337950137,[],helm_airbench_240916.csv mistral_7b_instruct_v0_3,HELM AirBench Self Harm,0.2770083102493074,[],helm_airbench_240916.csv dbrx_instructruct,HELM AirBench Self Harm,0.0,[],helm_airbench_240916.csv claude_3_haiku_20240307,HELM AirBench Political Usage,0.9892703862660944,[],helm_airbench_240916.csv claude_3_sonnet_20240229,HELM AirBench Political Usage,1.0,[],helm_airbench_240916.csv llama3_8b_chat,HELM AirBench Political Usage,0.8433476394849786,[],helm_airbench_240916.csv claude_3_opus_20240229,HELM AirBench Political Usage,0.9656652360515021,[],helm_airbench_240916.csv gemini_1_5_pro_001_safety_default,HELM AirBench Political Usage,0.8605150214592274,[],helm_airbench_240916.csv gemini_1_5_flash_001_safety_default,HELM AirBench Political Usage,0.8240343347639485,[],helm_airbench_240916.csv gpt_3_5_turbo_0613,HELM AirBench Political Usage,0.5364806866952789,[],helm_airbench_240916.csv gpt_4_turbo_2024_04_09,HELM AirBench Political Usage,0.686695278969957,[],helm_airbench_240916.csv llama3_70b_chat,HELM AirBench Political Usage,0.6695278969957081,[],helm_airbench_240916.csv gpt_3_5_turbo_1106,HELM AirBench Political Usage,0.3798283261802575,[],helm_airbench_240916.csv gpt_4o_2024_05_13,HELM AirBench Political Usage,0.38841201716738194,[],helm_airbench_240916.csv gpt_3_5_turbo_0125,HELM AirBench Political Usage,0.25536480686695284,[],helm_airbench_240916.csv qwen1_5_72b_chat,HELM AirBench Political Usage,0.37231759656652363,[],helm_airbench_240916.csv deepseek_llm_67b_chat,HELM AirBench Political Usage,0.46244635193133043,[],helm_airbench_240916.csv yi_34b_chat,HELM AirBench Political Usage,0.47532188841201717,[],helm_airbench_240916.csv mixtral_8x22b_instruct_v0_1,HELM AirBench Political Usage,0.23175965665236054,[],helm_airbench_240916.csv mixtral_8x7b_instruct_v0_1,HELM 
AirBench Political Usage,0.16630901287553645,[],helm_airbench_240916.csv command_r,HELM AirBench Political Usage,0.1094420600858369,[],helm_airbench_240916.csv command_r_plus,HELM AirBench Political Usage,0.07296137339055786,[],helm_airbench_240916.csv mistral_7b_instruct_v0_3,HELM AirBench Political Usage,0.05793991416309008,[],helm_airbench_240916.csv dbrx_instructruct,HELM AirBench Political Usage,0.0,[],helm_airbench_240916.csv claude_3_haiku_20240307,HELM AirBench Economic Harm,0.9785637727759914,[],helm_airbench_240916.csv claude_3_sonnet_20240229,HELM AirBench Economic Harm,1.0,[],helm_airbench_240916.csv llama3_8b_chat,HELM AirBench Economic Harm,0.8713826366559485,[],helm_airbench_240916.csv claude_3_opus_20240229,HELM AirBench Economic Harm,0.9571275455519829,[],helm_airbench_240916.csv gemini_1_5_pro_001_safety_default,HELM AirBench Economic Harm,0.92497320471597,[],helm_airbench_240916.csv gemini_1_5_flash_001_safety_default,HELM AirBench Economic Harm,0.857449088960343,[],helm_airbench_240916.csv gpt_3_5_turbo_0613,HELM AirBench Economic Harm,0.557341907824223,[],helm_airbench_240916.csv gpt_4_turbo_2024_04_09,HELM AirBench Economic Harm,0.707395498392283,[],helm_airbench_240916.csv llama3_70b_chat,HELM AirBench Economic Harm,0.6570203644158628,[],helm_airbench_240916.csv gpt_3_5_turbo_1106,HELM AirBench Economic Harm,0.3536977491961415,[],helm_airbench_240916.csv gpt_4o_2024_05_13,HELM AirBench Economic Harm,0.3922829581993569,[],helm_airbench_240916.csv gpt_3_5_turbo_0125,HELM AirBench Economic Harm,0.26045016077170424,[],helm_airbench_240916.csv qwen1_5_72b_chat,HELM AirBench Economic Harm,0.3536977491961415,[],helm_airbench_240916.csv deepseek_llm_67b_chat,HELM AirBench Economic Harm,0.4137191854233655,[],helm_airbench_240916.csv yi_34b_chat,HELM AirBench Economic Harm,0.4823151125401929,[],helm_airbench_240916.csv mixtral_8x22b_instruct_v0_1,HELM AirBench Economic Harm,0.32797427652733113,[],helm_airbench_240916.csv mixtral_8x7b_instruct_v0_1,HELM AirBench Economic Harm,0.22829581993569126,[],helm_airbench_240916.csv command_r,HELM AirBench Economic Harm,0.142550911039657,[],helm_airbench_240916.csv command_r_plus,HELM AirBench Economic Harm,0.060021436227224,[],helm_airbench_240916.csv mistral_7b_instruct_v0_3,HELM AirBench Economic Harm,0.04608788853161838,[],helm_airbench_240916.csv dbrx_instructruct,HELM AirBench Economic Harm,0.0,[],helm_airbench_240916.csv claude_3_haiku_20240307,HELM AirBench Deception,1.0,[],helm_airbench_240916.csv claude_3_sonnet_20240229,HELM AirBench Deception,0.991636798088411,[],helm_airbench_240916.csv llama3_8b_chat,HELM AirBench Deception,0.7968936678614098,[],helm_airbench_240916.csv claude_3_opus_20240229,HELM AirBench Deception,0.955794504181601,[],helm_airbench_240916.csv gemini_1_5_pro_001_safety_default,HELM AirBench Deception,0.8494623655913979,[],helm_airbench_240916.csv gemini_1_5_flash_001_safety_default,HELM AirBench Deception,0.7479091995221028,[],helm_airbench_240916.csv gpt_3_5_turbo_0613,HELM AirBench Deception,0.48267622461170845,[],helm_airbench_240916.csv gpt_4_turbo_2024_04_09,HELM AirBench Deception,0.7431302270011948,[],helm_airbench_240916.csv llama3_70b_chat,HELM AirBench Deception,0.6463560334528076,[],helm_airbench_240916.csv gpt_3_5_turbo_1106,HELM AirBench Deception,0.31899641577060933,[],helm_airbench_240916.csv gpt_4o_2024_05_13,HELM AirBench Deception,0.5041816009557945,[],helm_airbench_240916.csv gpt_3_5_turbo_0125,HELM AirBench Deception,0.21266427718040626,[],helm_airbench_240916.csv qwen1_5_72b_chat,HELM 
AirBench Deception,0.23058542413381133,[],helm_airbench_240916.csv deepseek_llm_67b_chat,HELM AirBench Deception,0.3321385902031063,[],helm_airbench_240916.csv yi_34b_chat,HELM AirBench Deception,0.29271206690561524,[],helm_airbench_240916.csv mixtral_8x22b_instruct_v0_1,HELM AirBench Deception,0.23894862604540035,[],helm_airbench_240916.csv mixtral_8x7b_instruct_v0_1,HELM AirBench Deception,0.12425328554360826,[],helm_airbench_240916.csv command_r,HELM AirBench Deception,0.12425328554360826,[],helm_airbench_240916.csv command_r_plus,HELM AirBench Deception,0.07048984468339314,[],helm_airbench_240916.csv mistral_7b_instruct_v0_3,HELM AirBench Deception,0.044205495818399054,[],helm_airbench_240916.csv dbrx_instructruct,HELM AirBench Deception,0.0,[],helm_airbench_240916.csv claude_3_haiku_20240307,HELM AirBench Manipulation,1.0,[],helm_airbench_240916.csv claude_3_sonnet_20240229,HELM AirBench Manipulation,0.9177777777777778,[],helm_airbench_240916.csv llama3_8b_chat,HELM AirBench Manipulation,0.6666666666666666,[],helm_airbench_240916.csv claude_3_opus_20240229,HELM AirBench Manipulation,0.9477777777777777,[],helm_airbench_240916.csv gemini_1_5_pro_001_safety_default,HELM AirBench Manipulation,0.8144444444444444,[],helm_airbench_240916.csv gemini_1_5_flash_001_safety_default,HELM AirBench Manipulation,0.7333333333333334,[],helm_airbench_240916.csv gpt_3_5_turbo_0613,HELM AirBench Manipulation,0.5777777777777777,[],helm_airbench_240916.csv gpt_4_turbo_2024_04_09,HELM AirBench Manipulation,0.681111111111111,[],helm_airbench_240916.csv llama3_70b_chat,HELM AirBench Manipulation,0.5844444444444443,[],helm_airbench_240916.csv gpt_3_5_turbo_1106,HELM AirBench Manipulation,0.4222222222222223,[],helm_airbench_240916.csv gpt_4o_2024_05_13,HELM AirBench Manipulation,0.4588888888888888,[],helm_airbench_240916.csv gpt_3_5_turbo_0125,HELM AirBench Manipulation,0.2811111111111111,[],helm_airbench_240916.csv qwen1_5_72b_chat,HELM AirBench Manipulation,0.3555555555555555,[],helm_airbench_240916.csv deepseek_llm_67b_chat,HELM AirBench Manipulation,0.4222222222222223,[],helm_airbench_240916.csv yi_34b_chat,HELM AirBench Manipulation,0.46666666666666656,[],helm_airbench_240916.csv mixtral_8x22b_instruct_v0_1,HELM AirBench Manipulation,0.3255555555555555,[],helm_airbench_240916.csv mixtral_8x7b_instruct_v0_1,HELM AirBench Manipulation,0.29555555555555546,[],helm_airbench_240916.csv command_r,HELM AirBench Manipulation,0.15555555555555556,[],helm_airbench_240916.csv command_r_plus,HELM AirBench Manipulation,0.11111111111111105,[],helm_airbench_240916.csv mistral_7b_instruct_v0_3,HELM AirBench Manipulation,0.11111111111111105,[],helm_airbench_240916.csv dbrx_instructruct,HELM AirBench Manipulation,0.0,[],helm_airbench_240916.csv claude_3_haiku_20240307,HELM AirBench Defamation,1.0,[],helm_airbench_240916.csv claude_3_sonnet_20240229,HELM AirBench Defamation,0.9524421593830334,[],helm_airbench_240916.csv llama3_8b_chat,HELM AirBench Defamation,0.5591259640102827,[],helm_airbench_240916.csv claude_3_opus_20240229,HELM AirBench Defamation,0.9524421593830334,[],helm_airbench_240916.csv gemini_1_5_pro_001_safety_default,HELM AirBench Defamation,0.7982005141388174,[],helm_airbench_240916.csv gemini_1_5_flash_001_safety_default,HELM AirBench Defamation,0.6902313624678662,[],helm_airbench_240916.csv gpt_3_5_turbo_0613,HELM AirBench Defamation,0.45244215938303334,[],helm_airbench_240916.csv gpt_4_turbo_2024_04_09,HELM AirBench Defamation,0.8329048843187661,[],helm_airbench_240916.csv llama3_70b_chat,HELM AirBench 
Defamation,0.3097686375321337,[],helm_airbench_240916.csv gpt_3_5_turbo_1106,HELM AirBench Defamation,0.42930591259640094,[],helm_airbench_240916.csv gpt_4o_2024_05_13,HELM AirBench Defamation,0.4999999999999999,[],helm_airbench_240916.csv gpt_3_5_turbo_0125,HELM AirBench Defamation,0.28534704370179953,[],helm_airbench_240916.csv qwen1_5_72b_chat,HELM AirBench Defamation,0.23778920308483287,[],helm_airbench_240916.csv deepseek_llm_67b_chat,HELM AirBench Defamation,0.5244215938303342,[],helm_airbench_240916.csv yi_34b_chat,HELM AirBench Defamation,0.41645244215938293,[],helm_airbench_240916.csv mixtral_8x22b_instruct_v0_1,HELM AirBench Defamation,0.45244215938303334,[],helm_airbench_240916.csv mixtral_8x7b_instruct_v0_1,HELM AirBench Defamation,0.2737789203084833,[],helm_airbench_240916.csv command_r,HELM AirBench Defamation,0.21465295629820047,[],helm_airbench_240916.csv command_r_plus,HELM AirBench Defamation,0.0,[],helm_airbench_240916.csv mistral_7b_instruct_v0_3,HELM AirBench Defamation,0.21465295629820047,[],helm_airbench_240916.csv dbrx_instructruct,HELM AirBench Defamation,0.08354755784061696,[],helm_airbench_240916.csv claude_3_haiku_20240307,HELM AirBench Fundamental Rights,1.0,[],helm_airbench_240916.csv claude_3_sonnet_20240229,HELM AirBench Fundamental Rights,1.0,[],helm_airbench_240916.csv llama3_8b_chat,HELM AirBench Fundamental Rights,0.9714889123548046,[],helm_airbench_240916.csv claude_3_opus_20240229,HELM AirBench Fundamental Rights,1.0,[],helm_airbench_240916.csv gemini_1_5_pro_001_safety_default,HELM AirBench Fundamental Rights,0.9366420274551215,[],helm_airbench_240916.csv gemini_1_5_flash_001_safety_default,HELM AirBench Fundamental Rights,0.9440337909186906,[],helm_airbench_240916.csv gpt_3_5_turbo_0613,HELM AirBench Fundamental Rights,0.7750791974656811,[],helm_airbench_240916.csv gpt_4_turbo_2024_04_09,HELM AirBench Fundamental Rights,0.9155227032734953,[],helm_airbench_240916.csv llama3_70b_chat,HELM AirBench Fundamental Rights,0.8447729672650475,[],helm_airbench_240916.csv gpt_3_5_turbo_1106,HELM AirBench Fundamental Rights,0.648363252375924,[],helm_airbench_240916.csv gpt_4o_2024_05_13,HELM AirBench Fundamental Rights,0.7180570221752904,[],helm_airbench_240916.csv gpt_3_5_turbo_0125,HELM AirBench Fundamental Rights,0.450897571277719,[],helm_airbench_240916.csv qwen1_5_72b_chat,HELM AirBench Fundamental Rights,0.5068637803590285,[],helm_airbench_240916.csv deepseek_llm_67b_chat,HELM AirBench Fundamental Rights,0.6061246040126715,[],helm_airbench_240916.csv yi_34b_chat,HELM AirBench Fundamental Rights,0.7602956705385427,[],helm_airbench_240916.csv mixtral_8x22b_instruct_v0_1,HELM AirBench Fundamental Rights,0.3949313621964098,[],helm_airbench_240916.csv mixtral_8x7b_instruct_v0_1,HELM AirBench Fundamental Rights,0.337909186906019,[],helm_airbench_240916.csv command_r,HELM AirBench Fundamental Rights,0.1837381203801478,[],helm_airbench_240916.csv command_r_plus,HELM AirBench Fundamental Rights,0.15522703273495242,[],helm_airbench_240916.csv mistral_7b_instruct_v0_3,HELM AirBench Fundamental Rights,0.057022175290390664,[],helm_airbench_240916.csv dbrx_instructruct,HELM AirBench Fundamental Rights,0.0,[],helm_airbench_240916.csv claude_3_haiku_20240307,HELM AirBench Discrimination/Bias,0.6772727272727272,[],helm_airbench_240916.csv claude_3_sonnet_20240229,HELM AirBench Discrimination/Bias,0.7909090909090909,[],helm_airbench_240916.csv llama3_8b_chat,HELM AirBench Discrimination/Bias,0.36136363636363633,[],helm_airbench_240916.csv claude_3_opus_20240229,HELM AirBench 
Discrimination/Bias,0.9318181818181818,[],helm_airbench_240916.csv gemini_1_5_pro_001_safety_default,HELM AirBench Discrimination/Bias,1.0,[],helm_airbench_240916.csv gemini_1_5_flash_001_safety_default,HELM AirBench Discrimination/Bias,0.8068181818181818,[],helm_airbench_240916.csv gpt_3_5_turbo_0613,HELM AirBench Discrimination/Bias,0.3727272727272728,[],helm_airbench_240916.csv gpt_4_turbo_2024_04_09,HELM AirBench Discrimination/Bias,0.4977272727272727,[],helm_airbench_240916.csv llama3_70b_chat,HELM AirBench Discrimination/Bias,0.40454545454545465,[],helm_airbench_240916.csv gpt_3_5_turbo_1106,HELM AirBench Discrimination/Bias,0.206818181818182,[],helm_airbench_240916.csv gpt_4o_2024_05_13,HELM AirBench Discrimination/Bias,0.23863636363636387,[],helm_airbench_240916.csv gpt_3_5_turbo_0125,HELM AirBench Discrimination/Bias,0.12727272727272732,[],helm_airbench_240916.csv qwen1_5_72b_chat,HELM AirBench Discrimination/Bias,0.2477272727272729,[],helm_airbench_240916.csv deepseek_llm_67b_chat,HELM AirBench Discrimination/Bias,0.21818181818181837,[],helm_airbench_240916.csv yi_34b_chat,HELM AirBench Discrimination/Bias,0.2749999999999999,[],helm_airbench_240916.csv mixtral_8x22b_instruct_v0_1,HELM AirBench Discrimination/Bias,0.19772727272727286,[],helm_airbench_240916.csv mixtral_8x7b_instruct_v0_1,HELM AirBench Discrimination/Bias,0.20000000000000018,[],helm_airbench_240916.csv command_r,HELM AirBench Discrimination/Bias,0.004545454545454519,[],helm_airbench_240916.csv command_r_plus,HELM AirBench Discrimination/Bias,0.0,[],helm_airbench_240916.csv mistral_7b_instruct_v0_3,HELM AirBench Discrimination/Bias,0.12727272727272732,[],helm_airbench_240916.csv dbrx_instructruct,HELM AirBench Discrimination/Bias,0.011363636363636354,[],helm_airbench_240916.csv claude_3_haiku_20240307,HELM AirBench Privacy,1.0,[],helm_airbench_240916.csv claude_3_sonnet_20240229,HELM AirBench Privacy,0.9958960328317373,[],helm_airbench_240916.csv llama3_8b_chat,HELM AirBench Privacy,0.8098495212038304,[],helm_airbench_240916.csv claude_3_opus_20240229,HELM AirBench Privacy,0.9863201094391245,[],helm_airbench_240916.csv gemini_1_5_pro_001_safety_default,HELM AirBench Privacy,0.9493844049247606,[],helm_airbench_240916.csv gemini_1_5_flash_001_safety_default,HELM AirBench Privacy,0.9261285909712722,[],helm_airbench_240916.csv gpt_3_5_turbo_0613,HELM AirBench Privacy,0.6853625170998632,[],helm_airbench_240916.csv gpt_4_turbo_2024_04_09,HELM AirBench Privacy,0.8344733242134063,[],helm_airbench_240916.csv llama3_70b_chat,HELM AirBench Privacy,0.7428180574555403,[],helm_airbench_240916.csv gpt_3_5_turbo_1106,HELM AirBench Privacy,0.5841313269493844,[],helm_airbench_240916.csv gpt_4o_2024_05_13,HELM AirBench Privacy,0.5020519835841313,[],helm_airbench_240916.csv gpt_3_5_turbo_0125,HELM AirBench Privacy,0.4733242134062927,[],helm_airbench_240916.csv qwen1_5_72b_chat,HELM AirBench Privacy,0.37072503419972624,[],helm_airbench_240916.csv deepseek_llm_67b_chat,HELM AirBench Privacy,0.4131326949384404,[],helm_airbench_240916.csv yi_34b_chat,HELM AirBench Privacy,0.5212038303693571,[],helm_airbench_240916.csv mixtral_8x22b_instruct_v0_1,HELM AirBench Privacy,0.3064295485636115,[],helm_airbench_240916.csv mixtral_8x7b_instruct_v0_1,HELM AirBench Privacy,0.32558139534883723,[],helm_airbench_240916.csv command_r,HELM AirBench Privacy,0.16142270861833108,[],helm_airbench_240916.csv command_r_plus,HELM AirBench Privacy,0.14774281805745548,[],helm_airbench_240916.csv mistral_7b_instruct_v0_3,HELM AirBench 
Privacy,0.13679890560875507,[],helm_airbench_240916.csv dbrx_instructruct,HELM AirBench Privacy,0.0,[],helm_airbench_240916.csv claude_3_haiku_20240307,HELM AirBench Criminal Activities,0.9917269906928645,[],helm_airbench_240916.csv claude_3_sonnet_20240229,HELM AirBench Criminal Activities,1.0,[],helm_airbench_240916.csv llama3_8b_chat,HELM AirBench Criminal Activities,1.0,[],helm_airbench_240916.csv claude_3_opus_20240229,HELM AirBench Criminal Activities,0.9824198552223371,[],helm_airbench_240916.csv gemini_1_5_pro_001_safety_default,HELM AirBench Criminal Activities,0.9565667011375387,[],helm_airbench_240916.csv gemini_1_5_flash_001_safety_default,HELM AirBench Criminal Activities,0.9658738366080661,[],helm_airbench_240916.csv gpt_3_5_turbo_0613,HELM AirBench Criminal Activities,0.8883143743536711,[],helm_airbench_240916.csv gpt_4_turbo_2024_04_09,HELM AirBench Criminal Activities,0.9400206825232679,[],helm_airbench_240916.csv llama3_70b_chat,HELM AirBench Criminal Activities,0.9741468459152016,[],helm_airbench_240916.csv gpt_3_5_turbo_1106,HELM AirBench Criminal Activities,0.7238883143743536,[],helm_airbench_240916.csv gpt_4o_2024_05_13,HELM AirBench Criminal Activities,0.7590486039296794,[],helm_airbench_240916.csv gpt_3_5_turbo_0125,HELM AirBench Criminal Activities,0.5346432264736298,[],helm_airbench_240916.csv qwen1_5_72b_chat,HELM AirBench Criminal Activities,0.6380558428128231,[],helm_airbench_240916.csv deepseek_llm_67b_chat,HELM AirBench Criminal Activities,0.6897621509824199,[],helm_airbench_240916.csv yi_34b_chat,HELM AirBench Criminal Activities,0.7156153050672182,[],helm_airbench_240916.csv mixtral_8x22b_instruct_v0_1,HELM AirBench Criminal Activities,0.33195449844881075,[],helm_airbench_240916.csv mixtral_8x7b_instruct_v0_1,HELM AirBench Criminal Activities,0.23267838676318509,[],helm_airbench_240916.csv command_r,HELM AirBench Criminal Activities,0.25853154084798347,[],helm_airbench_240916.csv command_r_plus,HELM AirBench Criminal Activities,0.15511892450879006,[],helm_airbench_240916.csv mistral_7b_instruct_v0_3,HELM AirBench Criminal Activities,0.02585315408479838,[],helm_airbench_240916.csv dbrx_instructruct,HELM AirBench Criminal Activities,0.0,[],helm_airbench_240916.csv claude_3_haiku_20240307,HELM AirBench AIR Score,0.9655172413793103,[],helm_airbench_240916.csv claude_3_sonnet_20240229,HELM AirBench AIR Score,1.0,[],helm_airbench_240916.csv llama3_8b_chat,HELM AirBench AIR Score,0.6568144499178982,[],helm_airbench_240916.csv claude_3_opus_20240229,HELM AirBench AIR Score,1.0,[],helm_airbench_240916.csv gemini_1_5_pro_001_safety_default,HELM AirBench AIR Score,0.9802955665024631,[],helm_airbench_240916.csv gemini_1_5_flash_001_safety_default,HELM AirBench AIR Score,0.9080459770114943,[],helm_airbench_240916.csv gpt_3_5_turbo_0613,HELM AirBench AIR Score,0.6223316912972086,[],helm_airbench_240916.csv gpt_4_turbo_2024_04_09,HELM AirBench AIR Score,0.7619047619047619,[],helm_airbench_240916.csv llama3_70b_chat,HELM AirBench AIR Score,0.6568144499178982,[],helm_airbench_240916.csv gpt_3_5_turbo_1106,HELM AirBench AIR Score,0.451559934318555,[],helm_airbench_240916.csv gpt_4o_2024_05_13,HELM AirBench AIR Score,0.45977011494252873,[],helm_airbench_240916.csv gpt_3_5_turbo_0125,HELM AirBench AIR Score,0.31691297208538594,[],helm_airbench_240916.csv qwen1_5_72b_chat,HELM AirBench AIR Score,0.37438423645320185,[],helm_airbench_240916.csv deepseek_llm_67b_chat,HELM AirBench AIR Score,0.41543513957307054,[],helm_airbench_240916.csv yi_34b_chat,HELM AirBench AIR 
Score,0.458128078817734,[],helm_airbench_240916.csv mixtral_8x22b_instruct_v0_1,HELM AirBench AIR Score,0.28735632183908044,[],helm_airbench_240916.csv mixtral_8x7b_instruct_v0_1,HELM AirBench AIR Score,0.23152709359605905,[],helm_airbench_240916.csv command_r,HELM AirBench AIR Score,0.1050903119868638,[],helm_airbench_240916.csv command_r_plus,HELM AirBench AIR Score,0.064039408866995,[],helm_airbench_240916.csv mistral_7b_instruct_v0_3,HELM AirBench AIR Score,0.11165845648604278,[],helm_airbench_240916.csv dbrx_instructruct,HELM AirBench AIR Score,0.0,[],helm_airbench_240916.csv claude_3_5_sonnet,OpenCompass,67.9,[],opencompass_240829.csv gpt_4o_20240513,OpenCompass,67.7,[],opencompass_240829.csv mistral_large,OpenCompass,63.2,[],opencompass_240829.csv mistral_large_instruct_2407,OpenCompass,62.5,[],opencompass_240829.csv deepseek_v2_chat0618,OpenCompass,61.7,[],opencompass_240829.csv gpt_4o_mini_20240718,OpenCompass,60.4,[],opencompass_240829.csv qwen_max_0428,OpenCompass,57.8,[],opencompass_240829.csv yi_large,OpenCompass,56.3,[],opencompass_240829.csv qwen2_72b_instruct,OpenCompass,55.4,[],opencompass_240829.csv glm_4,OpenCompass,55.2,[],opencompass_240829.csv llama3_1_70b_instruct,OpenCompass,53.9,[],opencompass_240829.csv gemma_2_27b_it,OpenCompass,53.5,[],opencompass_240829.csv qwen1_5_110b_chat,OpenCompass,51.9,[],opencompass_240829.csv 240615,OpenCompass,51.0,[],opencompass_240829.csv baichuan4,OpenCompass,50.4,[],opencompass_240829.csv step_1_8k,OpenCompass,49.9,[],opencompass_240829.csv abab6_5,OpenCompass,49.9,[],opencompass_240829.csv ernie_4_0_8k_preview_0518,OpenCompass,48.8,[],opencompass_240829.csv moonshot_v1_8k,OpenCompass,48.6,[],opencompass_240829.csv glm_4_9b_chat,OpenCompass,47.9,[],opencompass_240829.csv yi_1_5_34b_chat,OpenCompass,46.9,[],opencompass_240829.csv hunyuan_standard_256k,OpenCompass,46.9,[],opencompass_240829.csv mixtral_8x22b_instruct_v0_1,OpenCompass,46.3,[],opencompass_240829.csv gemma_2_9b_it,OpenCompass,45.5,[],opencompass_240829.csv qwen2_7b_instruct,OpenCompass,45.1,[],opencompass_240829.csv internlm2_5_7b_chat,OpenCompass,44.5,[],opencompass_240829.csv yi_1_5_9b_chat,OpenCompass,42.6,[],opencompass_240829.csv nanbeige2_16b_chat,OpenCompass,42.3,[],opencompass_240829.csv llama3_1_8b_instruct,OpenCompass,42.1,[],opencompass_240829.csv dbrx_instructruct,OpenCompass,37.6,[],opencompass_240829.csv yi_1_5_6b_chat,OpenCompass,36.5,[],opencompass_240829.csv internlm2_chat_20b,OpenCompass,36.0,[],opencompass_240829.csv mixtral_8x7b_instruct_v0_1,OpenCompass,34.5,[],opencompass_240829.csv mistral_7b_instruct_v0_3,OpenCompass,30.7,[],opencompass_240829.csv deepseek_v2_lite_chat,OpenCompass,30.0,[],opencompass_240829.csv claude_3_5_sonnet,OpenCompass Language,50.9,[],opencompass_240829.csv gpt_4o_20240513,OpenCompass Language,55.5,[],opencompass_240829.csv mistral_large,OpenCompass Language,50.9,[],opencompass_240829.csv mistral_large_instruct_2407,OpenCompass Language,50.3,[],opencompass_240829.csv deepseek_v2_chat0618,OpenCompass Language,46.3,[],opencompass_240829.csv gpt_4o_mini_20240718,OpenCompass Language,50.1,[],opencompass_240829.csv qwen_max_0428,OpenCompass Language,56.5,[],opencompass_240829.csv yi_large,OpenCompass Language,48.7,[],opencompass_240829.csv qwen2_72b_instruct,OpenCompass Language,45.8,[],opencompass_240829.csv glm_4,OpenCompass Language,45.8,[],opencompass_240829.csv llama3_1_70b_instruct,OpenCompass Language,38.4,[],opencompass_240829.csv gemma_2_27b_it,OpenCompass Language,45.2,[],opencompass_240829.csv 
qwen1_5_110b_chat,OpenCompass Language,53.4,[],opencompass_240829.csv 240615,OpenCompass Language,31.1,[],opencompass_240829.csv baichuan4,OpenCompass Language,37.2,[],opencompass_240829.csv step_1_8k,OpenCompass Language,40.6,[],opencompass_240829.csv abab6_5,OpenCompass Language,44.9,[],opencompass_240829.csv ernie_4_0_8k_preview_0518,OpenCompass Language,36.7,[],opencompass_240829.csv moonshot_v1_8k,OpenCompass Language,46.3,[],opencompass_240829.csv glm_4_9b_chat,OpenCompass Language,44.3,[],opencompass_240829.csv yi_1_5_34b_chat,OpenCompass Language,50.5,[],opencompass_240829.csv hunyuan_standard_256k,OpenCompass Language,30.6,[],opencompass_240829.csv mixtral_8x22b_instruct_v0_1,OpenCompass Language,33.0,[],opencompass_240829.csv gemma_2_9b_it,OpenCompass Language,40.8,[],opencompass_240829.csv qwen2_7b_instruct,OpenCompass Language,43.5,[],opencompass_240829.csv internlm2_5_7b_chat,OpenCompass Language,44.6,[],opencompass_240829.csv yi_1_5_9b_chat,OpenCompass Language,46.1,[],opencompass_240829.csv nanbeige2_16b_chat,OpenCompass Language,50.5,[],opencompass_240829.csv llama3_1_8b_instruct,OpenCompass Language,33.7,[],opencompass_240829.csv dbrx_instructruct,OpenCompass Language,25.6,[],opencompass_240829.csv yi_1_5_6b_chat,OpenCompass Language,43.6,[],opencompass_240829.csv internlm2_chat_20b,OpenCompass Language,36.7,[],opencompass_240829.csv mixtral_8x7b_instruct_v0_1,OpenCompass Language,36.6,[],opencompass_240829.csv mistral_7b_instruct_v0_3,OpenCompass Language,30.3,[],opencompass_240829.csv deepseek_v2_lite_chat,OpenCompass Language,31.4,[],opencompass_240829.csv claude_3_5_sonnet,OpenCompass Knowledge,85.0,[],opencompass_240829.csv gpt_4o_20240513,OpenCompass Knowledge,85.2,[],opencompass_240829.csv mistral_large,OpenCompass Knowledge,83.4,[],opencompass_240829.csv mistral_large_instruct_2407,OpenCompass Knowledge,83.3,[],opencompass_240829.csv deepseek_v2_chat0618,OpenCompass Knowledge,78.8,[],opencompass_240829.csv gpt_4o_mini_20240718,OpenCompass Knowledge,78.7,[],opencompass_240829.csv qwen_max_0428,OpenCompass Knowledge,79.0,[],opencompass_240829.csv yi_large,OpenCompass Knowledge,75.3,[],opencompass_240829.csv qwen2_72b_instruct,OpenCompass Knowledge,84.0,[],opencompass_240829.csv glm_4,OpenCompass Knowledge,77.7,[],opencompass_240829.csv llama3_1_70b_instruct,OpenCompass Knowledge,81.4,[],opencompass_240829.csv gemma_2_27b_it,OpenCompass Knowledge,58.5,[],opencompass_240829.csv qwen1_5_110b_chat,OpenCompass Knowledge,79.3,[],opencompass_240829.csv 240615,OpenCompass Knowledge,78.3,[],opencompass_240829.csv baichuan4,OpenCompass Knowledge,74.2,[],opencompass_240829.csv step_1_8k,OpenCompass Knowledge,72.0,[],opencompass_240829.csv abab6_5,OpenCompass Knowledge,69.8,[],opencompass_240829.csv ernie_4_0_8k_preview_0518,OpenCompass Knowledge,76.4,[],opencompass_240829.csv moonshot_v1_8k,OpenCompass Knowledge,61.0,[],opencompass_240829.csv glm_4_9b_chat,OpenCompass Knowledge,68.9,[],opencompass_240829.csv yi_1_5_34b_chat,OpenCompass Knowledge,65.0,[],opencompass_240829.csv hunyuan_standard_256k,OpenCompass Knowledge,69.7,[],opencompass_240829.csv mixtral_8x22b_instruct_v0_1,OpenCompass Knowledge,72.2,[],opencompass_240829.csv gemma_2_9b_it,OpenCompass Knowledge,53.7,[],opencompass_240829.csv qwen2_7b_instruct,OpenCompass Knowledge,64.1,[],opencompass_240829.csv internlm2_5_7b_chat,OpenCompass Knowledge,64.8,[],opencompass_240829.csv yi_1_5_9b_chat,OpenCompass Knowledge,56.0,[],opencompass_240829.csv nanbeige2_16b_chat,OpenCompass Knowledge,53.8,[],opencompass_240829.csv 
llama3_1_8b_instruct,OpenCompass Knowledge,63.2,[],opencompass_240829.csv dbrx_instructruct,OpenCompass Knowledge,66.3,[],opencompass_240829.csv yi_1_5_6b_chat,OpenCompass Knowledge,41.3,[],opencompass_240829.csv internlm2_chat_20b,OpenCompass Knowledge,60.0,[],opencompass_240829.csv mixtral_8x7b_instruct_v0_1,OpenCompass Knowledge,50.4,[],opencompass_240829.csv mistral_7b_instruct_v0_3,OpenCompass Knowledge,47.8,[],opencompass_240829.csv deepseek_v2_lite_chat,OpenCompass Knowledge,41.3,[],opencompass_240829.csv claude_3_5_sonnet,OpenCompass Reasoning,57.0,[],opencompass_240829.csv gpt_4o_20240513,OpenCompass Reasoning,55.8,[],opencompass_240829.csv mistral_large,OpenCompass Reasoning,50.1,[],opencompass_240829.csv mistral_large_instruct_2407,OpenCompass Reasoning,50.0,[],opencompass_240829.csv deepseek_v2_chat0618,OpenCompass Reasoning,47.4,[],opencompass_240829.csv gpt_4o_mini_20240718,OpenCompass Reasoning,45.4,[],opencompass_240829.csv qwen_max_0428,OpenCompass Reasoning,47.9,[],opencompass_240829.csv yi_large,OpenCompass Reasoning,47.6,[],opencompass_240829.csv qwen2_72b_instruct,OpenCompass Reasoning,44.7,[],opencompass_240829.csv glm_4,OpenCompass Reasoning,46.1,[],opencompass_240829.csv llama3_1_70b_instruct,OpenCompass Reasoning,31.6,[],opencompass_240829.csv gemma_2_27b_it,OpenCompass Reasoning,45.4,[],opencompass_240829.csv qwen1_5_110b_chat,OpenCompass Reasoning,45.8,[],opencompass_240829.csv 240615,OpenCompass Reasoning,27.8,[],opencompass_240829.csv baichuan4,OpenCompass Reasoning,38.5,[],opencompass_240829.csv step_1_8k,OpenCompass Reasoning,35.8,[],opencompass_240829.csv abab6_5,OpenCompass Reasoning,47.0,[],opencompass_240829.csv ernie_4_0_8k_preview_0518,OpenCompass Reasoning,41.3,[],opencompass_240829.csv moonshot_v1_8k,OpenCompass Reasoning,46.0,[],opencompass_240829.csv glm_4_9b_chat,OpenCompass Reasoning,40.0,[],opencompass_240829.csv yi_1_5_34b_chat,OpenCompass Reasoning,42.7,[],opencompass_240829.csv hunyuan_standard_256k,OpenCompass Reasoning,36.8,[],opencompass_240829.csv mixtral_8x22b_instruct_v0_1,OpenCompass Reasoning,28.6,[],opencompass_240829.csv gemma_2_9b_it,OpenCompass Reasoning,41.9,[],opencompass_240829.csv qwen2_7b_instruct,OpenCompass Reasoning,36.2,[],opencompass_240829.csv internlm2_5_7b_chat,OpenCompass Reasoning,39.3,[],opencompass_240829.csv yi_1_5_9b_chat,OpenCompass Reasoning,39.8,[],opencompass_240829.csv nanbeige2_16b_chat,OpenCompass Reasoning,40.5,[],opencompass_240829.csv llama3_1_8b_instruct,OpenCompass Reasoning,24.9,[],opencompass_240829.csv dbrx_instructruct,OpenCompass Reasoning,20.8,[],opencompass_240829.csv yi_1_5_6b_chat,OpenCompass Reasoning,36.5,[],opencompass_240829.csv internlm2_chat_20b,OpenCompass Reasoning,18.9,[],opencompass_240829.csv mixtral_8x7b_instruct_v0_1,OpenCompass Reasoning,28.1,[],opencompass_240829.csv mistral_7b_instruct_v0_3,OpenCompass Reasoning,20.7,[],opencompass_240829.csv deepseek_v2_lite_chat,OpenCompass Reasoning,28.1,[],opencompass_240829.csv claude_3_5_sonnet,OpenCompass Math,71.1,[],opencompass_240829.csv gpt_4o_20240513,OpenCompass Math,71.1,[],opencompass_240829.csv mistral_large,OpenCompass Math,66.4,[],opencompass_240829.csv mistral_large_instruct_2407,OpenCompass Math,72.8,[],opencompass_240829.csv deepseek_v2_chat0618,OpenCompass Math,68.2,[],opencompass_240829.csv gpt_4o_mini_20240718,OpenCompass Math,58.2,[],opencompass_240829.csv qwen_max_0428,OpenCompass Math,55.1,[],opencompass_240829.csv yi_large,OpenCompass Math,54.8,[],opencompass_240829.csv qwen2_72b_instruct,OpenCompass 
Math,57.7,[],opencompass_240829.csv glm_4,OpenCompass Math,53.2,[],opencompass_240829.csv llama3_1_70b_instruct,OpenCompass Math,58.0,[],opencompass_240829.csv gemma_2_27b_it,OpenCompass Math,50.1,[],opencompass_240829.csv qwen1_5_110b_chat,OpenCompass Math,39.6,[],opencompass_240829.csv 240615,OpenCompass Math,67.5,[],opencompass_240829.csv baichuan4,OpenCompass Math,51.8,[],opencompass_240829.csv step_1_8k,OpenCompass Math,51.4,[],opencompass_240829.csv abab6_5,OpenCompass Math,47.2,[],opencompass_240829.csv ernie_4_0_8k_preview_0518,OpenCompass Math,44.7,[],opencompass_240829.csv moonshot_v1_8k,OpenCompass Math,46.6,[],opencompass_240829.csv glm_4_9b_chat,OpenCompass Math,38.7,[],opencompass_240829.csv yi_1_5_34b_chat,OpenCompass Math,38.1,[],opencompass_240829.csv hunyuan_standard_256k,OpenCompass Math,53.9,[],opencompass_240829.csv mixtral_8x22b_instruct_v0_1,OpenCompass Math,47.2,[],opencompass_240829.csv gemma_2_9b_it,OpenCompass Math,40.7,[],opencompass_240829.csv qwen2_7b_instruct,OpenCompass Math,37.7,[],opencompass_240829.csv internlm2_5_7b_chat,OpenCompass Math,40.8,[],opencompass_240829.csv yi_1_5_9b_chat,OpenCompass Math,38.2,[],opencompass_240829.csv nanbeige2_16b_chat,OpenCompass Math,25.8,[],opencompass_240829.csv llama3_1_8b_instruct,OpenCompass Math,38.0,[],opencompass_240829.csv dbrx_instructruct,OpenCompass Math,35.3,[],opencompass_240829.csv yi_1_5_6b_chat,OpenCompass Math,28.4,[],opencompass_240829.csv internlm2_chat_20b,OpenCompass Math,27.4,[],opencompass_240829.csv mixtral_8x7b_instruct_v0_1,OpenCompass Math,24.8,[],opencompass_240829.csv mistral_7b_instruct_v0_3,OpenCompass Math,18.1,[],opencompass_240829.csv deepseek_v2_lite_chat,OpenCompass Math,22.8,[],opencompass_240829.csv claude_3_5_sonnet,OpenCompass Code,69.6,[],opencompass_240829.csv gpt_4o_20240513,OpenCompass Code,69.1,[],opencompass_240829.csv mistral_large,OpenCompass Code,65.1,[],opencompass_240829.csv mistral_large_instruct_2407,OpenCompass Code,55.6,[],opencompass_240829.csv deepseek_v2_chat0618,OpenCompass Code,66.2,[],opencompass_240829.csv gpt_4o_mini_20240718,OpenCompass Code,63.3,[],opencompass_240829.csv qwen_max_0428,OpenCompass Code,52.4,[],opencompass_240829.csv yi_large,OpenCompass Code,54.3,[],opencompass_240829.csv qwen2_72b_instruct,OpenCompass Code,49.5,[],opencompass_240829.csv glm_4,OpenCompass Code,56.3,[],opencompass_240829.csv llama3_1_70b_instruct,OpenCompass Code,53.7,[],opencompass_240829.csv gemma_2_27b_it,OpenCompass Code,54.6,[],opencompass_240829.csv qwen1_5_110b_chat,OpenCompass Code,49.5,[],opencompass_240829.csv 240615,OpenCompass Code,50.2,[],opencompass_240829.csv baichuan4,OpenCompass Code,44.1,[],opencompass_240829.csv step_1_8k,OpenCompass Code,44.2,[],opencompass_240829.csv abab6_5,OpenCompass Code,50.5,[],opencompass_240829.csv ernie_4_0_8k_preview_0518,OpenCompass Code,50.6,[],opencompass_240829.csv moonshot_v1_8k,OpenCompass Code,47.0,[],opencompass_240829.csv glm_4_9b_chat,OpenCompass Code,45.1,[],opencompass_240829.csv yi_1_5_34b_chat,OpenCompass Code,44.8,[],opencompass_240829.csv hunyuan_standard_256k,OpenCompass Code,46.1,[],opencompass_240829.csv mixtral_8x22b_instruct_v0_1,OpenCompass Code,44.7,[],opencompass_240829.csv gemma_2_9b_it,OpenCompass Code,42.2,[],opencompass_240829.csv qwen2_7b_instruct,OpenCompass Code,44.0,[],opencompass_240829.csv internlm2_5_7b_chat,OpenCompass Code,34.8,[],opencompass_240829.csv yi_1_5_9b_chat,OpenCompass Code,41.8,[],opencompass_240829.csv nanbeige2_16b_chat,OpenCompass Code,33.3,[],opencompass_240829.csv 
llama3_1_8b_instruct,OpenCompass Code,39.3,[],opencompass_240829.csv dbrx_instructruct,OpenCompass Code,32.2,[],opencompass_240829.csv yi_1_5_6b_chat,OpenCompass Code,34.4,[],opencompass_240829.csv internlm2_chat_20b,OpenCompass Code,36.2,[],opencompass_240829.csv mixtral_8x7b_instruct_v0_1,OpenCompass Code,26.7,[],opencompass_240829.csv mistral_7b_instruct_v0_3,OpenCompass Code,23.6,[],opencompass_240829.csv deepseek_v2_lite_chat,OpenCompass Code,16.3,[],opencompass_240829.csv claude_3_5_sonnet,OpenCompass Instruction,66.2,[],opencompass_240829.csv gpt_4o_20240513,OpenCompass Instruction,60.3,[],opencompass_240829.csv mistral_large,OpenCompass Instruction,51.1,[],opencompass_240829.csv mistral_large_instruct_2407,OpenCompass Instruction,50.3,[],opencompass_240829.csv deepseek_v2_chat0618,OpenCompass Instruction,44.1,[],opencompass_240829.csv gpt_4o_mini_20240718,OpenCompass Instruction,56.0,[],opencompass_240829.csv qwen_max_0428,OpenCompass Instruction,47.4,[],opencompass_240829.csv yi_large,OpenCompass Instruction,40.0,[],opencompass_240829.csv qwen2_72b_instruct,OpenCompass Instruction,34.0,[],opencompass_240829.csv glm_4,OpenCompass Instruction,36.9,[],opencompass_240829.csv llama3_1_70b_instruct,OpenCompass Instruction,46.2,[],opencompass_240829.csv gemma_2_27b_it,OpenCompass Instruction,45.2,[],opencompass_240829.csv qwen1_5_110b_chat,OpenCompass Instruction,36.8,[],opencompass_240829.csv 240615,OpenCompass Instruction,30.6,[],opencompass_240829.csv baichuan4,OpenCompass Instruction,39.4,[],opencompass_240829.csv step_1_8k,OpenCompass Instruction,38.9,[],opencompass_240829.csv abab6_5,OpenCompass Instruction,32.0,[],opencompass_240829.csv ernie_4_0_8k_preview_0518,OpenCompass Instruction,28.5,[],opencompass_240829.csv moonshot_v1_8k,OpenCompass Instruction,35.9,[],opencompass_240829.csv glm_4_9b_chat,OpenCompass Instruction,36.0,[],opencompass_240829.csv yi_1_5_34b_chat,OpenCompass Instruction,38.8,[],opencompass_240829.csv hunyuan_standard_256k,OpenCompass Instruction,29.2,[],opencompass_240829.csv mixtral_8x22b_instruct_v0_1,OpenCompass Instruction,31.2,[],opencompass_240829.csv gemma_2_9b_it,OpenCompass Instruction,40.9,[],opencompass_240829.csv qwen2_7b_instruct,OpenCompass Instruction,27.5,[],opencompass_240829.csv internlm2_5_7b_chat,OpenCompass Instruction,26.5,[],opencompass_240829.csv yi_1_5_9b_chat,OpenCompass Instruction,29.8,[],opencompass_240829.csv nanbeige2_16b_chat,OpenCompass Instruction,33.2,[],opencompass_240829.csv llama3_1_8b_instruct,OpenCompass Instruction,39.1,[],opencompass_240829.csv dbrx_instructruct,OpenCompass Instruction,32.5,[],opencompass_240829.csv yi_1_5_6b_chat,OpenCompass Instruction,26.3,[],opencompass_240829.csv internlm2_chat_20b,OpenCompass Instruction,18.5,[],opencompass_240829.csv mixtral_8x7b_instruct_v0_1,OpenCompass Instruction,28.2,[],opencompass_240829.csv mistral_7b_instruct_v0_3,OpenCompass Instruction,28.5,[],opencompass_240829.csv deepseek_v2_lite_chat,OpenCompass Instruction,20.6,[],opencompass_240829.csv claude_3_5_sonnet,OpenCompass Agent,81.7,[],opencompass_240829.csv gpt_4o_20240513,OpenCompass Agent,84.4,[],opencompass_240829.csv mistral_large,OpenCompass Agent,83.5,[],opencompass_240829.csv mistral_large_instruct_2407,OpenCompass Agent,84.5,[],opencompass_240829.csv deepseek_v2_chat0618,OpenCompass Agent,83.7,[],opencompass_240829.csv gpt_4o_mini_20240718,OpenCompass Agent,85.7,[],opencompass_240829.csv qwen_max_0428,OpenCompass Agent,83.8,[],opencompass_240829.csv yi_large,OpenCompass Agent,86.1,[],opencompass_240829.csv 
qwen2_72b_instruct,OpenCompass Agent,85.9,[],opencompass_240829.csv glm_4,OpenCompass Agent,80.4,[],opencompass_240829.csv llama3_1_70b_instruct,OpenCompass Agent,86.5,[],opencompass_240829.csv gemma_2_27b_it,OpenCompass Agent,85.5,[],opencompass_240829.csv qwen1_5_110b_chat,OpenCompass Agent,79.6,[],opencompass_240829.csv 240615,OpenCompass Agent,79.3,[],opencompass_240829.csv baichuan4,OpenCompass Agent,84.5,[],opencompass_240829.csv step_1_8k,OpenCompass Agent,84.2,[],opencompass_240829.csv abab6_5,OpenCompass Agent,62.5,[],opencompass_240829.csv ernie_4_0_8k_preview_0518,OpenCompass Agent,72.7,[],opencompass_240829.csv moonshot_v1_8k,OpenCompass Agent,63.5,[],opencompass_240829.csv glm_4_9b_chat,OpenCompass Agent,81.9,[],opencompass_240829.csv yi_1_5_34b_chat,OpenCompass Agent,63.5,[],opencompass_240829.csv hunyuan_standard_256k,OpenCompass Agent,65.6,[],opencompass_240829.csv mixtral_8x22b_instruct_v0_1,OpenCompass Agent,86.0,[],opencompass_240829.csv gemma_2_9b_it,OpenCompass Agent,69.9,[],opencompass_240829.csv qwen2_7b_instruct,OpenCompass Agent,79.7,[],opencompass_240829.csv internlm2_5_7b_chat,OpenCompass Agent,79.0,[],opencompass_240829.csv yi_1_5_9b_chat,OpenCompass Agent,54.3,[],opencompass_240829.csv nanbeige2_16b_chat,OpenCompass Agent,85.8,[],opencompass_240829.csv llama3_1_8b_instruct,OpenCompass Agent,80.1,[],opencompass_240829.csv dbrx_instructruct,OpenCompass Agent,75.3,[],opencompass_240829.csv yi_1_5_6b_chat,OpenCompass Agent,55.4,[],opencompass_240829.csv internlm2_chat_20b,OpenCompass Agent,80.3,[],opencompass_240829.csv mixtral_8x7b_instruct_v0_1,OpenCompass Agent,71.0,[],opencompass_240829.csv mistral_7b_instruct_v0_3,OpenCompass Agent,75.4,[],opencompass_240829.csv deepseek_v2_lite_chat,OpenCompass Agent,72.4,[],opencompass_240829.csv 240615,OpenCompass Arena,1011.0,,opencompass_arena_240829.csv abab6_5_chat,OpenCompass Arena,1027.0,,opencompass_arena_240829.csv baichuan4,OpenCompass Arena,1007.0,,opencompass_arena_240829.csv claude_3_5_sonnet_20240620,OpenCompass Arena,1055.0,,opencompass_arena_240829.csv command_r_plus,OpenCompass Arena,977.0,,opencompass_arena_240829.csv dbrx_instructruct,OpenCompass Arena,879.0,,opencompass_arena_240829.csv deepseek_llm_67b_chat,OpenCompass Arena,937.0,,opencompass_arena_240829.csv deepseek_moe_16b_chat,OpenCompass Arena,895.0,,opencompass_arena_240829.csv deepseek_v2,OpenCompass Arena,1027.0,,opencompass_arena_240829.csv deepseek_v2_chat,OpenCompass Arena,1048.0,,opencompass_arena_240829.csv ernie_4_0_8k_preview_0518,OpenCompass Arena,1051.0,,opencompass_arena_240829.csv glm_4_0520,OpenCompass Arena,1033.0,,opencompass_arena_240829.csv gpt_40_20240513,OpenCompass Arena,1090.0,,opencompass_arena_240829.csv gpt_4_turbo_20240409,OpenCompass Arena,1044.0,,opencompass_arena_240829.csv hunyuan_pro,OpenCompass Arena,1069.0,,opencompass_arena_240829.csv internlm2_5_7b_chat,OpenCompass Arena,958.0,,opencompass_arena_240829.csv internlm2_chat_20b,OpenCompass Arena,992.0,,opencompass_arena_240829.csv internlm2_chat_7b,OpenCompass Arena,968.0,,opencompass_arena_240829.csv llama3_70b_instruct,OpenCompass Arena,926.0,,opencompass_arena_240829.csv llama3_8b_instruct,OpenCompass Arena,920.0,,opencompass_arena_240829.csv mixtral_8x22b_instruct_v0_1,OpenCompass Arena,933.0,,opencompass_arena_240829.csv moonshot_v1_32k,OpenCompass Arena,994.0,,opencompass_arena_240829.csv qwen1_5_14b_chat,OpenCompass Arena,968.0,,opencompass_arena_240829.csv qwen1_5_32b_chat,OpenCompass Arena,1007.0,,opencompass_arena_240829.csv qwen1_5_72b_chat,OpenCompass 
Arena,1007.0,,opencompass_arena_240829.csv qwen1_5_7b_chat,OpenCompass Arena,970.0,,opencompass_arena_240829.csv qwen2_72b_instruct,OpenCompass Arena,1085.0,,opencompass_arena_240829.csv qwen_max_0428,OpenCompass Arena,1071.0,,opencompass_arena_240829.csv yi_1_5_34b_chat,OpenCompass Arena,1016.0,,opencompass_arena_240829.csv yi_34b_chat,OpenCompass Arena,983.0,,opencompass_arena_240829.csv yi_large,OpenCompass Arena,1051.0,,opencompass_arena_240829.csv claude_3_5_sonnet_20240620,LiveBench 240725,59.87,[],livebench_240829.csv gpt_4o_2024_08_06,LiveBench 240725,56.71,[],livebench_240829.csv chatgpt_4o_latest,LiveBench 240725,54.71,[],livebench_240829.csv gpt_4o_2024_05_13,LiveBench 240725,54.63,[],livebench_240829.csv llama3_1_405b_instruct_turbo,LiveBench 240725,54.25,[],livebench_240829.csv gemini_1_5_pro_exp_0827,LiveBench 240725,53.78,[],livebench_240829.csv gpt_4_turbo_2024_04_09,LiveBench 240725,52.88,[],livebench_240829.csv gemini_1_5_pro_exp_0801,LiveBench 240725,52.22,[],livebench_240829.csv claude_3_opus_20240229,LiveBench 240725,50.56,[],livebench_240829.csv gpt_4_0125_preview,LiveBench 240725,48.9,[],livebench_240829.csv dracarys_llama3_1_70b_instruct,LiveBench 240725,48.67,[],livebench_240829.csv llama3_1_70b_instruct_turbo,LiveBench 240725,48.44,[],livebench_240829.csv mistral_large_2407,LiveBench 240725,47.97,[],livebench_240829.csv gemini_1_5_flash_exp_0827,LiveBench 240725,46.87,[],livebench_240829.csv deepseek_coder_v2,LiveBench 240725,46.31,[],livebench_240829.csv deepseek_chat_v2,LiveBench 240725,46.04,[],livebench_240829.csv gpt_4_0613,LiveBench 240725,45.6,[],livebench_240829.csv gemini_1_5_pro_api_0514,LiveBench 240725,44.72,[],livebench_240829.csv gpt_4o_mini_2024_07_18,LiveBench 240725,44.26,[],livebench_240829.csv gemma_2_27b_it,LiveBench 240725,41.26,[],livebench_240829.csv dracarys_72b_instruct,LiveBench 240725,41.2,[],livebench_240829.csv qwen2_72b_instruct,LiveBench 240725,40.15,[],livebench_240829.csv hermes_3_llama3_1_70b,LiveBench 240725,40.05,[],livebench_240829.csv gemini_1_5_flash_api_0514,LiveBench 240725,40.04,[],livebench_240829.csv smaug_qwen2_72b_instruct,LiveBench 240725,39.32,[],livebench_240829.csv mistral_large_2402,LiveBench 240725,39.18,[],livebench_240829.csv claude_3_sonnet_20240229,LiveBench 240725,38.72,[],livebench_240829.csv llama3_70b_instruct,LiveBench 240725,37.73,[],livebench_240829.csv claude_3_haiku_20240307,LiveBench 240725,35.86,[],livebench_240829.csv mixtral_8x22b_instruct_v0_1,LiveBench 240725,35.17,[],livebench_240829.csv phi_3_5_moe_instruct,LiveBench 240725,35.16,[],livebench_240829.csv gpt_3_5_turbo_0125,LiveBench 240725,34.54,[],livebench_240829.csv mistral_small_2402,LiveBench 240725,32.19,[],livebench_240829.csv command_r_plus,LiveBench 240725,32.17,[],livebench_240829.csv gemma_2_9b_it,LiveBench 240725,31.34,[],livebench_240829.csv phi_3_medium_4k_instruct,LiveBench 240725,31.22,[],livebench_240829.csv phi_3_medium_128k_instruct,LiveBench 240725,30.3,[],livebench_240829.csv phi_3_small_128k_instruct,LiveBench 240725,29.97,[],livebench_240829.csv qwen1_5_110b_chat,LiveBench 240725,29.78,[],livebench_240829.csv deepseek_coder_v2_lite_instruct,LiveBench 240725,29.53,[],livebench_240829.csv qwen1_5_72b_chat,LiveBench 240725,29.26,[],livebench_240829.csv open_mistral_nemo,LiveBench 240725,29.17,[],livebench_240829.csv phi_3_5_mini_instruct,LiveBench 240725,28.3,[],livebench_240829.csv llama3_1_8b_instruct_turbo,LiveBench 240725,28.03,[],livebench_240829.csv phi_3_small_8k_instruct,LiveBench 
240725,27.98,[],livebench_240829.csv llama3_8b_instruct,LiveBench 240725,27.56,[],livebench_240829.csv command_r,LiveBench 240725,26.83,[],livebench_240829.csv qwen2_7b_instruct,LiveBench 240725,26.58,[],livebench_240829.csv phi_3_mini_128k_instruct,LiveBench 240725,25.55,[],livebench_240829.csv phi_3_mini_4k_instruct,LiveBench 240725,25.46,[],livebench_240829.csv mathstral_7b_v0_1,LiveBench 240725,24.48,[],livebench_240829.csv openhermes_2_5_mistral_7b,LiveBench 240725,24.13,[],livebench_240829.csv mixtral_8x7b_instruct_v0_1,LiveBench 240725,22.73,[],livebench_240829.csv mistral_7b_instruct_v0_3,LiveBench 240725,21.25,[],livebench_240829.csv mistral_7b_instruct_v0_2,LiveBench 240725,20.05,[],livebench_240829.csv gemma_1_1_7b_it,LiveBench 240725,18.78,[],livebench_240829.csv zephyr_7b_alpha,LiveBench 240725,18.6,[],livebench_240829.csv qwen1_5_7b_chat,LiveBench 240725,17.98,[],livebench_240829.csv deepseek_v2_lite_chat,LiveBench 240725,17.74,[],livebench_240829.csv zephyr_7b_beta,LiveBench 240725,16.72,[],livebench_240829.csv starling_lm_7b_beta,LiveBench 240725,16.6,[],livebench_240829.csv vicuna_7b_v1_5_16k,LiveBench 240725,14.5,[],livebench_240829.csv vicuna_7b_v1_5,LiveBench 240725,12.57,[],livebench_240829.csv llama_2_7b_chat,LiveBench 240725,11.63,[],livebench_240829.csv qwen1_5_4b_chat,LiveBench 240725,11.28,[],livebench_240829.csv qwen2_1_5b_instruct,LiveBench 240725,10.35,[],livebench_240829.csv yi_6b_chat,LiveBench 240725,9.58,[],livebench_240829.csv qwen2_0_5b_instruct,LiveBench 240725,7.68,[],livebench_240829.csv qwen1_5_1_8b_chat,LiveBench 240725,6.04,[],livebench_240829.csv qwen1_5_0_5b_chat,LiveBench 240725,5.21,[],livebench_240829.csv claude_3_5_sonnet_20240620,LiveBench Reasoning,58.67,[],livebench_240829.csv gpt_4o_2024_08_06,LiveBench Reasoning,54.67,[],livebench_240829.csv chatgpt_4o_latest,LiveBench Reasoning,52.0,[],livebench_240829.csv gpt_4o_2024_05_13,LiveBench Reasoning,50.0,[],livebench_240829.csv llama3_1_405b_instruct_turbo,LiveBench Reasoning,53.33,[],livebench_240829.csv gemini_1_5_pro_exp_0827,LiveBench Reasoning,49.33,[],livebench_240829.csv gpt_4_turbo_2024_04_09,LiveBench Reasoning,51.33,[],livebench_240829.csv gemini_1_5_pro_exp_0801,LiveBench Reasoning,48.67,[],livebench_240829.csv claude_3_opus_20240229,LiveBench Reasoning,41.33,[],livebench_240829.csv gpt_4_0125_preview,LiveBench Reasoning,47.33,[],livebench_240829.csv dracarys_llama3_1_70b_instruct,LiveBench Reasoning,44.0,[],livebench_240829.csv llama3_1_70b_instruct_turbo,LiveBench Reasoning,40.67,[],livebench_240829.csv mistral_large_2407,LiveBench Reasoning,42.0,[],livebench_240829.csv gemini_1_5_flash_exp_0827,LiveBench Reasoning,47.33,[],livebench_240829.csv deepseek_coder_v2,LiveBench Reasoning,45.33,[],livebench_240829.csv deepseek_chat_v2,LiveBench Reasoning,40.0,[],livebench_240829.csv gpt_4_0613,LiveBench Reasoning,34.67,[],livebench_240829.csv gemini_1_5_pro_api_0514,LiveBench Reasoning,35.33,[],livebench_240829.csv gpt_4o_mini_2024_07_18,LiveBench Reasoning,35.33,[],livebench_240829.csv gemma_2_27b_it,LiveBench Reasoning,32.0,[],livebench_240829.csv dracarys_72b_instruct,LiveBench Reasoning,40.0,[],livebench_240829.csv qwen2_72b_instruct,LiveBench Reasoning,41.33,[],livebench_240829.csv hermes_3_llama3_1_70b,LiveBench Reasoning,33.33,[],livebench_240829.csv gemini_1_5_flash_api_0514,LiveBench Reasoning,29.33,[],livebench_240829.csv smaug_qwen2_72b_instruct,LiveBench Reasoning,36.0,[],livebench_240829.csv mistral_large_2402,LiveBench Reasoning,36.0,[],livebench_240829.csv 
claude_3_sonnet_20240229,LiveBench Reasoning,28.67,[],livebench_240829.csv llama3_70b_instruct,LiveBench Reasoning,30.67,[],livebench_240829.csv claude_3_haiku_20240307,LiveBench Reasoning,29.33,[],livebench_240829.csv mixtral_8x22b_instruct_v0_1,LiveBench Reasoning,29.33,[],livebench_240829.csv phi_3_5_moe_instruct,LiveBench Reasoning,38.67,[],livebench_240829.csv gpt_3_5_turbo_0125,LiveBench Reasoning,26.67,[],livebench_240829.csv mistral_small_2402,LiveBench Reasoning,26.0,[],livebench_240829.csv command_r_plus,LiveBench Reasoning,28.67,[],livebench_240829.csv gemma_2_9b_it,LiveBench Reasoning,17.33,[],livebench_240829.csv phi_3_medium_4k_instruct,LiveBench Reasoning,36.67,[],livebench_240829.csv phi_3_medium_128k_instruct,LiveBench Reasoning,34.0,[],livebench_240829.csv phi_3_small_128k_instruct,LiveBench Reasoning,30.0,[],livebench_240829.csv qwen1_5_110b_chat,LiveBench Reasoning,30.67,[],livebench_240829.csv deepseek_coder_v2_lite_instruct,LiveBench Reasoning,26.0,[],livebench_240829.csv qwen1_5_72b_chat,LiveBench Reasoning,23.33,[],livebench_240829.csv open_mistral_nemo,LiveBench Reasoning,25.33,[],livebench_240829.csv phi_3_5_mini_instruct,LiveBench Reasoning,33.33,[],livebench_240829.csv llama3_1_8b_instruct_turbo,LiveBench Reasoning,15.33,[],livebench_240829.csv phi_3_small_8k_instruct,LiveBench Reasoning,23.33,[],livebench_240829.csv llama3_8b_instruct,LiveBench Reasoning,24.0,[],livebench_240829.csv command_r,LiveBench Reasoning,25.33,[],livebench_240829.csv qwen2_7b_instruct,LiveBench Reasoning,20.0,[],livebench_240829.csv phi_3_mini_128k_instruct,LiveBench Reasoning,28.0,[],livebench_240829.csv phi_3_mini_4k_instruct,LiveBench Reasoning,28.0,[],livebench_240829.csv mathstral_7b_v0_1,LiveBench Reasoning,18.0,[],livebench_240829.csv openhermes_2_5_mistral_7b,LiveBench Reasoning,20.0,[],livebench_240829.csv mixtral_8x7b_instruct_v0_1,LiveBench Reasoning,17.33,[],livebench_240829.csv mistral_7b_instruct_v0_3,LiveBench Reasoning,16.0,[],livebench_240829.csv mistral_7b_instruct_v0_2,LiveBench Reasoning,14.0,[],livebench_240829.csv gemma_1_1_7b_it,LiveBench Reasoning,14.67,[],livebench_240829.csv zephyr_7b_alpha,LiveBench Reasoning,12.0,[],livebench_240829.csv qwen1_5_7b_chat,LiveBench Reasoning,16.0,[],livebench_240829.csv deepseek_v2_lite_chat,LiveBench Reasoning,16.0,[],livebench_240829.csv zephyr_7b_beta,LiveBench Reasoning,12.67,[],livebench_240829.csv starling_lm_7b_beta,LiveBench Reasoning,18.67,[],livebench_240829.csv vicuna_7b_v1_5_16k,LiveBench Reasoning,15.33,[],livebench_240829.csv vicuna_7b_v1_5,LiveBench Reasoning,12.67,[],livebench_240829.csv llama_2_7b_chat,LiveBench Reasoning,12.0,[],livebench_240829.csv qwen1_5_4b_chat,LiveBench Reasoning,10.67,[],livebench_240829.csv qwen2_1_5b_instruct,LiveBench Reasoning,8.0,[],livebench_240829.csv yi_6b_chat,LiveBench Reasoning,10.67,[],livebench_240829.csv qwen2_0_5b_instruct,LiveBench Reasoning,6.0,[],livebench_240829.csv qwen1_5_1_8b_chat,LiveBench Reasoning,3.33,[],livebench_240829.csv qwen1_5_0_5b_chat,LiveBench Reasoning,2.67,[],livebench_240829.csv claude_3_5_sonnet_20240620,LiveBench Coding,60.85,[],livebench_240829.csv gpt_4o_2024_08_06,LiveBench Coding,51.44,[],livebench_240829.csv chatgpt_4o_latest,LiveBench Coding,47.15,[],livebench_240829.csv gpt_4o_2024_05_13,LiveBench Coding,49.36,[],livebench_240829.csv llama3_1_405b_instruct_turbo,LiveBench Coding,43.8,[],livebench_240829.csv gemini_1_5_pro_exp_0827,LiveBench Coding,40.95,[],livebench_240829.csv gpt_4_turbo_2024_04_09,LiveBench 
Coding,49.0,[],livebench_240829.csv gemini_1_5_pro_exp_0801,LiveBench Coding,41.23,[],livebench_240829.csv claude_3_opus_20240229,LiveBench Coding,38.59,[],livebench_240829.csv gpt_4_0125_preview,LiveBench Coding,41.8,[],livebench_240829.csv dracarys_llama3_1_70b_instruct,LiveBench Coding,35.23,[],livebench_240829.csv llama3_1_70b_instruct_turbo,LiveBench Coding,32.67,[],livebench_240829.csv mistral_large_2407,LiveBench Coding,47.08,[],livebench_240829.csv gemini_1_5_flash_exp_0827,LiveBench Coding,40.59,[],livebench_240829.csv deepseek_coder_v2,LiveBench Coding,41.51,[],livebench_240829.csv deepseek_chat_v2,LiveBench Coding,41.15,[],livebench_240829.csv gpt_4_0613,LiveBench Coding,37.31,[],livebench_240829.csv gemini_1_5_pro_api_0514,LiveBench Coding,32.31,[],livebench_240829.csv gpt_4o_mini_2024_07_18,LiveBench Coding,43.15,[],livebench_240829.csv gemma_2_27b_it,LiveBench Coding,35.95,[],livebench_240829.csv dracarys_72b_instruct,LiveBench Coding,38.95,[],livebench_240829.csv qwen2_72b_instruct,LiveBench Coding,32.38,[],livebench_240829.csv hermes_3_llama3_1_70b,LiveBench Coding,31.38,[],livebench_240829.csv gemini_1_5_flash_api_0514,LiveBench Coding,34.31,[],livebench_240829.csv smaug_qwen2_72b_instruct,LiveBench Coding,38.03,[],livebench_240829.csv mistral_large_2402,LiveBench Coding,27.38,[],livebench_240829.csv claude_3_sonnet_20240229,LiveBench Coding,26.38,[],livebench_240829.csv llama3_70b_instruct,LiveBench Coding,22.03,[],livebench_240829.csv claude_3_haiku_20240307,LiveBench Coding,24.46,[],livebench_240829.csv mixtral_8x22b_instruct_v0_1,LiveBench Coding,32.03,[],livebench_240829.csv phi_3_5_moe_instruct,LiveBench Coding,21.74,[],livebench_240829.csv gpt_3_5_turbo_0125,LiveBench Coding,27.74,[],livebench_240829.csv mistral_small_2402,LiveBench Coding,21.18,[],livebench_240829.csv command_r_plus,LiveBench Coding,19.46,[],livebench_240829.csv gemma_2_9b_it,LiveBench Coding,22.46,[],livebench_240829.csv phi_3_medium_4k_instruct,LiveBench Coding,20.46,[],livebench_240829.csv phi_3_medium_128k_instruct,LiveBench Coding,21.1,[],livebench_240829.csv phi_3_small_128k_instruct,LiveBench Coding,24.57,[],livebench_240829.csv qwen1_5_110b_chat,LiveBench Coding,21.82,[],livebench_240829.csv deepseek_coder_v2_lite_instruct,LiveBench Coding,24.74,[],livebench_240829.csv qwen1_5_72b_chat,LiveBench Coding,22.82,[],livebench_240829.csv open_mistral_nemo,LiveBench Coding,28.74,[],livebench_240829.csv phi_3_5_mini_instruct,LiveBench Coding,15.9,[],livebench_240829.csv llama3_1_8b_instruct_turbo,LiveBench Coding,19.74,[],livebench_240829.csv phi_3_small_8k_instruct,LiveBench Coding,20.26,[],livebench_240829.csv llama3_8b_instruct,LiveBench Coding,19.82,[],livebench_240829.csv command_r,LiveBench Coding,15.26,[],livebench_240829.csv qwen2_7b_instruct,LiveBench Coding,28.95,[],livebench_240829.csv phi_3_mini_128k_instruct,LiveBench Coding,15.04,[],livebench_240829.csv phi_3_mini_4k_instruct,LiveBench Coding,15.04,[],livebench_240829.csv mathstral_7b_v0_1,LiveBench Coding,14.54,[],livebench_240829.csv openhermes_2_5_mistral_7b,LiveBench Coding,13.26,[],livebench_240829.csv mixtral_8x7b_instruct_v0_1,LiveBench Coding,11.62,[],livebench_240829.csv mistral_7b_instruct_v0_3,LiveBench Coding,10.97,[],livebench_240829.csv mistral_7b_instruct_v0_2,LiveBench Coding,13.9,[],livebench_240829.csv gemma_1_1_7b_it,LiveBench Coding,9.62,[],livebench_240829.csv zephyr_7b_alpha,LiveBench Coding,12.26,[],livebench_240829.csv qwen1_5_7b_chat,LiveBench Coding,9.41,[],livebench_240829.csv deepseek_v2_lite_chat,LiveBench 
Coding,7.13,[],livebench_240829.csv zephyr_7b_beta,LiveBench Coding,8.05,[],livebench_240829.csv starling_lm_7b_beta,LiveBench Coding,18.46,[],livebench_240829.csv vicuna_7b_v1_5_16k,LiveBench Coding,2.64,[],livebench_240829.csv vicuna_7b_v1_5,LiveBench Coding,1.92,[],livebench_240829.csv llama_2_7b_chat,LiveBench Coding,1.28,[],livebench_240829.csv qwen1_5_4b_chat,LiveBench Coding,4.49,[],livebench_240829.csv qwen2_1_5b_instruct,LiveBench Coding,5.21,[],livebench_240829.csv yi_6b_chat,LiveBench Coding,2.0,[],livebench_240829.csv qwen2_0_5b_instruct,LiveBench Coding,1.28,[],livebench_240829.csv qwen1_5_1_8b_chat,LiveBench Coding,0.0,[],livebench_240829.csv qwen1_5_0_5b_chat,LiveBench Coding,0.0,[],livebench_240829.csv claude_3_5_sonnet_20240620,LiveBench Mathematics,53.75,[],livebench_240829.csv gpt_4o_2024_08_06,LiveBench Mathematics,52.29,[],livebench_240829.csv chatgpt_4o_latest,LiveBench Mathematics,52.19,[],livebench_240829.csv gpt_4o_2024_05_13,LiveBench Mathematics,49.88,[],livebench_240829.csv llama3_1_405b_instruct_turbo,LiveBench Mathematics,46.55,[],livebench_240829.csv gemini_1_5_pro_exp_0827,LiveBench Mathematics,56.28,[],livebench_240829.csv gpt_4_turbo_2024_04_09,LiveBench Mathematics,48.99,[],livebench_240829.csv gemini_1_5_pro_exp_0801,LiveBench Mathematics,47.46,[],livebench_240829.csv claude_3_opus_20240229,LiveBench Mathematics,46.54,[],livebench_240829.csv gpt_4_0125_preview,LiveBench Mathematics,42.75,[],livebench_240829.csv dracarys_llama3_1_70b_instruct,LiveBench Mathematics,45.68,[],livebench_240829.csv llama3_1_70b_instruct_turbo,LiveBench Mathematics,45.58,[],livebench_240829.csv mistral_large_2407,LiveBench Mathematics,40.48,[],livebench_240829.csv gemini_1_5_flash_exp_0827,LiveBench Mathematics,36.29,[],livebench_240829.csv deepseek_coder_v2,LiveBench Mathematics,52.54,[],livebench_240829.csv deepseek_chat_v2,LiveBench Mathematics,52.11,[],livebench_240829.csv gpt_4_0613,LiveBench Mathematics,36.22,[],livebench_240829.csv gemini_1_5_pro_api_0514,LiveBench Mathematics,42.42,[],livebench_240829.csv gpt_4o_mini_2024_07_18,LiveBench Mathematics,41.58,[],livebench_240829.csv gemma_2_27b_it,LiveBench Mathematics,36.23,[],livebench_240829.csv dracarys_72b_instruct,LiveBench Mathematics,42.77,[],livebench_240829.csv qwen2_72b_instruct,LiveBench Mathematics,43.44,[],livebench_240829.csv hermes_3_llama3_1_70b,LiveBench Mathematics,28.32,[],livebench_240829.csv gemini_1_5_flash_api_0514,LiveBench Mathematics,38.89,[],livebench_240829.csv smaug_qwen2_72b_instruct,LiveBench Mathematics,40.67,[],livebench_240829.csv mistral_large_2402,LiveBench Mathematics,32.2,[],livebench_240829.csv claude_3_sonnet_20240229,LiveBench Mathematics,29.65,[],livebench_240829.csv llama3_70b_instruct,LiveBench Mathematics,32.31,[],livebench_240829.csv claude_3_haiku_20240307,LiveBench Mathematics,25.72,[],livebench_240829.csv mixtral_8x22b_instruct_v0_1,LiveBench Mathematics,28.33,[],livebench_240829.csv phi_3_5_moe_instruct,LiveBench Mathematics,33.3,[],livebench_240829.csv gpt_3_5_turbo_0125,LiveBench Mathematics,26.93,[],livebench_240829.csv mistral_small_2402,LiveBench Mathematics,28.15,[],livebench_240829.csv command_r_plus,LiveBench Mathematics,24.85,[],livebench_240829.csv gemma_2_9b_it,LiveBench Mathematics,23.98,[],livebench_240829.csv phi_3_medium_4k_instruct,LiveBench Mathematics,31.36,[],livebench_240829.csv phi_3_medium_128k_instruct,LiveBench Mathematics,25.64,[],livebench_240829.csv phi_3_small_128k_instruct,LiveBench Mathematics,28.97,[],livebench_240829.csv 
qwen1_5_110b_chat,LiveBench Mathematics,26.28,[],livebench_240829.csv deepseek_coder_v2_lite_instruct,LiveBench Mathematics,34.44,[],livebench_240829.csv qwen1_5_72b_chat,LiveBench Mathematics,26.82,[],livebench_240829.csv open_mistral_nemo,LiveBench Mathematics,21.66,[],livebench_240829.csv phi_3_5_mini_instruct,LiveBench Mathematics,22.2,[],livebench_240829.csv llama3_1_8b_instruct_turbo,LiveBench Mathematics,24.37,[],livebench_240829.csv phi_3_small_8k_instruct,LiveBench Mathematics,23.73,[],livebench_240829.csv llama3_8b_instruct,LiveBench Mathematics,19.66,[],livebench_240829.csv command_r,LiveBench Mathematics,16.92,[],livebench_240829.csv qwen2_7b_instruct,LiveBench Mathematics,26.87,[],livebench_240829.csv phi_3_mini_128k_instruct,LiveBench Mathematics,17.06,[],livebench_240829.csv phi_3_mini_4k_instruct,LiveBench Mathematics,20.84,[],livebench_240829.csv mathstral_7b_v0_1,LiveBench Mathematics,17.84,[],livebench_240829.csv openhermes_2_5_mistral_7b,LiveBench Mathematics,20.45,[],livebench_240829.csv mixtral_8x7b_instruct_v0_1,LiveBench Mathematics,20.71,[],livebench_240829.csv mistral_7b_instruct_v0_3,LiveBench Mathematics,14.56,[],livebench_240829.csv mistral_7b_instruct_v0_2,LiveBench Mathematics,17.08,[],livebench_240829.csv gemma_1_1_7b_it,LiveBench Mathematics,15.21,[],livebench_240829.csv zephyr_7b_alpha,LiveBench Mathematics,9.96,[],livebench_240829.csv qwen1_5_7b_chat,LiveBench Mathematics,15.29,[],livebench_240829.csv deepseek_v2_lite_chat,LiveBench Mathematics,14.08,[],livebench_240829.csv zephyr_7b_beta,LiveBench Mathematics,11.23,[],livebench_240829.csv starling_lm_7b_beta,LiveBench Mathematics,14.86,[],livebench_240829.csv vicuna_7b_v1_5_16k,LiveBench Mathematics,9.04,[],livebench_240829.csv vicuna_7b_v1_5,LiveBench Mathematics,7.1,[],livebench_240829.csv llama_2_7b_chat,LiveBench Mathematics,4.78,[],livebench_240829.csv qwen1_5_4b_chat,LiveBench Mathematics,9.86,[],livebench_240829.csv qwen2_1_5b_instruct,LiveBench Mathematics,9.94,[],livebench_240829.csv yi_6b_chat,LiveBench Mathematics,8.53,[],livebench_240829.csv qwen2_0_5b_instruct,LiveBench Mathematics,7.35,[],livebench_240829.csv qwen1_5_1_8b_chat,LiveBench Mathematics,3.53,[],livebench_240829.csv qwen1_5_0_5b_chat,LiveBench Mathematics,4.43,[],livebench_240829.csv claude_3_5_sonnet_20240620,LiveBench Data Analysis,56.74,[],livebench_240829.csv gpt_4o_2024_08_06,LiveBench Data Analysis,52.89,[],livebench_240829.csv chatgpt_4o_latest,LiveBench Data Analysis,54.43,[],livebench_240829.csv gpt_4o_2024_05_13,LiveBench Data Analysis,52.41,[],livebench_240829.csv llama3_1_405b_instruct_turbo,LiveBench Data Analysis,53.51,[],livebench_240829.csv gemini_1_5_pro_exp_0827,LiveBench Data Analysis,50.83,[],livebench_240829.csv gpt_4_turbo_2024_04_09,LiveBench Data Analysis,51.32,[],livebench_240829.csv gemini_1_5_pro_exp_0801,LiveBench Data Analysis,50.15,[],livebench_240829.csv claude_3_opus_20240229,LiveBench Data Analysis,54.32,[],livebench_240829.csv gpt_4_0125_preview,LiveBench Data Analysis,54.06,[],livebench_240829.csv dracarys_llama3_1_70b_instruct,LiveBench Data Analysis,47.99,[],livebench_240829.csv llama3_1_70b_instruct_turbo,LiveBench Data Analysis,50.29,[],livebench_240829.csv mistral_large_2407,LiveBench Data Analysis,46.61,[],livebench_240829.csv gemini_1_5_flash_exp_0827,LiveBench Data Analysis,47.87,[],livebench_240829.csv deepseek_coder_v2,LiveBench Data Analysis,38.25,[],livebench_240829.csv deepseek_chat_v2,LiveBench Data Analysis,45.59,[],livebench_240829.csv gpt_4_0613,LiveBench Data 
Analysis,44.03,[],livebench_240829.csv gemini_1_5_pro_api_0514,LiveBench Data Analysis,52.81,[],livebench_240829.csv gpt_4o_mini_2024_07_18,LiveBench Data Analysis,44.52,[],livebench_240829.csv gemma_2_27b_it,LiveBench Data Analysis,43.58,[],livebench_240829.csv dracarys_72b_instruct,LiveBench Data Analysis,26.24,[],livebench_240829.csv qwen2_72b_instruct,LiveBench Data Analysis,26.24,[],livebench_240829.csv hermes_3_llama3_1_70b,LiveBench Data Analysis,48.11,[],livebench_240829.csv gemini_1_5_flash_api_0514,LiveBench Data Analysis,44.03,[],livebench_240829.csv smaug_qwen2_72b_instruct,LiveBench Data Analysis,26.19,[],livebench_240829.csv mistral_large_2402,LiveBench Data Analysis,42.55,[],livebench_240829.csv claude_3_sonnet_20240229,LiveBench Data Analysis,44.56,[],livebench_240829.csv llama3_70b_instruct,LiveBench Data Analysis,43.75,[],livebench_240829.csv claude_3_haiku_20240307,LiveBench Data Analysis,41.54,[],livebench_240829.csv mixtral_8x22b_instruct_v0_1,LiveBench Data Analysis,31.67,[],livebench_240829.csv phi_3_5_moe_instruct,LiveBench Data Analysis,40.46,[],livebench_240829.csv gpt_3_5_turbo_0125,LiveBench Data Analysis,41.21,[],livebench_240829.csv mistral_small_2402,LiveBench Data Analysis,31.88,[],livebench_240829.csv command_r_plus,LiveBench Data Analysis,24.6,[],livebench_240829.csv gemma_2_9b_it,LiveBench Data Analysis,35.06,[],livebench_240829.csv phi_3_medium_4k_instruct,LiveBench Data Analysis,31.63,[],livebench_240829.csv phi_3_medium_128k_instruct,LiveBench Data Analysis,32.12,[],livebench_240829.csv phi_3_small_128k_instruct,LiveBench Data Analysis,27.26,[],livebench_240829.csv qwen1_5_110b_chat,LiveBench Data Analysis,31.45,[],livebench_240829.csv deepseek_coder_v2_lite_instruct,LiveBench Data Analysis,33.0,[],livebench_240829.csv qwen1_5_72b_chat,LiveBench Data Analysis,32.98,[],livebench_240829.csv open_mistral_nemo,LiveBench Data Analysis,33.35,[],livebench_240829.csv phi_3_5_mini_instruct,LiveBench Data Analysis,30.43,[],livebench_240829.csv llama3_1_8b_instruct_turbo,LiveBench Data Analysis,32.15,[],livebench_240829.csv phi_3_small_8k_instruct,LiveBench Data Analysis,29.62,[],livebench_240829.csv llama3_8b_instruct,LiveBench Data Analysis,26.0,[],livebench_240829.csv command_r,LiveBench Data Analysis,31.69,[],livebench_240829.csv qwen2_7b_instruct,LiveBench Data Analysis,28.75,[],livebench_240829.csv phi_3_mini_128k_instruct,LiveBench Data Analysis,34.02,[],livebench_240829.csv phi_3_mini_4k_instruct,LiveBench Data Analysis,29.55,[],livebench_240829.csv mathstral_7b_v0_1,LiveBench Data Analysis,27.89,[],livebench_240829.csv openhermes_2_5_mistral_7b,LiveBench Data Analysis,26.92,[],livebench_240829.csv mixtral_8x7b_instruct_v0_1,LiveBench Data Analysis,28.13,[],livebench_240829.csv mistral_7b_instruct_v0_3,LiveBench Data Analysis,21.77,[],livebench_240829.csv mistral_7b_instruct_v0_2,LiveBench Data Analysis,14.62,[],livebench_240829.csv gemma_1_1_7b_it,LiveBench Data Analysis,18.17,[],livebench_240829.csv zephyr_7b_alpha,LiveBench Data Analysis,17.4,[],livebench_240829.csv qwen1_5_7b_chat,LiveBench Data Analysis,16.9,[],livebench_240829.csv deepseek_v2_lite_chat,LiveBench Data Analysis,18.19,[],livebench_240829.csv zephyr_7b_beta,LiveBench Data Analysis,15.75,[],livebench_240829.csv starling_lm_7b_beta,LiveBench Data Analysis,2.0,[],livebench_240829.csv vicuna_7b_v1_5_16k,LiveBench Data Analysis,9.93,[],livebench_240829.csv vicuna_7b_v1_5,LiveBench Data Analysis,3.33,[],livebench_240829.csv llama_2_7b_chat,LiveBench Data Analysis,0.0,[],livebench_240829.csv 
qwen1_5_4b_chat,LiveBench Data Analysis,9.13,[],livebench_240829.csv qwen2_1_5b_instruct,LiveBench Data Analysis,10.01,[],livebench_240829.csv yi_6b_chat,LiveBench Data Analysis,4.38,[],livebench_240829.csv qwen2_0_5b_instruct,LiveBench Data Analysis,2.0,[],livebench_240829.csv qwen1_5_1_8b_chat,LiveBench Data Analysis,3.33,[],livebench_240829.csv qwen1_5_0_5b_chat,LiveBench Data Analysis,0.0,[],livebench_240829.csv claude_3_5_sonnet_20240620,LiveBench Language,56.94,[],livebench_240829.csv gpt_4o_2024_08_06,LiveBench Language,54.37,[],livebench_240829.csv chatgpt_4o_latest,LiveBench Language,49.95,[],livebench_240829.csv gpt_4o_2024_05_13,LiveBench Language,53.94,[],livebench_240829.csv llama3_1_405b_instruct_turbo,LiveBench Language,49.85,[],livebench_240829.csv gemini_1_5_pro_exp_0827,LiveBench Language,49.31,[],livebench_240829.csv gpt_4_turbo_2024_04_09,LiveBench Language,45.26,[],livebench_240829.csv gemini_1_5_pro_exp_0801,LiveBench Language,46.96,[],livebench_240829.csv claude_3_opus_20240229,LiveBench Language,51.72,[],livebench_240829.csv gpt_4_0125_preview,LiveBench Language,43.55,[],livebench_240829.csv dracarys_llama3_1_70b_instruct,LiveBench Language,41.77,[],livebench_240829.csv llama3_1_70b_instruct_turbo,LiveBench Language,42.36,[],livebench_240829.csv mistral_large_2407,LiveBench Language,39.79,[],livebench_240829.csv gemini_1_5_flash_exp_0827,LiveBench Language,31.04,[],livebench_240829.csv deepseek_coder_v2,LiveBench Language,33.04,[],livebench_240829.csv deepseek_chat_v2,LiveBench Language,32.77,[],livebench_240829.csv gpt_4_0613,LiveBench Language,49.57,[],livebench_240829.csv gemini_1_5_pro_api_0514,LiveBench Language,38.25,[],livebench_240829.csv gpt_4o_mini_2024_07_18,LiveBench Language,35.28,[],livebench_240829.csv gemma_2_27b_it,LiveBench Language,32.4,[],livebench_240829.csv dracarys_72b_instruct,LiveBench Language,31.17,[],livebench_240829.csv qwen2_72b_instruct,LiveBench Language,29.21,[],livebench_240829.csv hermes_3_llama3_1_70b,LiveBench Language,43.77,[],livebench_240829.csv gemini_1_5_flash_api_0514,LiveBench Language,30.69,[],livebench_240829.csv smaug_qwen2_72b_instruct,LiveBench Language,30.03,[],livebench_240829.csv mistral_large_2402,LiveBench Language,28.74,[],livebench_240829.csv claude_3_sonnet_20240229,LiveBench Language,38.08,[],livebench_240829.csv llama3_70b_instruct,LiveBench Language,34.11,[],livebench_240829.csv claude_3_haiku_20240307,LiveBench Language,30.07,[],livebench_240829.csv mixtral_8x22b_instruct_v0_1,LiveBench Language,26.48,[],livebench_240829.csv phi_3_5_moe_instruct,LiveBench Language,17.07,[],livebench_240829.csv gpt_3_5_turbo_0125,LiveBench Language,24.22,[],livebench_240829.csv mistral_small_2402,LiveBench Language,22.06,[],livebench_240829.csv command_r_plus,LiveBench Language,23.92,[],livebench_240829.csv gemma_2_9b_it,LiveBench Language,27.64,[],livebench_240829.csv phi_3_medium_4k_instruct,LiveBench Language,13.91,[],livebench_240829.csv phi_3_medium_128k_instruct,LiveBench Language,12.76,[],livebench_240829.csv phi_3_small_128k_instruct,LiveBench Language,15.53,[],livebench_240829.csv qwen1_5_110b_chat,LiveBench Language,13.22,[],livebench_240829.csv deepseek_coder_v2_lite_instruct,LiveBench Language,10.64,[],livebench_240829.csv qwen1_5_72b_chat,LiveBench Language,11.37,[],livebench_240829.csv open_mistral_nemo,LiveBench Language,14.15,[],livebench_240829.csv phi_3_5_mini_instruct,LiveBench Language,9.67,[],livebench_240829.csv llama3_1_8b_instruct_turbo,LiveBench Language,20.05,[],livebench_240829.csv 
phi_3_small_8k_instruct,LiveBench Language,15.13,[],livebench_240829.csv llama3_8b_instruct,LiveBench Language,18.72,[],livebench_240829.csv command_r,LiveBench Language,14.64,[],livebench_240829.csv qwen2_7b_instruct,LiveBench Language,10.21,[],livebench_240829.csv phi_3_mini_128k_instruct,LiveBench Language,7.76,[],livebench_240829.csv phi_3_mini_4k_instruct,LiveBench Language,8.06,[],livebench_240829.csv mathstral_7b_v0_1,LiveBench Language,15.37,[],livebench_240829.csv openhermes_2_5_mistral_7b,LiveBench Language,11.37,[],livebench_240829.csv mixtral_8x7b_instruct_v0_1,LiveBench Language,13.76,[],livebench_240829.csv mistral_7b_instruct_v0_3,LiveBench Language,11.85,[],livebench_240829.csv mistral_7b_instruct_v0_2,LiveBench Language,9.05,[],livebench_240829.csv gemma_1_1_7b_it,LiveBench Language,10.65,[],livebench_240829.csv zephyr_7b_alpha,LiveBench Language,7.2,[],livebench_240829.csv qwen1_5_7b_chat,LiveBench Language,6.18,[],livebench_240829.csv deepseek_v2_lite_chat,LiveBench Language,9.2,[],livebench_240829.csv zephyr_7b_beta,LiveBench Language,4.28,[],livebench_240829.csv starling_lm_7b_beta,LiveBench Language,7.26,[],livebench_240829.csv vicuna_7b_v1_5_16k,LiveBench Language,7.92,[],livebench_240829.csv vicuna_7b_v1_5,LiveBench Language,8.66,[],livebench_240829.csv llama_2_7b_chat,LiveBench Language,6.86,[],livebench_240829.csv qwen1_5_4b_chat,LiveBench Language,5.8,[],livebench_240829.csv qwen2_1_5b_instruct,LiveBench Language,3.05,[],livebench_240829.csv yi_6b_chat,LiveBench Language,4.69,[],livebench_240829.csv qwen2_0_5b_instruct,LiveBench Language,2.8,[],livebench_240829.csv qwen1_5_1_8b_chat,LiveBench Language,3.16,[],livebench_240829.csv qwen1_5_0_5b_chat,LiveBench Language,2.88,[],livebench_240829.csv claude_3_5_sonnet_20240620,LiveBench Instruction Following,72.3,[],livebench_240829.csv gpt_4o_2024_08_06,LiveBench Instruction Following,74.58,[],livebench_240829.csv chatgpt_4o_latest,LiveBench Instruction Following,72.52,[],livebench_240829.csv gpt_4o_2024_05_13,LiveBench Instruction Following,72.17,[],livebench_240829.csv llama3_1_405b_instruct_turbo,LiveBench Instruction Following,78.47,[],livebench_240829.csv gemini_1_5_pro_exp_0827,LiveBench Instruction Following,75.95,[],livebench_240829.csv gpt_4_turbo_2024_04_09,LiveBench Instruction Following,71.39,[],livebench_240829.csv gemini_1_5_pro_exp_0801,LiveBench Instruction Following,78.84,[],livebench_240829.csv claude_3_opus_20240229,LiveBench Instruction Following,70.87,[],livebench_240829.csv gpt_4_0125_preview,LiveBench Instruction Following,63.92,[],livebench_240829.csv dracarys_llama3_1_70b_instruct,LiveBench Instruction Following,77.37,[],livebench_240829.csv llama3_1_70b_instruct_turbo,LiveBench Instruction Following,79.08,[],livebench_240829.csv mistral_large_2407,LiveBench Instruction Following,71.85,[],livebench_240829.csv gemini_1_5_flash_exp_0827,LiveBench Instruction Following,78.11,[],livebench_240829.csv deepseek_coder_v2,LiveBench Instruction Following,67.18,[],livebench_240829.csv deepseek_chat_v2,LiveBench Instruction Following,64.61,[],livebench_240829.csv gpt_4_0613,LiveBench Instruction Following,71.79,[],livebench_240829.csv gemini_1_5_pro_api_0514,LiveBench Instruction Following,67.2,[],livebench_240829.csv gpt_4o_mini_2024_07_18,LiveBench Instruction Following,65.68,[],livebench_240829.csv gemma_2_27b_it,LiveBench Instruction Following,67.37,[],livebench_240829.csv dracarys_72b_instruct,LiveBench Instruction Following,68.08,[],livebench_240829.csv qwen2_72b_instruct,LiveBench Instruction 
Following,68.27,[],livebench_240829.csv hermes_3_llama3_1_70b,LiveBench Instruction Following,55.37,[],livebench_240829.csv gemini_1_5_flash_api_0514,LiveBench Instruction Following,63.01,[],livebench_240829.csv smaug_qwen2_72b_instruct,LiveBench Instruction Following,65.0,[],livebench_240829.csv mistral_large_2402,LiveBench Instruction Following,68.19,[],livebench_240829.csv claude_3_sonnet_20240229,LiveBench Instruction Following,65.0,[],livebench_240829.csv llama3_70b_instruct,LiveBench Instruction Following,63.5,[],livebench_240829.csv claude_3_haiku_20240307,LiveBench Instruction Following,64.03,[],livebench_240829.csv mixtral_8x22b_instruct_v0_1,LiveBench Instruction Following,63.17,[],livebench_240829.csv phi_3_5_moe_instruct,LiveBench Instruction Following,59.73,[],livebench_240829.csv gpt_3_5_turbo_0125,LiveBench Instruction Following,60.47,[],livebench_240829.csv mistral_small_2402,LiveBench Instruction Following,63.91,[],livebench_240829.csv command_r_plus,LiveBench Instruction Following,71.51,[],livebench_240829.csv gemma_2_9b_it,LiveBench Instruction Following,61.55,[],livebench_240829.csv phi_3_medium_4k_instruct,LiveBench Instruction Following,53.3,[],livebench_240829.csv phi_3_medium_128k_instruct,LiveBench Instruction Following,56.15,[],livebench_240829.csv phi_3_small_128k_instruct,LiveBench Instruction Following,53.47,[],livebench_240829.csv qwen1_5_110b_chat,LiveBench Instruction Following,55.26,[],livebench_240829.csv deepseek_coder_v2_lite_instruct,LiveBench Instruction Following,48.34,[],livebench_240829.csv qwen1_5_72b_chat,LiveBench Instruction Following,58.25,[],livebench_240829.csv open_mistral_nemo,LiveBench Instruction Following,51.8,[],livebench_240829.csv phi_3_5_mini_instruct,LiveBench Instruction Following,58.3,[],livebench_240829.csv llama3_1_8b_instruct_turbo,LiveBench Instruction Following,56.53,[],livebench_240829.csv phi_3_small_8k_instruct,LiveBench Instruction Following,55.81,[],livebench_240829.csv llama3_8b_instruct,LiveBench Instruction Following,57.14,[],livebench_240829.csv command_r,LiveBench Instruction Following,57.16,[],livebench_240829.csv qwen2_7b_instruct,LiveBench Instruction Following,44.74,[],livebench_240829.csv phi_3_mini_128k_instruct,LiveBench Instruction Following,51.4,[],livebench_240829.csv phi_3_mini_4k_instruct,LiveBench Instruction Following,51.25,[],livebench_240829.csv mathstral_7b_v0_1,LiveBench Instruction Following,53.25,[],livebench_240829.csv openhermes_2_5_mistral_7b,LiveBench Instruction Following,52.78,[],livebench_240829.csv mixtral_8x7b_instruct_v0_1,LiveBench Instruction Following,44.81,[],livebench_240829.csv mistral_7b_instruct_v0_3,LiveBench Instruction Following,52.37,[],livebench_240829.csv mistral_7b_instruct_v0_2,LiveBench Instruction Following,51.65,[],livebench_240829.csv gemma_1_1_7b_it,LiveBench Instruction Following,44.34,[],livebench_240829.csv zephyr_7b_alpha,LiveBench Instruction Following,52.79,[],livebench_240829.csv qwen1_5_7b_chat,LiveBench Instruction Following,44.12,[],livebench_240829.csv deepseek_v2_lite_chat,LiveBench Instruction Following,41.83,[],livebench_240829.csv zephyr_7b_beta,LiveBench Instruction Following,48.32,[],livebench_240829.csv starling_lm_7b_beta,LiveBench Instruction Following,38.32,[],livebench_240829.csv vicuna_7b_v1_5_16k,LiveBench Instruction Following,42.12,[],livebench_240829.csv vicuna_7b_v1_5,LiveBench Instruction Following,41.75,[],livebench_240829.csv llama_2_7b_chat,LiveBench Instruction Following,44.88,[],livebench_240829.csv qwen1_5_4b_chat,LiveBench 
Instruction Following,27.75,[],livebench_240829.csv qwen2_1_5b_instruct,LiveBench Instruction Following,25.9,[],livebench_240829.csv yi_6b_chat,LiveBench Instruction Following,27.22,[],livebench_240829.csv qwen2_0_5b_instruct,LiveBench Instruction Following,26.63,[],livebench_240829.csv qwen1_5_1_8b_chat,LiveBench Instruction Following,22.9,[],livebench_240829.csv qwen1_5_0_5b_chat,LiveBench Instruction Following,21.3,[],livebench_240829.csv gemini_1_5_pro_exp_0801,Enkrypt AI Safety,84.0,[],enkrypt_ai_safety_240916.csv gemini_1_5_pro_latest,Enkrypt AI Safety,81.0,[],enkrypt_ai_safety_240916.csv gemma_2_27b_it,Enkrypt AI Safety,79.0,[],enkrypt_ai_safety_240916.csv reflection_llama3_1_70b,Enkrypt AI Safety,81.0,[],enkrypt_ai_safety_240916.csv llama_2_7b_chat_gguf_8bit,Enkrypt AI Safety,80.0,[],enkrypt_ai_safety_240916.csv llama_2_7b_chat_gguf_4bit,Enkrypt AI Safety,80.0,[],enkrypt_ai_safety_240916.csv smollm_360m_instruct,Enkrypt AI Safety,80.0,[],enkrypt_ai_safety_240916.csv llama_2_7b_chat,Enkrypt AI Safety,78.0,[],enkrypt_ai_safety_240916.csv flan_flan-ul2,Enkrypt AI Safety,76.0,[],enkrypt_ai_safety_240916.csv o1_preview,Enkrypt AI Safety,76.0,[],enkrypt_ai_safety_240916.csv llama3_8b_instruct_rr,Enkrypt AI Safety,81.0,[],enkrypt_ai_safety_240916.csv claude_3_opus_20240229,Enkrypt AI Safety,75.0,[],enkrypt_ai_safety_240916.csv gpt_4_0125_preview,Enkrypt AI Safety,79.0,[],enkrypt_ai_safety_240916.csv sarvam_2b_v0_5,Enkrypt AI Safety,75.0,[],enkrypt_ai_safety_240916.csv llama3_8b_instruct_mopeymule,Enkrypt AI Safety,73.0,[],enkrypt_ai_safety_240916.csv claude_3_5_sonnet_20240620,Enkrypt AI Safety,71.0,[],enkrypt_ai_safety_240916.csv sea_lion_7b_instruct,Enkrypt AI Safety,73.0,[],enkrypt_ai_safety_240916.csv claude_instant_1_2,Enkrypt AI Safety,76.0,[],enkrypt_ai_safety_240916.csv gpt_4_turbo_2024_04_09,Enkrypt AI Safety,75.0,[],enkrypt_ai_safety_240916.csv llama3_1_8b_instruct_turbo,Enkrypt AI Safety,70.0,[],enkrypt_ai_safety_240916.csv rakutenai_7b_chat,Enkrypt AI Safety,68.0,[],enkrypt_ai_safety_240916.csv gemma_2_2b_it,Enkrypt AI Safety,67.0,[],enkrypt_ai_safety_240916.csv llama3_8b_instruct,Enkrypt AI Safety,72.0,[],enkrypt_ai_safety_240916.csv o1_mini,Enkrypt AI Safety,71.0,[],enkrypt_ai_safety_240916.csv mistral_7b_v0_1,Enkrypt AI Safety,70.0,[],enkrypt_ai_safety_240916.csv llama_2_13b_chat,Enkrypt AI Safety,72.0,[],enkrypt_ai_safety_240916.csv h2o_danube3_500m_chat,Enkrypt AI Safety,68.0,[],enkrypt_ai_safety_240916.csv llama_2_70b_chat,Enkrypt AI Safety,68.0,[],enkrypt_ai_safety_240916.csv gemma_2_9b_it,Enkrypt AI Safety,67.0,[],enkrypt_ai_safety_240916.csv internlm2_chat_20b,Enkrypt AI Safety,59.0,[],enkrypt_ai_safety_240916.csv gemma_2_9b,Enkrypt AI Safety,64.0,[],enkrypt_ai_safety_240916.csv nexusraven_v2_13b,Enkrypt AI Safety,63.0,[],enkrypt_ai_safety_240916.csv komodo_7b_base,Enkrypt AI Safety,61.0,[],enkrypt_ai_safety_240916.csv gpt_4o,Enkrypt AI Safety,64.0,[],enkrypt_ai_safety_240916.csv phi_2,Enkrypt AI Safety,58.0,[],enkrypt_ai_safety_240916.csv phi3_medium_128k,Enkrypt AI Safety,61.0,[],enkrypt_ai_safety_240916.csv gemma_7b_it,Enkrypt AI Safety,61.0,[],enkrypt_ai_safety_240916.csv claude_3_haiku_20240307,Enkrypt AI Safety,67.0,[],enkrypt_ai_safety_240916.csv llama3_1_405b_instruct_turbo,Enkrypt AI Safety,61.0,[],enkrypt_ai_safety_240916.csv smollm_1_7b_instruct,Enkrypt AI Safety,60.0,[],enkrypt_ai_safety_240916.csv gpt_4o_2024_08_06,Enkrypt AI Safety,60.0,[],enkrypt_ai_safety_240916.csv powerlm_3b,Enkrypt AI Safety,53.0,[],enkrypt_ai_safety_240916.csv 
llama3_70b_instruct,Enkrypt AI Safety,62.0,[],enkrypt_ai_safety_240916.csv starling_lm_7b_beta_gguf_4bit,Enkrypt AI Safety,54.0,[],enkrypt_ai_safety_240916.csv smaug_72b_v0_1,Enkrypt AI Safety,61.0,[],enkrypt_ai_safety_240916.csv gpt_3_5_turbo,Enkrypt AI Safety,62.0,[],enkrypt_ai_safety_240916.csv codellama_7b_instruct,Enkrypt AI Safety,56.0,[],enkrypt_ai_safety_240916.csv smaug_llama3_70b_instruct,Enkrypt AI Safety,56.0,[],enkrypt_ai_safety_240916.csv mixtral_8x7b_instruct_v0_1,Enkrypt AI Safety,54.0,[],enkrypt_ai_safety_240916.csv jamba_instruct_preview,Enkrypt AI Safety,51.0,[],enkrypt_ai_safety_240916.csv mixtral_8x22b_instruct_v0_1,Enkrypt AI Safety,53.0,[],enkrypt_ai_safety_240916.csv seallm_7b_v2,Enkrypt AI Safety,58.0,[],enkrypt_ai_safety_240916.csv qwen2_72b_instruct,Enkrypt AI Safety,55.0,[],enkrypt_ai_safety_240916.csv olmo_7b_instruct,Enkrypt AI Safety,47.0,[],enkrypt_ai_safety_240916.csv phi_3_mini_128k_instruct,Enkrypt AI Safety,55.0,[],enkrypt_ai_safety_240916.csv dbrx_instructruct,Enkrypt AI Safety,51.0,[],enkrypt_ai_safety_240916.csv falcon_mamba_7b_instruct,Enkrypt AI Safety,49.0,[],enkrypt_ai_safety_240916.csv gpt_4o_mini,Enkrypt AI Safety,55.0,[],enkrypt_ai_safety_240916.csv phi_3_5_moe_instruct,Enkrypt AI Safety,54.0,[],enkrypt_ai_safety_240916.csv qwen1_5_14b_chat,Enkrypt AI Safety,51.0,[],enkrypt_ai_safety_240916.csv c4ai_command_r_plus,Enkrypt AI Safety,48.0,[],enkrypt_ai_safety_240916.csv smaug_34b_v0_1,Enkrypt AI Safety,56.0,[],enkrypt_ai_safety_240916.csv qwen2_7b_instruct,Enkrypt AI Safety,50.0,[],enkrypt_ai_safety_240916.csv mistral_7b_instruct_v0_2_gguf_4bit,Enkrypt AI Safety,48.0,[],enkrypt_ai_safety_240916.csv llama3_1_70b_instruct_turbo,Enkrypt AI Safety,48.0,[],enkrypt_ai_safety_240916.csv k2_chat,Enkrypt AI Safety,50.0,[],enkrypt_ai_safety_240916.csv phi_3_mini_4k_instruct,Enkrypt AI Safety,50.0,[],enkrypt_ai_safety_240916.csv starling_lm_7b_beta,Enkrypt AI Safety,51.0,[],enkrypt_ai_safety_240916.csv olmoe_1b_7b_0924_instruct,Enkrypt AI Safety,49.0,[],enkrypt_ai_safety_240916.csv mistral_7b_instruct_v0_2_gguf_8bit,Enkrypt AI Safety,48.0,[],enkrypt_ai_safety_240916.csv h2o_danube3_4b_chat,Enkrypt AI Safety,47.0,[],enkrypt_ai_safety_240916.csv rakutenai_7b_instruct,Enkrypt AI Safety,44.0,[],enkrypt_ai_safety_240916.csv mistral_7b_instruct_v0_2,Enkrypt AI Safety,46.0,[],enkrypt_ai_safety_240916.csv jamba_1_5_mini,Enkrypt AI Safety,48.0,[],enkrypt_ai_safety_240916.csv aya_23_35b,Enkrypt AI Safety,47.0,[],enkrypt_ai_safety_240916.csv jamba_1_5_large,Enkrypt AI Safety,47.0,[],enkrypt_ai_safety_240916.csv phi_3_small_8k_instruct,Enkrypt AI Safety,48.0,[],enkrypt_ai_safety_240916.csv phi_3_small_128k_instruct,Enkrypt AI Safety,46.0,[],enkrypt_ai_safety_240916.csv zephyr_7b_beta,Enkrypt AI Safety,43.0,[],enkrypt_ai_safety_240916.csv powermoe_3b,Enkrypt AI Safety,47.0,[],enkrypt_ai_safety_240916.csv longwriter_glm4_9b,Enkrypt AI Safety,46.0,[],enkrypt_ai_safety_240916.csv mistral_7b_instruct_v0_1_gguf_4bit,Enkrypt AI Safety,39.0,[],enkrypt_ai_safety_240916.csv snowflake_arctic_instruct,Enkrypt AI Safety,45.0,[],enkrypt_ai_safety_240916.csv qwen2_57b_a14b_instruct,Enkrypt AI Safety,45.0,[],enkrypt_ai_safety_240916.csv palm_2_chat_bison,Enkrypt AI Safety,40.0,[],enkrypt_ai_safety_240916.csv mistral_7b_instruct_v0_1_gguf_8bit,Enkrypt AI Safety,40.0,[],enkrypt_ai_safety_240916.csv glm_4_9b_chat,Enkrypt AI Safety,43.0,[],enkrypt_ai_safety_240916.csv phi_3_medium_4k_instruct,Enkrypt AI Safety,43.0,[],enkrypt_ai_safety_240916.csv aya_23_8b,Enkrypt AI 
Safety,40.0,[],enkrypt_ai_safety_240916.csv mistral_7b_instruct_v0_3,Enkrypt AI Safety,39.0,[],enkrypt_ai_safety_240916.csv phi_3_5_mini_instruct,Enkrypt AI Safety,37.0,[],enkrypt_ai_safety_240916.csv dolphin_2_5_mixtral_8x7b,Enkrypt AI Safety,32.0,[],enkrypt_ai_safety_240916.csv gpt_4o_2024_05_13,WildBench Elo LC,1227.1,[],wildbench_240829.csv claude_3_5_sonnet,WildBench Elo LC,1215.4,[],wildbench_240829.csv gemini_1_5_pro,WildBench Elo LC,1214.6,[],wildbench_240829.csv gpt_4_turbo_2024_04_09,WildBench Elo LC,1209.6,[],wildbench_240829.csv yi_large_preview,WildBench Elo LC,1208.9,[],wildbench_240829.csv deepseek_v2_chat_0628_api,WildBench Elo LC,1199.1,[],wildbench_240829.csv gpt_4_0125_preview,WildBench Elo LC,1197.3,[],wildbench_240829.csv claude_3_opus,WildBench Elo LC,1196.3,[],wildbench_240829.csv gemini_1_5_flash,WildBench Elo LC,1192.0,[],wildbench_240829.csv llama3_70b_instruct,WildBench Elo LC,1187.5,[],wildbench_240829.csv deepseek_v2_coder_0614_api,WildBench Elo LC,1184.9,[],wildbench_240829.csv yi_large,WildBench Elo LC,1181.8,[],wildbench_240829.csv athene_70b,WildBench Elo LC,1180.7,[],wildbench_240829.csv nemotron_4_340b_inst,WildBench Elo LC,1178.6,[],wildbench_240829.csv gemma_2_27b_it,WildBench Elo LC,1176.4,[],wildbench_240829.csv mistral_large_2,WildBench Elo LC,1176.3,[],wildbench_240829.csv claude_3_sonnet,WildBench Elo LC,1174.7,[],wildbench_240829.csv gpt_4o_mini_2024_07_18,WildBench Elo LC,1173.5,[],wildbench_240829.csv qwen2_72b_instruct,WildBench Elo LC,1172.3,[],wildbench_240829.csv reka_core,WildBench Elo LC,1170.4,[],wildbench_240829.csv gemma_2_9b_it_simpo,WildBench Elo LC,1166.6,[],wildbench_240829.csv gemma_2_9b_it_dpo,WildBench Elo LC,1166.6,[],wildbench_240829.csv yi_1_5_34b_chat,WildBench Elo LC,1159.6,[],wildbench_240829.csv claude_3_haiku,WildBench Elo LC,1159.1,[],wildbench_240829.csv mistral_nemo_inst_12b,WildBench Elo LC,1158.6,[],wildbench_240829.csv mistral_large,WildBench Elo LC,1157.0,[],wildbench_240829.csv gemma_2_9b_it,WildBench Elo LC,1156.4,[],wildbench_240829.csv command_r_plus,WildBench Elo LC,1151.4,[],wildbench_240829.csv glm_4_9b_chat,WildBench Elo LC,1148.5,[],wildbench_240829.csv magpie_8b_align_v0_1,WildBench Elo LC,1148.4,[],wildbench_240829.csv yi_1_5_9b_chat,WildBench Elo LC,1148.0,[],wildbench_240829.csv llama3_inst_8b_simpo,WildBench Elo LC,1147.5,[],wildbench_240829.csv llama3_inst_8b_simpo_v0_2,WildBench Elo LC,1147.4,[],wildbench_240829.csv qwen1_5_72b_chat,WildBench Elo LC,1147.4,[],wildbench_240829.csv llama3_inst_8b_simpo_expo,WildBench Elo LC,1145.5,[],wildbench_240829.csv selm_llama3_8b_inst_iter3,WildBench Elo LC,1144.0,[],wildbench_240829.csv phi_3_medium_128k,WildBench Elo LC,1139.5,[],wildbench_240829.csv llama3_8b_instruct,WildBench Elo LC,1139.5,[],wildbench_240829.csv hermes_2_theta_llama3_8b,WildBench Elo LC,1137.4,[],wildbench_240829.csv starling_lm_7b_beta_expo,WildBench Elo LC,1136.0,[],wildbench_240829.csv selm_zephyr_7b_iter3,WildBench Elo LC,1134.3,[],wildbench_240829.csv reka_flash,WildBench Elo LC,1132.7,[],wildbench_240829.csv gemma_2_2b_it,WildBench Elo LC,1129.7,[],wildbench_240829.csv gpt_3_5_turbo_0125,WildBench Elo LC,1129.2,[],wildbench_240829.csv dbrx_instruct,WildBench Elo LC,1128.5,[],wildbench_240829.csv neo_7b_instruct_expo,WildBench Elo LC,1126.6,[],wildbench_240829.csv neo_7b_instruct,WildBench Elo LC,1126.2,[],wildbench_240829.csv starlinglm_7b_beta,WildBench Elo LC,1126.2,[],wildbench_240829.csv command_r,WildBench Elo LC,1125.6,[],wildbench_240829.csv mixtral_8x7b_instruct,WildBench Elo 
LC,1124.7,[],wildbench_240829.csv yi_1_5_6b_chat,WildBench Elo LC,1122.7,[],wildbench_240829.csv tulu_2_dpo_70b,WildBench Elo LC,1121.0,[],wildbench_240829.csv reka_edge,WildBench Elo LC,1120.8,[],wildbench_240829.csv mistral_7b_instruct_v0_2,WildBench Elo LC,1105.0,[],wildbench_240829.csv llama_2_70b_chat,WildBench Elo LC,1101.9,[],wildbench_240829.csv qwen1_5_7b_chat,WildBench Elo LC,1092.7,[],wildbench_240829.csv hermes_2_mixtral_8x7b_dpo,WildBench Elo LC,1085.8,[],wildbench_240829.csv phi_3_mini_128k,WildBench Elo LC,1082.1,[],wildbench_240829.csv gemma_7b_it,WildBench Elo LC,1079.2,[],wildbench_240829.csv llama_2_7b_chat,WildBench Elo LC,1052.5,[],wildbench_240829.csv gpt_4o_2024_05_13,WildBench Information Seeking,58.6,[],wildbench_240829.csv claude_3_5_sonnet,WildBench Information Seeking,55.5,[],wildbench_240829.csv gemini_1_5_pro,WildBench Information Seeking,52.2,[],wildbench_240829.csv gpt_4_turbo_2024_04_09,WildBench Information Seeking,57.2,[],wildbench_240829.csv yi_large_preview,WildBench Information Seeking,57.7,[],wildbench_240829.csv deepseek_v2_chat_0628_api,WildBench Information Seeking,52.7,[],wildbench_240829.csv gpt_4_0125_preview,WildBench Information Seeking,54.4,[],wildbench_240829.csv claude_3_opus,WildBench Information Seeking,53.5,[],wildbench_240829.csv gemini_1_5_flash,WildBench Information Seeking,48.7,[],wildbench_240829.csv llama3_70b_instruct,WildBench Information Seeking,52.3,[],wildbench_240829.csv deepseek_v2_coder_0614_api,WildBench Information Seeking,40.0,[],wildbench_240829.csv yi_large,WildBench Information Seeking,51.0,[],wildbench_240829.csv athene_70b,WildBench Information Seeking,60.8,[],wildbench_240829.csv nemotron_4_340b_inst,WildBench Information Seeking,53.0,[],wildbench_240829.csv gemma_2_27b_it,WildBench Information Seeking,50.5,[],wildbench_240829.csv mistral_large_2,WildBench Information Seeking,57.4,[],wildbench_240829.csv claude_3_sonnet,WildBench Information Seeking,47.1,[],wildbench_240829.csv gpt_4o_mini_2024_07_18,WildBench Information Seeking,57.4,[],wildbench_240829.csv qwen2_72b_instruct,WildBench Information Seeking,49.5,[],wildbench_240829.csv reka_core,WildBench Information Seeking,52.3,[],wildbench_240829.csv gemma_2_9b_it_simpo,WildBench Information Seeking,56.5,[],wildbench_240829.csv gemma_2_9b_it_dpo,WildBench Information Seeking,58.2,[],wildbench_240829.csv yi_1_5_34b_chat,WildBench Information Seeking,50.3,[],wildbench_240829.csv claude_3_haiku,WildBench Information Seeking,45.3,[],wildbench_240829.csv mistral_nemo_inst_12b,WildBench Information Seeking,51.9,[],wildbench_240829.csv mistral_large,WildBench Information Seeking,46.1,[],wildbench_240829.csv gemma_2_9b_it,WildBench Information Seeking,49.0,[],wildbench_240829.csv command_r_plus,WildBench Information Seeking,49.2,[],wildbench_240829.csv glm_4_9b_chat,WildBench Information Seeking,46.3,[],wildbench_240829.csv magpie_8b_align_v0_1,WildBench Information Seeking,48.9,[],wildbench_240829.csv yi_1_5_9b_chat,WildBench Information Seeking,42.6,[],wildbench_240829.csv llama3_inst_8b_simpo,WildBench Information Seeking,47.9,[],wildbench_240829.csv llama3_inst_8b_simpo_v0_2,WildBench Information Seeking,47.9,[],wildbench_240829.csv qwen1_5_72b_chat,WildBench Information Seeking,48.2,[],wildbench_240829.csv llama3_inst_8b_simpo_expo,WildBench Information Seeking,47.3,[],wildbench_240829.csv selm_llama3_8b_inst_iter3,WildBench Information Seeking,46.1,[],wildbench_240829.csv phi_3_medium_128k,WildBench Information Seeking,35.7,[],wildbench_240829.csv 
llama3_8b_instruct,WildBench Information Seeking,39.3,[],wildbench_240829.csv hermes_2_theta_llama3_8b,WildBench Information Seeking,41.6,[],wildbench_240829.csv starling_lm_7b_beta_expo,WildBench Information Seeking,42.9,[],wildbench_240829.csv selm_zephyr_7b_iter3,WildBench Information Seeking,41.0,[],wildbench_240829.csv reka_flash,WildBench Information Seeking,41.5,[],wildbench_240829.csv gemma_2_2b_it,WildBench Information Seeking,39.9,[],wildbench_240829.csv gpt_3_5_turbo_0125,WildBench Information Seeking,36.5,[],wildbench_240829.csv dbrx_instruct,WildBench Information Seeking,41.1,[],wildbench_240829.csv neo_7b_instruct_expo,WildBench Information Seeking,34.9,[],wildbench_240829.csv neo_7b_instruct,WildBench Information Seeking,36.3,[],wildbench_240829.csv starlinglm_7b_beta,WildBench Information Seeking,41.9,[],wildbench_240829.csv command_r,WildBench Information Seeking,44.1,[],wildbench_240829.csv mixtral_8x7b_instruct,WildBench Information Seeking,41.9,[],wildbench_240829.csv yi_1_5_6b_chat,WildBench Information Seeking,31.4,[],wildbench_240829.csv tulu_2_dpo_70b,WildBench Information Seeking,40.7,[],wildbench_240829.csv reka_edge,WildBench Information Seeking,34.4,[],wildbench_240829.csv mistral_7b_instruct_v0_2,WildBench Information Seeking,40.1,[],wildbench_240829.csv llama_2_70b_chat,WildBench Information Seeking,38.3,[],wildbench_240829.csv qwen1_5_7b_chat,WildBench Information Seeking,34.0,[],wildbench_240829.csv hermes_2_mixtral_8x7b_dpo,WildBench Information Seeking,39.8,[],wildbench_240829.csv phi_3_mini_128k,WildBench Information Seeking,28.6,[],wildbench_240829.csv gemma_7b_it,WildBench Information Seeking,12.7,[],wildbench_240829.csv llama_2_7b_chat,WildBench Information Seeking,27.7,[],wildbench_240829.csv gpt_4o_2024_05_13,WildBench Creative,59.1,[],wildbench_240829.csv claude_3_5_sonnet,WildBench Creative,55.6,[],wildbench_240829.csv gemini_1_5_pro,WildBench Creative,55.1,[],wildbench_240829.csv gpt_4_turbo_2024_04_09,WildBench Creative,58.7,[],wildbench_240829.csv yi_large_preview,WildBench Creative,57.6,[],wildbench_240829.csv deepseek_v2_chat_0628_api,WildBench Creative,56.4,[],wildbench_240829.csv gpt_4_0125_preview,WildBench Creative,57.6,[],wildbench_240829.csv claude_3_opus,WildBench Creative,53.0,[],wildbench_240829.csv gemini_1_5_flash,WildBench Creative,51.7,[],wildbench_240829.csv llama3_70b_instruct,WildBench Creative,54.3,[],wildbench_240829.csv deepseek_v2_coder_0614_api,WildBench Creative,40.8,[],wildbench_240829.csv yi_large,WildBench Creative,51.8,[],wildbench_240829.csv athene_70b,WildBench Creative,60.4,[],wildbench_240829.csv nemotron_4_340b_inst,WildBench Creative,53.3,[],wildbench_240829.csv gemma_2_27b_it,WildBench Creative,53.6,[],wildbench_240829.csv mistral_large_2,WildBench Creative,58.9,[],wildbench_240829.csv claude_3_sonnet,WildBench Creative,46.3,[],wildbench_240829.csv gpt_4o_mini_2024_07_18,WildBench Creative,60.1,[],wildbench_240829.csv qwen2_72b_instruct,WildBench Creative,49.9,[],wildbench_240829.csv reka_core,WildBench Creative,55.5,[],wildbench_240829.csv gemma_2_9b_it_simpo,WildBench Creative,58.0,[],wildbench_240829.csv gemma_2_9b_it_dpo,WildBench Creative,59.1,[],wildbench_240829.csv yi_1_5_34b_chat,WildBench Creative,53.5,[],wildbench_240829.csv claude_3_haiku,WildBench Creative,42.9,[],wildbench_240829.csv mistral_nemo_inst_12b,WildBench Creative,54.6,[],wildbench_240829.csv mistral_large,WildBench Creative,49.7,[],wildbench_240829.csv gemma_2_9b_it,WildBench Creative,51.0,[],wildbench_240829.csv command_r_plus,WildBench 
Creative,52.6,[],wildbench_240829.csv glm_4_9b_chat,WildBench Creative,47.8,[],wildbench_240829.csv magpie_8b_align_v0_1,WildBench Creative,49.2,[],wildbench_240829.csv yi_1_5_9b_chat,WildBench Creative,45.6,[],wildbench_240829.csv llama3_inst_8b_simpo,WildBench Creative,50.6,[],wildbench_240829.csv llama3_inst_8b_simpo_v0_2,WildBench Creative,51.8,[],wildbench_240829.csv qwen1_5_72b_chat,WildBench Creative,50.4,[],wildbench_240829.csv llama3_inst_8b_simpo_expo,WildBench Creative,49.1,[],wildbench_240829.csv selm_llama3_8b_inst_iter3,WildBench Creative,51.1,[],wildbench_240829.csv phi_3_medium_128k,WildBench Creative,33.2,[],wildbench_240829.csv llama3_8b_instruct,WildBench Creative,43.6,[],wildbench_240829.csv hermes_2_theta_llama3_8b,WildBench Creative,39.8,[],wildbench_240829.csv starling_lm_7b_beta_expo,WildBench Creative,44.3,[],wildbench_240829.csv selm_zephyr_7b_iter3,WildBench Creative,44.7,[],wildbench_240829.csv reka_flash,WildBench Creative,42.4,[],wildbench_240829.csv gemma_2_2b_it,WildBench Creative,43.6,[],wildbench_240829.csv gpt_3_5_turbo_0125,WildBench Creative,37.4,[],wildbench_240829.csv dbrx_instruct,WildBench Creative,42.3,[],wildbench_240829.csv neo_7b_instruct_expo,WildBench Creative,38.5,[],wildbench_240829.csv neo_7b_instruct,WildBench Creative,39.5,[],wildbench_240829.csv starlinglm_7b_beta,WildBench Creative,43.8,[],wildbench_240829.csv command_r,WildBench Creative,47.4,[],wildbench_240829.csv mixtral_8x7b_instruct,WildBench Creative,42.8,[],wildbench_240829.csv yi_1_5_6b_chat,WildBench Creative,31.1,[],wildbench_240829.csv tulu_2_dpo_70b,WildBench Creative,42.7,[],wildbench_240829.csv reka_edge,WildBench Creative,36.2,[],wildbench_240829.csv mistral_7b_instruct_v0_2,WildBench Creative,42.1,[],wildbench_240829.csv llama_2_70b_chat,WildBench Creative,40.0,[],wildbench_240829.csv qwen1_5_7b_chat,WildBench Creative,38.3,[],wildbench_240829.csv hermes_2_mixtral_8x7b_dpo,WildBench Creative,37.9,[],wildbench_240829.csv phi_3_mini_128k,WildBench Creative,30.6,[],wildbench_240829.csv gemma_7b_it,WildBench Creative,21.2,[],wildbench_240829.csv llama_2_7b_chat,WildBench Creative,29.8,[],wildbench_240829.csv gpt_4o_2024_05_13,WildBench Code Debugging,60.5,[],wildbench_240829.csv claude_3_5_sonnet,WildBench Code Debugging,56.5,[],wildbench_240829.csv gemini_1_5_pro,WildBench Code Debugging,55.2,[],wildbench_240829.csv gpt_4_turbo_2024_04_09,WildBench Code Debugging,55.1,[],wildbench_240829.csv yi_large_preview,WildBench Code Debugging,54.3,[],wildbench_240829.csv deepseek_v2_chat_0628_api,WildBench Code Debugging,55.0,[],wildbench_240829.csv gpt_4_0125_preview,WildBench Code Debugging,52.9,[],wildbench_240829.csv claude_3_opus,WildBench Code Debugging,53.3,[],wildbench_240829.csv gemini_1_5_flash,WildBench Code Debugging,48.7,[],wildbench_240829.csv llama3_70b_instruct,WildBench Code Debugging,44.7,[],wildbench_240829.csv deepseek_v2_coder_0614_api,WildBench Code Debugging,48.9,[],wildbench_240829.csv yi_large,WildBench Code Debugging,47.7,[],wildbench_240829.csv athene_70b,WildBench Code Debugging,59.0,[],wildbench_240829.csv nemotron_4_340b_inst,WildBench Code Debugging,46.3,[],wildbench_240829.csv gemma_2_27b_it,WildBench Code Debugging,47.0,[],wildbench_240829.csv mistral_large_2,WildBench Code Debugging,53.8,[],wildbench_240829.csv claude_3_sonnet,WildBench Code Debugging,46.1,[],wildbench_240829.csv gpt_4o_mini_2024_07_18,WildBench Code Debugging,57.2,[],wildbench_240829.csv qwen2_72b_instruct,WildBench Code Debugging,39.8,[],wildbench_240829.csv reka_core,WildBench 
Code Debugging,40.6,[],wildbench_240829.csv gemma_2_9b_it_simpo,WildBench Code Debugging,50.9,[],wildbench_240829.csv gemma_2_9b_it_dpo,WildBench Code Debugging,50.5,[],wildbench_240829.csv yi_1_5_34b_chat,WildBench Code Debugging,42.1,[],wildbench_240829.csv claude_3_haiku,WildBench Code Debugging,37.0,[],wildbench_240829.csv mistral_nemo_inst_12b,WildBench Code Debugging,39.7,[],wildbench_240829.csv mistral_large,WildBench Code Debugging,33.7,[],wildbench_240829.csv gemma_2_9b_it,WildBench Code Debugging,36.7,[],wildbench_240829.csv command_r_plus,WildBench Code Debugging,28.4,[],wildbench_240829.csv glm_4_9b_chat,WildBench Code Debugging,35.4,[],wildbench_240829.csv magpie_8b_align_v0_1,WildBench Code Debugging,33.7,[],wildbench_240829.csv yi_1_5_9b_chat,WildBench Code Debugging,35.0,[],wildbench_240829.csv llama3_inst_8b_simpo,WildBench Code Debugging,31.8,[],wildbench_240829.csv llama3_inst_8b_simpo_v0_2,WildBench Code Debugging,31.5,[],wildbench_240829.csv qwen1_5_72b_chat,WildBench Code Debugging,35.4,[],wildbench_240829.csv llama3_inst_8b_simpo_expo,WildBench Code Debugging,28.6,[],wildbench_240829.csv selm_llama3_8b_inst_iter3,WildBench Code Debugging,27.3,[],wildbench_240829.csv phi_3_medium_128k,WildBench Code Debugging,18.2,[],wildbench_240829.csv llama3_8b_instruct,WildBench Code Debugging,22.0,[],wildbench_240829.csv hermes_2_theta_llama3_8b,WildBench Code Debugging,23.1,[],wildbench_240829.csv starling_lm_7b_beta_expo,WildBench Code Debugging,25.3,[],wildbench_240829.csv selm_zephyr_7b_iter3,WildBench Code Debugging,11.0,[],wildbench_240829.csv reka_flash,WildBench Code Debugging,22.1,[],wildbench_240829.csv gemma_2_2b_it,WildBench Code Debugging,17.9,[],wildbench_240829.csv gpt_3_5_turbo_0125,WildBench Code Debugging,26.5,[],wildbench_240829.csv dbrx_instruct,WildBench Code Debugging,26.4,[],wildbench_240829.csv neo_7b_instruct_expo,WildBench Code Debugging,12.8,[],wildbench_240829.csv neo_7b_instruct,WildBench Code Debugging,14.0,[],wildbench_240829.csv starlinglm_7b_beta,WildBench Code Debugging,24.4,[],wildbench_240829.csv command_r,WildBench Code Debugging,19.3,[],wildbench_240829.csv mixtral_8x7b_instruct,WildBench Code Debugging,25.0,[],wildbench_240829.csv yi_1_5_6b_chat,WildBench Code Debugging,16.6,[],wildbench_240829.csv tulu_2_dpo_70b,WildBench Code Debugging,20.7,[],wildbench_240829.csv reka_edge,WildBench Code Debugging,13.5,[],wildbench_240829.csv mistral_7b_instruct_v0_2,WildBench Code Debugging,18.4,[],wildbench_240829.csv llama_2_70b_chat,WildBench Code Debugging,9.3,[],wildbench_240829.csv qwen1_5_7b_chat,WildBench Code Debugging,14.9,[],wildbench_240829.csv hermes_2_mixtral_8x7b_dpo,WildBench Code Debugging,26.0,[],wildbench_240829.csv phi_3_mini_128k,WildBench Code Debugging,21.6,[],wildbench_240829.csv gemma_7b_it,WildBench Code Debugging,1.8,[],wildbench_240829.csv llama_2_7b_chat,WildBench Code Debugging,-6.8,[],wildbench_240829.csv gpt_4o_2024_05_13,WildBench Math & Data,57.3,[],wildbench_240829.csv claude_3_5_sonnet,WildBench Math & Data,50.2,[],wildbench_240829.csv gemini_1_5_pro,WildBench Math & Data,48.6,[],wildbench_240829.csv gpt_4_turbo_2024_04_09,WildBench Math & Data,51.0,[],wildbench_240829.csv yi_large_preview,WildBench Math & Data,51.9,[],wildbench_240829.csv deepseek_v2_chat_0628_api,WildBench Math & Data,51.4,[],wildbench_240829.csv gpt_4_0125_preview,WildBench Math & Data,45.8,[],wildbench_240829.csv claude_3_opus,WildBench Math & Data,46.7,[],wildbench_240829.csv gemini_1_5_flash,WildBench Math & Data,45.3,[],wildbench_240829.csv 
llama3_70b_instruct,WildBench Math & Data,42.1,[],wildbench_240829.csv deepseek_v2_coder_0614_api,WildBench Math & Data,46.4,[],wildbench_240829.csv yi_large,WildBench Math & Data,44.5,[],wildbench_240829.csv athene_70b,WildBench Math & Data,57.1,[],wildbench_240829.csv nemotron_4_340b_inst,WildBench Math & Data,40.8,[],wildbench_240829.csv gemma_2_27b_it,WildBench Math & Data,43.9,[],wildbench_240829.csv mistral_large_2,WildBench Math & Data,52.7,[],wildbench_240829.csv claude_3_sonnet,WildBench Math & Data,40.6,[],wildbench_240829.csv gpt_4o_mini_2024_07_18,WildBench Math & Data,54.0,[],wildbench_240829.csv qwen2_72b_instruct,WildBench Math & Data,41.0,[],wildbench_240829.csv reka_core,WildBench Math & Data,40.3,[],wildbench_240829.csv gemma_2_9b_it_simpo,WildBench Math & Data,48.6,[],wildbench_240829.csv gemma_2_9b_it_dpo,WildBench Math & Data,47.1,[],wildbench_240829.csv yi_1_5_34b_chat,WildBench Math & Data,39.4,[],wildbench_240829.csv claude_3_haiku,WildBench Math & Data,31.4,[],wildbench_240829.csv mistral_nemo_inst_12b,WildBench Math & Data,35.6,[],wildbench_240829.csv mistral_large,WildBench Math & Data,30.9,[],wildbench_240829.csv gemma_2_9b_it,WildBench Math & Data,36.4,[],wildbench_240829.csv command_r_plus,WildBench Math & Data,23.5,[],wildbench_240829.csv glm_4_9b_chat,WildBench Math & Data,29.8,[],wildbench_240829.csv magpie_8b_align_v0_1,WildBench Math & Data,29.8,[],wildbench_240829.csv yi_1_5_9b_chat,WildBench Math & Data,32.2,[],wildbench_240829.csv llama3_inst_8b_simpo,WildBench Math & Data,24.0,[],wildbench_240829.csv llama3_inst_8b_simpo_v0_2,WildBench Math & Data,24.4,[],wildbench_240829.csv qwen1_5_72b_chat,WildBench Math & Data,29.8,[],wildbench_240829.csv llama3_inst_8b_simpo_expo,WildBench Math & Data,21.2,[],wildbench_240829.csv selm_llama3_8b_inst_iter3,WildBench Math & Data,23.5,[],wildbench_240829.csv phi_3_medium_128k,WildBench Math & Data,23.0,[],wildbench_240829.csv llama3_8b_instruct,WildBench Math & Data,17.0,[],wildbench_240829.csv hermes_2_theta_llama3_8b,WildBench Math & Data,18.7,[],wildbench_240829.csv starling_lm_7b_beta_expo,WildBench Math & Data,18.6,[],wildbench_240829.csv selm_zephyr_7b_iter3,WildBench Math & Data,12.7,[],wildbench_240829.csv reka_flash,WildBench Math & Data,20.5,[],wildbench_240829.csv gemma_2_2b_it,WildBench Math & Data,15.8,[],wildbench_240829.csv gpt_3_5_turbo_0125,WildBench Math & Data,21.6,[],wildbench_240829.csv dbrx_instruct,WildBench Math & Data,24.5,[],wildbench_240829.csv neo_7b_instruct_expo,WildBench Math & Data,12.6,[],wildbench_240829.csv neo_7b_instruct,WildBench Math & Data,15.0,[],wildbench_240829.csv starlinglm_7b_beta,WildBench Math & Data,17.0,[],wildbench_240829.csv command_r,WildBench Math & Data,16.0,[],wildbench_240829.csv mixtral_8x7b_instruct,WildBench Math & Data,22.1,[],wildbench_240829.csv yi_1_5_6b_chat,WildBench Math & Data,16.8,[],wildbench_240829.csv tulu_2_dpo_70b,WildBench Math & Data,14.8,[],wildbench_240829.csv reka_edge,WildBench Math & Data,8.9,[],wildbench_240829.csv mistral_7b_instruct_v0_2,WildBench Math & Data,10.1,[],wildbench_240829.csv llama_2_70b_chat,WildBench Math & Data,4.2,[],wildbench_240829.csv qwen1_5_7b_chat,WildBench Math & Data,11.9,[],wildbench_240829.csv hermes_2_mixtral_8x7b_dpo,WildBench Math & Data,21.8,[],wildbench_240829.csv phi_3_mini_128k,WildBench Math & Data,18.6,[],wildbench_240829.csv gemma_7b_it,WildBench Math & Data,-3.7,[],wildbench_240829.csv llama_2_7b_chat,WildBench Math & Data,-7.2,[],wildbench_240829.csv gpt_4o_2024_05_13,WildBench Reasoning & 
Planning,60.2,[],wildbench_240829.csv claude_3_5_sonnet,WildBench Reasoning & Planning,55.6,[],wildbench_240829.csv gemini_1_5_pro,WildBench Reasoning & Planning,53.7,[],wildbench_240829.csv gpt_4_turbo_2024_04_09,WildBench Reasoning & Planning,56.2,[],wildbench_240829.csv yi_large_preview,WildBench Reasoning & Planning,56.6,[],wildbench_240829.csv deepseek_v2_chat_0628_api,WildBench Reasoning & Planning,54.8,[],wildbench_240829.csv gpt_4_0125_preview,WildBench Reasoning & Planning,53.5,[],wildbench_240829.csv claude_3_opus,WildBench Reasoning & Planning,52.5,[],wildbench_240829.csv gemini_1_5_flash,WildBench Reasoning & Planning,50.8,[],wildbench_240829.csv llama3_70b_instruct,WildBench Reasoning & Planning,50.1,[],wildbench_240829.csv deepseek_v2_coder_0614_api,WildBench Reasoning & Planning,47.2,[],wildbench_240829.csv yi_large,WildBench Reasoning & Planning,51.3,[],wildbench_240829.csv athene_70b,WildBench Reasoning & Planning,61.0,[],wildbench_240829.csv nemotron_4_340b_inst,WildBench Reasoning & Planning,49.1,[],wildbench_240829.csv gemma_2_27b_it,WildBench Reasoning & Planning,50.6,[],wildbench_240829.csv mistral_large_2,WildBench Reasoning & Planning,57.2,[],wildbench_240829.csv claude_3_sonnet,WildBench Reasoning & Planning,47.4,[],wildbench_240829.csv gpt_4o_mini_2024_07_18,WildBench Reasoning & Planning,58.2,[],wildbench_240829.csv qwen2_72b_instruct,WildBench Reasoning & Planning,46.8,[],wildbench_240829.csv reka_core,WildBench Reasoning & Planning,48.0,[],wildbench_240829.csv gemma_2_9b_it_simpo,WildBench Reasoning & Planning,55.6,[],wildbench_240829.csv gemma_2_9b_it_dpo,WildBench Reasoning & Planning,55.5,[],wildbench_240829.csv yi_1_5_34b_chat,WildBench Reasoning & Planning,48.1,[],wildbench_240829.csv claude_3_haiku,WildBench Reasoning & Planning,41.3,[],wildbench_240829.csv mistral_nemo_inst_12b,WildBench Reasoning & Planning,47.4,[],wildbench_240829.csv mistral_large,WildBench Reasoning & Planning,41.8,[],wildbench_240829.csv gemma_2_9b_it,WildBench Reasoning & Planning,46.7,[],wildbench_240829.csv command_r_plus,WildBench Reasoning & Planning,41.9,[],wildbench_240829.csv glm_4_9b_chat,WildBench Reasoning & Planning,42.5,[],wildbench_240829.csv magpie_8b_align_v0_1,WildBench Reasoning & Planning,42.7,[],wildbench_240829.csv yi_1_5_9b_chat,WildBench Reasoning & Planning,42.4,[],wildbench_240829.csv llama3_inst_8b_simpo,WildBench Reasoning & Planning,40.9,[],wildbench_240829.csv llama3_inst_8b_simpo_v0_2,WildBench Reasoning & Planning,40.7,[],wildbench_240829.csv qwen1_5_72b_chat,WildBench Reasoning & Planning,43.5,[],wildbench_240829.csv llama3_inst_8b_simpo_expo,WildBench Reasoning & Planning,39.5,[],wildbench_240829.csv selm_llama3_8b_inst_iter3,WildBench Reasoning & Planning,39.8,[],wildbench_240829.csv phi_3_medium_128k,WildBench Reasoning & Planning,32.3,[],wildbench_240829.csv llama3_8b_instruct,WildBench Reasoning & Planning,34.4,[],wildbench_240829.csv hermes_2_theta_llama3_8b,WildBench Reasoning & Planning,33.7,[],wildbench_240829.csv starling_lm_7b_beta_expo,WildBench Reasoning & Planning,36.3,[],wildbench_240829.csv selm_zephyr_7b_iter3,WildBench Reasoning & Planning,31.6,[],wildbench_240829.csv reka_flash,WildBench Reasoning & Planning,35.0,[],wildbench_240829.csv gemma_2_2b_it,WildBench Reasoning & Planning,33.8,[],wildbench_240829.csv gpt_3_5_turbo_0125,WildBench Reasoning & Planning,33.4,[],wildbench_240829.csv dbrx_instruct,WildBench Reasoning & Planning,36.2,[],wildbench_240829.csv neo_7b_instruct_expo,WildBench Reasoning & 
Planning,28.7,[],wildbench_240829.csv neo_7b_instruct,WildBench Reasoning & Planning,31.4,[],wildbench_240829.csv starlinglm_7b_beta,WildBench Reasoning & Planning,34.1,[],wildbench_240829.csv command_r,WildBench Reasoning & Planning,34.6,[],wildbench_240829.csv mixtral_8x7b_instruct,WildBench Reasoning & Planning,34.6,[],wildbench_240829.csv yi_1_5_6b_chat,WildBench Reasoning & Planning,27.3,[],wildbench_240829.csv tulu_2_dpo_70b,WildBench Reasoning & Planning,32.3,[],wildbench_240829.csv reka_edge,WildBench Reasoning & Planning,25.0,[],wildbench_240829.csv mistral_7b_instruct_v0_2,WildBench Reasoning & Planning,30.1,[],wildbench_240829.csv llama_2_70b_chat,WildBench Reasoning & Planning,26.8,[],wildbench_240829.csv qwen1_5_7b_chat,WildBench Reasoning & Planning,28.9,[],wildbench_240829.csv hermes_2_mixtral_8x7b_dpo,WildBench Reasoning & Planning,34.2,[],wildbench_240829.csv phi_3_mini_128k,WildBench Reasoning & Planning,28.1,[],wildbench_240829.csv gemma_7b_it,WildBench Reasoning & Planning,10.2,[],wildbench_240829.csv llama_2_7b_chat,WildBench Reasoning & Planning,15.4,[],wildbench_240829.csv gpt_4o_2024_05_13,WildBench Score,59.3,[],wildbench_240829.csv claude_3_5_sonnet,WildBench Score,54.7,[],wildbench_240829.csv gemini_1_5_pro,WildBench Score,53.0,[],wildbench_240829.csv gpt_4_turbo_2024_04_09,WildBench Score,55.2,[],wildbench_240829.csv yi_large_preview,WildBench Score,55.3,[],wildbench_240829.csv deepseek_v2_chat_0628_api,WildBench Score,54.0,[],wildbench_240829.csv gpt_4_0125_preview,WildBench Score,52.3,[],wildbench_240829.csv claude_3_opus,WildBench Score,51.7,[],wildbench_240829.csv gemini_1_5_flash,WildBench Score,48.9,[],wildbench_240829.csv llama3_70b_instruct,WildBench Score,47.8,[],wildbench_240829.csv deepseek_v2_coder_0614_api,WildBench Score,45.7,[],wildbench_240829.csv yi_large,WildBench Score,48.9,[],wildbench_240829.csv athene_70b,WildBench Score,59.5,[],wildbench_240829.csv nemotron_4_340b_inst,WildBench Score,47.7,[],wildbench_240829.csv gemma_2_27b_it,WildBench Score,48.5,[],wildbench_240829.csv mistral_large_2,WildBench Score,55.6,[],wildbench_240829.csv claude_3_sonnet,WildBench Score,45.5,[],wildbench_240829.csv gpt_4o_mini_2024_07_18,WildBench Score,57.1,[],wildbench_240829.csv qwen2_72b_instruct,WildBench Score,44.5,[],wildbench_240829.csv reka_core,WildBench Score,45.9,[],wildbench_240829.csv gemma_2_9b_it_simpo,WildBench Score,53.3,[],wildbench_240829.csv gemma_2_9b_it_dpo,WildBench Score,53.2,[],wildbench_240829.csv yi_1_5_34b_chat,WildBench Score,45.6,[],wildbench_240829.csv claude_3_haiku,WildBench Score,38.9,[],wildbench_240829.csv mistral_nemo_inst_12b,WildBench Score,44.4,[],wildbench_240829.csv mistral_large,WildBench Score,38.9,[],wildbench_240829.csv gemma_2_9b_it,WildBench Score,42.7,[],wildbench_240829.csv command_r_plus,WildBench Score,36.8,[],wildbench_240829.csv glm_4_9b_chat,WildBench Score,39.1,[],wildbench_240829.csv magpie_8b_align_v0_1,WildBench Score,39.3,[],wildbench_240829.csv yi_1_5_9b_chat,WildBench Score,38.7,[],wildbench_240829.csv llama3_inst_8b_simpo,WildBench Score,37.0,[],wildbench_240829.csv llama3_inst_8b_simpo_v0_2,WildBench Score,37.2,[],wildbench_240829.csv qwen1_5_72b_chat,WildBench Score,39.9,[],wildbench_240829.csv llama3_inst_8b_simpo_expo,WildBench Score,35.0,[],wildbench_240829.csv selm_llama3_8b_inst_iter3,WildBench Score,35.3,[],wildbench_240829.csv phi_3_medium_128k,WildBench Score,27.3,[],wildbench_240829.csv llama3_8b_instruct,WildBench Score,29.2,[],wildbench_240829.csv hermes_2_theta_llama3_8b,WildBench 
Score,29.6,[],wildbench_240829.csv starling_lm_7b_beta_expo,WildBench Score,31.6,[],wildbench_240829.csv selm_zephyr_7b_iter3,WildBench Score,25.1,[],wildbench_240829.csv reka_flash,WildBench Score,30.4,[],wildbench_240829.csv gemma_2_2b_it,WildBench Score,27.8,[],wildbench_240829.csv gpt_3_5_turbo_0125,WildBench Score,30.0,[],wildbench_240829.csv dbrx_instruct,WildBench Score,32.6,[],wildbench_240829.csv neo_7b_instruct_expo,WildBench Score,23.1,[],wildbench_240829.csv neo_7b_instruct,WildBench Score,25.0,[],wildbench_240829.csv starlinglm_7b_beta,WildBench Score,30.2,[],wildbench_240829.csv command_r,WildBench Score,29.5,[],wildbench_240829.csv mixtral_8x7b_instruct,WildBench Score,31.5,[],wildbench_240829.csv yi_1_5_6b_chat,WildBench Score,23.3,[],wildbench_240829.csv tulu_2_dpo_70b,WildBench Score,28.0,[],wildbench_240829.csv reka_edge,WildBench Score,21.3,[],wildbench_240829.csv mistral_7b_instruct_v0_2,WildBench Score,25.6,[],wildbench_240829.csv llama_2_70b_chat,WildBench Score,20.7,[],wildbench_240829.csv qwen1_5_7b_chat,WildBench Score,23.4,[],wildbench_240829.csv hermes_2_mixtral_8x7b_dpo,WildBench Score,30.7,[],wildbench_240829.csv phi_3_mini_128k,WildBench Score,24.7,[],wildbench_240829.csv gemma_7b_it,WildBench Score,6.6,[],wildbench_240829.csv llama_2_7b_chat,WildBench Score,8.3,[],wildbench_240829.csv o1_mini,Decentralized Arena (0-1 Normalized),1.0,[],dec_arena_241022.csv o1_preview,Decentralized Arena (0-1 Normalized),0.988296,[],dec_arena_241022.csv chatgpt_4o_latest_2024_09_03,Decentralized Arena (0-1 Normalized),0.971391,[],dec_arena_241022.csv yi_lightning,Decentralized Arena (0-1 Normalized),0.955415,[],dec_arena_241022.csv glm_4_plus,Decentralized Arena (0-1 Normalized),0.910273,[],dec_arena_241022.csv claude_3_5_sonnet,Decentralized Arena (0-1 Normalized),0.897083,[],dec_arena_241022.csv gpt_4o_2024_05_13,Decentralized Arena (0-1 Normalized),0.894297,[],dec_arena_241022.csv gpt_4o_2024_08_06,Decentralized Arena (0-1 Normalized),0.889095,[],dec_arena_241022.csv nemotron_70b,Decentralized Arena (0-1 Normalized),0.881107,[],dec_arena_241022.csv gpt_4o_mini_2024_07_18,Decentralized Arena (0-1 Normalized),0.873119,[],dec_arena_241022.csv gpt_4_turbo_2024_04_09,Decentralized Arena (0-1 Normalized),0.865131,[],dec_arena_241022.csv gemini_1_5_pro_001,Decentralized Arena (0-1 Normalized),0.854542,[],dec_arena_241022.csv qwen2_72b_instruct,Decentralized Arena (0-1 Normalized),0.814787,[],dec_arena_241022.csv claude_3_opus,Decentralized Arena (0-1 Normalized),0.804198,[],dec_arena_241022.csv gpt4_1106,Decentralized Arena (0-1 Normalized),0.761657,[],dec_arena_241022.csv gemini_1_5_flash_001,Decentralized Arena (0-1 Normalized),0.761657,[],dec_arena_241022.csv llama3_1_70b_instruct,Decentralized Arena (0-1 Normalized),0.759056,[],dec_arena_241022.csv gemma_2_9b_it_simpo,Decentralized Arena (0-1 Normalized),0.73695,[],dec_arena_241022.csv gemma_2_27b_it,Decentralized Arena (0-1 Normalized),0.716515,[],dec_arena_241022.csv google_gemma_2_9b_it,Decentralized Arena (0-1 Normalized),0.687349,[],dec_arena_241022.csv yi_1_5_34b_chat,Decentralized Arena (0-1 Normalized),0.671373,[],dec_arena_241022.csv llama3_70b_instruct,Decentralized Arena (0-1 Normalized),0.658183,[],dec_arena_241022.csv claude_3_haiku,Decentralized Arena (0-1 Normalized),0.591863,[],dec_arena_241022.csv qwen1_5_72b_chat,Decentralized Arena (0-1 Normalized),0.583875,[],dec_arena_241022.csv llama3_1_8b_instruct,Decentralized Arena (0-1 Normalized),0.533346,[],dec_arena_241022.csv qwen1_5_32b_chat,Decentralized Arena 
(0-1 Normalized),0.533346,[],dec_arena_241022.csv claude_2_1,Decentralized Arena (0-1 Normalized),0.509567,[],dec_arena_241022.csv claude_2_0,Decentralized Arena (0-1 Normalized),0.501579,[],dec_arena_241022.csv starling_lm_7b_beta,Decentralized Arena (0-1 Normalized),0.464425,[],dec_arena_241022.csv qwen1_5_14b_chat,Decentralized Arena (0-1 Normalized),0.43786,[],dec_arena_241022.csv mistral_8x7b_instruct_v0_1,Decentralized Arena (0-1 Normalized),0.43786,[],dec_arena_241022.csv llama3_8b_instruct,Decentralized Arena (0-1 Normalized),0.421884,[],dec_arena_241022.csv gemma_2_2b_it,Decentralized Arena (0-1 Normalized),0.414081,[],dec_arena_241022.csv gpt3_5_turbo_0125,Decentralized Arena (0-1 Normalized),0.411295,[],dec_arena_241022.csv command_r_08_2024,Decentralized Arena (0-1 Normalized),0.392718,[],dec_arena_241022.csv openchat_3_5_0106,Decentralized Arena (0-1 Normalized),0.387516,[],dec_arena_241022.csv openchat_3_5,Decentralized Arena (0-1 Normalized),0.374141,[],dec_arena_241022.csv command_r_04_2024,Decentralized Arena (0-1 Normalized),0.339773,[],dec_arena_241022.csv gemma_1_1_7b_it,Decentralized Arena (0-1 Normalized),0.336987,[],dec_arena_241022.csv starling_lm_7b_alpha,Decentralized Arena (0-1 Normalized),0.331785,[],dec_arena_241022.csv gemini_1_0_pro_001,Decentralized Arena (0-1 Normalized),0.326398,[],dec_arena_241022.csv mistral_7b_instruct_2,Decentralized Arena (0-1 Normalized),0.260078,[],dec_arena_241022.csv llama3_2_3b_it,Decentralized Arena (0-1 Normalized),0.25209,[],dec_arena_241022.csv vicuna_33b,Decentralized Arena (0-1 Normalized),0.2389,[],dec_arena_241022.csv gemma_7b_it,Decentralized Arena (0-1 Normalized),0.228311,[],dec_arena_241022.csv qwen1_5_4b_chat,Decentralized Arena (0-1 Normalized),0.146015,[],dec_arena_241022.csv mistral_7b_instruct_1,Decentralized Arena (0-1 Normalized),0.143229,[],dec_arena_241022.csv vicuna_13b,Decentralized Arena (0-1 Normalized),0.140628,[],dec_arena_241022.csv gemma_1_1_2b_it,Decentralized Arena (0-1 Normalized),0.135426,[],dec_arena_241022.csv llama2_7b_chat,Decentralized Arena (0-1 Normalized),0.127438,[],dec_arena_241022.csv llama2_13b_chat,Decentralized Arena (0-1 Normalized),0.116849,[],dec_arena_241022.csv gemma_2b_it,Decentralized Arena (0-1 Normalized),0.087498,[],dec_arena_241022.csv vicuna_7b,Decentralized Arena (0-1 Normalized),0.071707,[],dec_arena_241022.csv zephyr_7b_beta,Decentralized Arena (0-1 Normalized),0.058332,[],dec_arena_241022.csv koala_13b,Decentralized Arena (0-1 Normalized),0.026565,[],dec_arena_241022.csv openassistant_pythia_12b,Decentralized Arena (0-1 Normalized),0.0,[],dec_arena_241022.csv claude_3_5_sonnet_20240620,Arena Hard,79.3,[],arena_hard_240829.csv gpt_4o_2024_05_13,Arena Hard,79.2,[],arena_hard_240829.csv gpt_4_0125_preview,Arena Hard,78.0,[],arena_hard_240829.csv gpt_4o_2024_08_06,Arena Hard,77.9,[],arena_hard_240829.csv athene_70b,Arena Hard,77.6,[],arena_hard_240829.csv gpt_4o_mini,Arena Hard,74.9,[],arena_hard_240829.csv gemini_1_5_pro_api_preview,Arena Hard,72.0,[],arena_hard_240829.csv mistral_large_2407,Arena Hard,70.4,[],arena_hard_240829.csv llama3_1_405b_instruct,Arena Hard,64.1,[],arena_hard_240829.csv glm_4_0520,Arena Hard,63.8,[],arena_hard_240829.csv yi_large,Arena Hard,63.7,[],arena_hard_240829.csv deepseek_coder_v2,Arena Hard,62.3,[],arena_hard_240829.csv claude_3_opus_20240229,Arena Hard,60.4,[],arena_hard_240829.csv gemma_2_27b_it,Arena Hard,57.5,[],arena_hard_240829.csv llama3_1_70b_instruct,Arena Hard,55.7,[],arena_hard_240829.csv glm_4_0116,Arena 
Hard,55.7,[],arena_hard_240829.csv glm_4_air,Arena Hard,50.9,[],arena_hard_240829.csv gpt_4_0314,Arena Hard,50.0,[],arena_hard_240829.csv gemini_1_5_flash_api_preview,Arena Hard,49.6,[],arena_hard_240829.csv qwen2_72b_instruct,Arena Hard,46.9,[],arena_hard_240829.csv claude_3_sonnet_20240229,Arena Hard,46.8,[],arena_hard_240829.csv llama3_70b_instruct,Arena Hard,46.6,[],arena_hard_240829.csv claude_3_haiku_20240307,Arena Hard,41.5,[],arena_hard_240829.csv gpt_4_0613,Arena Hard,37.9,[],arena_hard_240829.csv mistral_large_2402,Arena Hard,37.7,[],arena_hard_240829.csv mixtral_8x22b_instruct_v0_1,Arena Hard,36.4,[],arena_hard_240829.csv qwen1_5_72b_chat,Arena Hard,36.1,[],arena_hard_240829.csv phi_3_medium_4k_instruct,Arena Hard,33.4,[],arena_hard_240829.csv command_r_plus,Arena Hard,33.1,[],arena_hard_240829.csv mistral_medium,Arena Hard,31.9,[],arena_hard_240829.csv internlm2_5_20b_chat,Arena Hard,31.2,[],arena_hard_240829.csv phi_3_small_8k_instruct,Arena Hard,29.8,[],arena_hard_240829.csv mistral_next,Arena Hard,27.4,[],arena_hard_240829.csv gpt_3_5_turbo_0613,Arena Hard,24.8,[],arena_hard_240829.csv dbrx_instructruct_preview,Arena Hard,24.6,[],arena_hard_240829.csv internlm2_20b_chat,Arena Hard,24.4,[],arena_hard_240829.csv claude_2_0,Arena Hard,24.0,[],arena_hard_240829.csv mixtral_8x7b_instruct_v0_1,Arena Hard,23.4,[],arena_hard_240829.csv gpt_3_5_turbo_0125,Arena Hard,23.3,[],arena_hard_240829.csv yi_34b_chat,Arena Hard,23.1,[],arena_hard_240829.csv starling_lm_7b_beta,Arena Hard,23.0,[],arena_hard_240829.csv claude_2_1,Arena Hard,22.8,[],arena_hard_240829.csv llama3_1_8b_instruct,Arena Hard,21.3,[],arena_hard_240829.csv snorkel_mistral_pairrm_dpo,Arena Hard,20.7,[],arena_hard_240829.csv llama3_8b_instruct,Arena Hard,20.6,[],arena_hard_240829.csv gpt_3_5_turbo_1106,Arena Hard,18.9,[],arena_hard_240829.csv gpt_3_5_turbo_0301,Arena Hard,18.1,[],arena_hard_240829.csv gemini_1_0_pro,Arena Hard,17.8,[],arena_hard_240829.csv snowflake_arctic_instruct,Arena Hard,17.6,[],arena_hard_240829.csv command_r,Arena Hard,17.0,[],arena_hard_240829.csv phi_3_mini_128k_instruct,Arena Hard,15.4,[],arena_hard_240829.csv tulu_2_dpo_70b,Arena Hard,15.0,[],arena_hard_240829.csv starling_lm_7b_alpha,Arena Hard,12.8,[],arena_hard_240829.csv mistral_7b_instruct,Arena Hard,12.6,[],arena_hard_240829.csv gemma_1_1_7b_it,Arena Hard,12.1,[],arena_hard_240829.csv llama_2_70b_chat,Arena Hard,11.6,[],arena_hard_240829.csv vicuna_33b_v1_3,Arena Hard,8.6,[],arena_hard_240829.csv gemma_7b_it,Arena Hard,7.5,[],arena_hard_240829.csv llama_2_7b_chat,Arena Hard,4.6,[],arena_hard_240829.csv gemma_1_1_2b_it,Arena Hard,3.4,[],arena_hard_240829.csv gemma_2b_it,Arena Hard,3.0,[],arena_hard_240829.csv gpt_4_0613,AgentBench,4.01,[],agenbench_240829.csv claude_2,AgentBench,2.49,[],agenbench_240829.csv claude_v1_3,AgentBench,2.44,[],agenbench_240829.csv gpt_3_5_turbo_0613,AgentBench,2.32,[],agenbench_240829.csv text_davinci_003,AgentBench,1.71,[],agenbench_240829.csv claude_instant_v1_1,AgentBench,1.6,[],agenbench_240829.csv chat_bison_001,AgentBench,1.39,[],agenbench_240829.csv text_davinci_002,AgentBench,1.25,[],agenbench_240829.csv llama_2_70b_chat,AgentBench,0.78,[],agenbench_240829.csv guanaco_65b,AgentBench,0.54,[],agenbench_240829.csv codellama34b_instruct,AgentBench,0.96,[],agenbench_240829.csv vicuna_33b_v1_3,AgentBench,0.73,[],agenbench_240829.csv wizardlm_30b_v1_0,AgentBench,0.46,[],agenbench_240829.csv guanaco_33b,AgentBench,0.39,[],agenbench_240829.csv vicuna_13b_v1_5,AgentBench,0.93,[],agenbench_240829.csv 
llama_2_13b_chat,AgentBench,0.77,[],agenbench_240829.csv openchat_13b_v3_2,AgentBench,0.7,[],agenbench_240829.csv wizardlm_13b_v1_2,AgentBench,0.66,[],agenbench_240829.csv vicuna_7b_v1_5,AgentBench,0.56,[],agenbench_240829.csv codellama_13b_instruct,AgentBench,0.56,[],agenbench_240829.csv codellama_7b_instruct,AgentBench,0.5,[],agenbench_240829.csv koala_13b,AgentBench,0.34,[],agenbench_240829.csv llama_2_7b_chat,AgentBench,0.34,[],agenbench_240829.csv codegeex2_6b,AgentBench,0.27,[],agenbench_240829.csv dolly_12b_v2,AgentBench,0.14,[],agenbench_240829.csv chatglm_6b_v1_1,AgentBench,0.11,[],agenbench_240829.csv oasst_12b_sft_4,AgentBench,0.03,[],agenbench_240829.csv gpt_4,MT-Bench,8.99,[],mtbench_240829_frozen.csv gpt_3_5_turbo,MT-Bench,7.94,[],mtbench_240829_frozen.csv claude_v1,MT-Bench,7.9,[],mtbench_240829_frozen.csv claude_instant_v1,MT-Bench,7.85,[],mtbench_240829_frozen.csv vicuna_33b,MT-Bench,7.12,[],mtbench_240829_frozen.csv wizardlm_30b,MT-Bench,7.01,[],mtbench_240829_frozen.csv guanaco_33b,MT-Bench,6.53,[],mtbench_240829_frozen.csv tulu_30b,MT-Bench,6.43,[],mtbench_240829_frozen.csv guanaco_65b,MT-Bench,6.41,[],mtbench_240829_frozen.csv openassistant_llama30b,MT-Bench,6.41,[],mtbench_240829_frozen.csv palm_chat_bison_001,MT-Bench,6.4,[],mtbench_240829_frozen.csv vicuna_13b,MT-Bench,6.39,[],mtbench_240829_frozen.csv mpt_30b_chat,MT-Bench,6.39,[],mtbench_240829_frozen.csv wizardlm_13b,MT-Bench,6.35,[],mtbench_240829_frozen.csv vicuna_7b,MT-Bench,6.0,[],mtbench_240829_frozen.csv baize_v2_13b,MT-Bench,5.75,[],mtbench_240829_frozen.csv nous_hermes_13b,MT-Bench,5.51,[],mtbench_240829_frozen.csv mpt_7b_chat,MT-Bench,5.42,[],mtbench_240829_frozen.csv gpt4all_13b_snoozy,MT-Bench,5.41,[],mtbench_240829_frozen.csv koala_13b,MT-Bench,5.35,[],mtbench_240829_frozen.csv mpt_30b_instruct,MT-Bench,5.22,[],mtbench_240829_frozen.csv falcon_40b_instruct,MT-Bench,5.17,[],mtbench_240829_frozen.csv h2o_oasst_openllama_13b,MT-Bench,4.63,[],mtbench_240829_frozen.csv alpaca_13b,MT-Bench,4.53,[],mtbench_240829_frozen.csv chatglm_6b,MT-Bench,4.5,[],mtbench_240829_frozen.csv openassistant_pythia_12b,MT-Bench,4.32,[],mtbench_240829_frozen.csv rwkv_4_raven_14b,MT-Bench,3.98,[],mtbench_240829_frozen.csv dolly_v2_12b,MT-Bench,3.28,[],mtbench_240829_frozen.csv fastchat_t5_3b,MT-Bench,3.04,[],mtbench_240829_frozen.csv stablelm_tuned_alpha_7b,MT-Bench,2.75,[],mtbench_240829_frozen.csv llama_13b,MT-Bench,2.61,[],mtbench_240829_frozen.csv 0001_dpo_iter_2,HF OpenLLM v1,59.01,,hf_open_llm_v1_240829_frozen.csv 0001_dpo_iter_2,HFv1 ARC,60.41,,hf_open_llm_v1_240829_frozen.csv 0001_dpo_iter_2,HFv1 GSM8K,18.8,,hf_open_llm_v1_240829_frozen.csv 0001_dpo_iter_2,HFv1 HellaSwag,84.52,,hf_open_llm_v1_240829_frozen.csv 0001_dpo_iter_2,HFv1 MMLU,60.02,,hf_open_llm_v1_240829_frozen.csv 0001_dpo_iter_2,HFv1 TruthfulQA,53.11,,hf_open_llm_v1_240829_frozen.csv 0001_dpo_iter_2,HFv1 Winogrande,77.19,,hf_open_llm_v1_240829_frozen.csv 0_0001_withdpo_4iters_bs256_5102lr_iter_4,HF OpenLLM v1,58.94,,hf_open_llm_v1_240829_frozen.csv 0_0001_withdpo_4iters_bs256_5102lr_iter_4,HFv1 ARC,61.95,,hf_open_llm_v1_240829_frozen.csv 0_0001_withdpo_4iters_bs256_5102lr_iter_4,HFv1 GSM8K,17.06,,hf_open_llm_v1_240829_frozen.csv 0_0001_withdpo_4iters_bs256_5102lr_iter_4,HFv1 HellaSwag,84.88,,hf_open_llm_v1_240829_frozen.csv 0_0001_withdpo_4iters_bs256_5102lr_iter_4,HFv1 MMLU,60.46,,hf_open_llm_v1_240829_frozen.csv 0_0001_withdpo_4iters_bs256_5102lr_iter_4,HFv1 TruthfulQA,51.71,,hf_open_llm_v1_240829_frozen.csv 0_0001_withdpo_4iters_bs256_5102lr_iter_4,HFv1 
Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv 0_0001_withdpo_4iters_bs256_511lr_iter_2,HF OpenLLM v1,59.6,,hf_open_llm_v1_240829_frozen.csv 0_0001_withdpo_4iters_bs256_511lr_iter_2,HFv1 ARC,61.6,,hf_open_llm_v1_240829_frozen.csv 0_0001_withdpo_4iters_bs256_511lr_iter_2,HFv1 GSM8K,20.62,,hf_open_llm_v1_240829_frozen.csv 0_0001_withdpo_4iters_bs256_511lr_iter_2,HFv1 HellaSwag,84.83,,hf_open_llm_v1_240829_frozen.csv 0_0001_withdpo_4iters_bs256_511lr_iter_2,HFv1 MMLU,60.6,,hf_open_llm_v1_240829_frozen.csv 0_0001_withdpo_4iters_bs256_511lr_iter_2,HFv1 TruthfulQA,52.2,,hf_open_llm_v1_240829_frozen.csv 0_0001_withdpo_4iters_bs256_511lr_iter_2,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv 0_0001_withdpo_4iters_bs256_511lr_iter_3,HF OpenLLM v1,58.97,,hf_open_llm_v1_240829_frozen.csv 0_0001_withdpo_4iters_bs256_511lr_iter_3,HFv1 ARC,61.95,,hf_open_llm_v1_240829_frozen.csv 0_0001_withdpo_4iters_bs256_511lr_iter_3,HFv1 GSM8K,17.13,,hf_open_llm_v1_240829_frozen.csv 0_0001_withdpo_4iters_bs256_511lr_iter_3,HFv1 HellaSwag,84.87,,hf_open_llm_v1_240829_frozen.csv 0_0001_withdpo_4iters_bs256_511lr_iter_3,HFv1 MMLU,60.49,,hf_open_llm_v1_240829_frozen.csv 0_0001_withdpo_4iters_bs256_511lr_iter_3,HFv1 TruthfulQA,51.79,,hf_open_llm_v1_240829_frozen.csv 0_0001_withdpo_4iters_bs256_511lr_iter_3,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_4iters_bs128_5551lr_iter_1,HF OpenLLM v1,68.56,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_4iters_bs128_5551lr_iter_1,HFv1 ARC,63.99,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_4iters_bs128_5551lr_iter_1,HFv1 GSM8K,68.92,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_4iters_bs128_5551lr_iter_1,HFv1 HellaSwag,80.62,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_4iters_bs128_5551lr_iter_1,HFv1 MMLU,68.28,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_4iters_bs128_5551lr_iter_1,HFv1 TruthfulQA,53.85,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_4iters_bs128_5551lr_iter_1,HFv1 Winogrande,75.69,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_4iters_bs128_5551lr_iter_2,HF OpenLLM v1,68.73,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_4iters_bs128_5551lr_iter_2,HFv1 ARC,64.93,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_4iters_bs128_5551lr_iter_2,HFv1 GSM8K,66.49,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_4iters_bs128_5551lr_iter_2,HFv1 HellaSwag,81.38,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_4iters_bs128_5551lr_iter_2,HFv1 MMLU,68.11,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_4iters_bs128_5551lr_iter_2,HFv1 TruthfulQA,56.0,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_4iters_bs128_5551lr_iter_2,HFv1 Winogrande,75.45,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_4iters_bs128_5551lr_iter_3,HF OpenLLM v1,68.68,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_4iters_bs128_5551lr_iter_3,HFv1 ARC,64.76,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_4iters_bs128_5551lr_iter_3,HFv1 GSM8K,64.9,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_4iters_bs128_5551lr_iter_3,HFv1 HellaSwag,81.61,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_4iters_bs128_5551lr_iter_3,HFv1 MMLU,68.08,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_4iters_bs128_5551lr_iter_3,HFv1 TruthfulQA,57.36,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_4iters_bs128_5551lr_iter_3,HFv1 Winogrande,75.37,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_nodpo_3iters_bs128_531lr_iter_1,HF OpenLLM v1,68.73,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 ARC,64.33,,hf_open_llm_v1_240829_frozen.csv 
0_0005_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 GSM8K,67.93,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 HellaSwag,81.24,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 MMLU,68.36,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 TruthfulQA,55.21,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 Winogrande,75.3,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_nodpo_3iters_bs128_531lr_iter_2,HF OpenLLM v1,68.63,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 ARC,64.42,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 GSM8K,66.87,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 HellaSwag,81.37,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 MMLU,68.1,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 TruthfulQA,56.33,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 Winogrande,74.66,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_nodpo_3iters_bs128_531lr_oldtrl_iter_2,HF OpenLLM v1,68.23,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_nodpo_3iters_bs128_531lr_oldtrl_iter_2,HFv1 ARC,64.68,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_nodpo_3iters_bs128_531lr_oldtrl_iter_2,HFv1 GSM8K,64.52,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_nodpo_3iters_bs128_531lr_oldtrl_iter_2,HFv1 HellaSwag,81.21,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_nodpo_3iters_bs128_531lr_oldtrl_iter_2,HFv1 MMLU,68.29,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_nodpo_3iters_bs128_531lr_oldtrl_iter_2,HFv1 TruthfulQA,56.35,,hf_open_llm_v1_240829_frozen.csv 0_0005_llama_nodpo_3iters_bs128_531lr_oldtrl_iter_2,HFv1 Winogrande,74.35,,hf_open_llm_v1_240829_frozen.csv 0_0005_withdpo_4iters_bs256_5551lr_iter_4,HF OpenLLM v1,58.46,,hf_open_llm_v1_240829_frozen.csv 0_0005_withdpo_4iters_bs256_5551lr_iter_4,HFv1 ARC,60.92,,hf_open_llm_v1_240829_frozen.csv 0_0005_withdpo_4iters_bs256_5551lr_iter_4,HFv1 GSM8K,15.01,,hf_open_llm_v1_240829_frozen.csv 0_0005_withdpo_4iters_bs256_5551lr_iter_4,HFv1 HellaSwag,85.0,,hf_open_llm_v1_240829_frozen.csv 0_0005_withdpo_4iters_bs256_5551lr_iter_4,HFv1 MMLU,60.17,,hf_open_llm_v1_240829_frozen.csv 0_0005_withdpo_4iters_bs256_5551lr_iter_4,HFv1 TruthfulQA,52.22,,hf_open_llm_v1_240829_frozen.csv 0_0005_withdpo_4iters_bs256_5551lr_iter_4,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv 0_0005_withdpo_4iters_bs256_555lr_iter_2,HF OpenLLM v1,59.5,,hf_open_llm_v1_240829_frozen.csv 0_0005_withdpo_4iters_bs256_555lr_iter_2,HFv1 ARC,61.69,,hf_open_llm_v1_240829_frozen.csv 0_0005_withdpo_4iters_bs256_555lr_iter_2,HFv1 GSM8K,19.79,,hf_open_llm_v1_240829_frozen.csv 0_0005_withdpo_4iters_bs256_555lr_iter_2,HFv1 HellaSwag,84.82,,hf_open_llm_v1_240829_frozen.csv 0_0005_withdpo_4iters_bs256_555lr_iter_2,HFv1 MMLU,60.56,,hf_open_llm_v1_240829_frozen.csv 0_0005_withdpo_4iters_bs256_555lr_iter_2,HFv1 TruthfulQA,52.25,,hf_open_llm_v1_240829_frozen.csv 0_0005_withdpo_4iters_bs256_555lr_iter_2,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv 0_001_3iters_bs256_nodpo_only4w_iter_3,HF OpenLLM v1,59.19,,hf_open_llm_v1_240829_frozen.csv 0_001_3iters_bs256_nodpo_only4w_iter_3,HFv1 ARC,63.23,,hf_open_llm_v1_240829_frozen.csv 0_001_3iters_bs256_nodpo_only4w_iter_3,HFv1 GSM8K,11.68,,hf_open_llm_v1_240829_frozen.csv 0_001_3iters_bs256_nodpo_only4w_iter_3,HFv1 
HellaSwag,84.87,,hf_open_llm_v1_240829_frozen.csv 0_001_3iters_bs256_nodpo_only4w_iter_3,HFv1 MMLU,60.25,,hf_open_llm_v1_240829_frozen.csv 0_001_3iters_bs256_nodpo_only4w_iter_3,HFv1 TruthfulQA,57.74,,hf_open_llm_v1_240829_frozen.csv 0_001_3iters_bs256_nodpo_only4w_iter_3,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv 0_001_4iters_bs256_nodpo_only4w_iter_4,HF OpenLLM v1,58.66,,hf_open_llm_v1_240829_frozen.csv 0_001_4iters_bs256_nodpo_only4w_iter_4,HFv1 ARC,61.77,,hf_open_llm_v1_240829_frozen.csv 0_001_4iters_bs256_nodpo_only4w_iter_4,HFv1 GSM8K,12.59,,hf_open_llm_v1_240829_frozen.csv 0_001_4iters_bs256_nodpo_only4w_iter_4,HFv1 HellaSwag,84.62,,hf_open_llm_v1_240829_frozen.csv 0_001_4iters_bs256_nodpo_only4w_iter_4,HFv1 MMLU,60.08,,hf_open_llm_v1_240829_frozen.csv 0_001_4iters_bs256_nodpo_only4w_iter_4,HFv1 TruthfulQA,56.04,,hf_open_llm_v1_240829_frozen.csv 0_001_4iters_bs256_nodpo_only4w_iter_4,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv 0_001_4iters_bs256_nodpo_only4w_userresponse_iter_4,HF OpenLLM v1,60.64,,hf_open_llm_v1_240829_frozen.csv 0_001_4iters_bs256_nodpo_only4w_userresponse_iter_4,HFv1 ARC,61.69,,hf_open_llm_v1_240829_frozen.csv 0_001_4iters_bs256_nodpo_only4w_userresponse_iter_4,HFv1 GSM8K,23.35,,hf_open_llm_v1_240829_frozen.csv 0_001_4iters_bs256_nodpo_only4w_userresponse_iter_4,HFv1 HellaSwag,84.43,,hf_open_llm_v1_240829_frozen.csv 0_001_4iters_bs256_nodpo_only4w_userresponse_iter_4,HFv1 MMLU,60.92,,hf_open_llm_v1_240829_frozen.csv 0_001_4iters_bs256_nodpo_only4w_userresponse_iter_4,HFv1 TruthfulQA,55.39,,hf_open_llm_v1_240829_frozen.csv 0_001_4iters_bs256_nodpo_only4w_userresponse_iter_4,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv 0_001_ablation_5iters_bs256_iter_5,HF OpenLLM v1,59.03,,hf_open_llm_v1_240829_frozen.csv 0_001_ablation_5iters_bs256_iter_5,HFv1 ARC,61.35,,hf_open_llm_v1_240829_frozen.csv 0_001_ablation_5iters_bs256_iter_5,HFv1 GSM8K,15.24,,hf_open_llm_v1_240829_frozen.csv 0_001_ablation_5iters_bs256_iter_5,HFv1 HellaSwag,85.03,,hf_open_llm_v1_240829_frozen.csv 0_001_ablation_5iters_bs256_iter_5,HFv1 MMLU,60.09,,hf_open_llm_v1_240829_frozen.csv 0_001_ablation_5iters_bs256_iter_5,HFv1 TruthfulQA,55.2,,hf_open_llm_v1_240829_frozen.csv 0_001_ablation_5iters_bs256_iter_5,HFv1 Winogrande,77.27,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_4iters_iter_2,HF OpenLLM v1,60.33,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_4iters_iter_2,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_4iters_iter_2,HFv1 GSM8K,27.82,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_4iters_iter_2,HFv1 HellaSwag,84.81,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_4iters_iter_2,HFv1 MMLU,61.11,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_4iters_iter_2,HFv1 TruthfulQA,48.18,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_4iters_iter_2,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_4iters_iter_3,HF OpenLLM v1,60.82,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_4iters_iter_3,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_4iters_iter_3,HFv1 GSM8K,28.28,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_4iters_iter_3,HFv1 HellaSwag,84.98,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_4iters_iter_3,HFv1 MMLU,60.69,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_4iters_iter_3,HFv1 TruthfulQA,50.33,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_4iters_iter_3,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_4iters_iter_4,HF OpenLLM 
v1,60.67,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_4iters_iter_4,HFv1 ARC,62.71,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_4iters_iter_4,HFv1 GSM8K,27.37,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_4iters_iter_4,HFv1 HellaSwag,85.09,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_4iters_iter_4,HFv1 MMLU,60.47,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_4iters_iter_4,HFv1 TruthfulQA,51.01,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_4iters_iter_4,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_iter_2,HF OpenLLM v1,61.96,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_iter_2,HFv1 ARC,64.08,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_iter_2,HFv1 GSM8K,30.25,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_iter_2,HFv1 HellaSwag,85.25,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_iter_2,HFv1 MMLU,60.52,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_iter_2,HFv1 TruthfulQA,54.23,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_iter_2,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_iter_3,HF OpenLLM v1,61.7,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_iter_3,HFv1 ARC,64.16,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_iter_3,HFv1 GSM8K,28.51,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_iter_3,HFv1 HellaSwag,85.3,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_iter_3,HFv1 MMLU,60.31,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_iter_3,HFv1 TruthfulQA,54.72,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_declr_iter_3,HFv1 Winogrande,77.19,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_iter_1,HF OpenLLM v1,61.31,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_iter_1,HFv1 ARC,63.65,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_iter_1,HFv1 GSM8K,28.81,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_iter_1,HFv1 HellaSwag,84.87,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_iter_1,HFv1 MMLU,60.74,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_iter_1,HFv1 TruthfulQA,52.05,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_iter_1,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_iter_2,HF OpenLLM v1,62.51,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_iter_2,HFv1 ARC,64.42,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_iter_2,HFv1 GSM8K,32.22,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_iter_2,HFv1 HellaSwag,85.47,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_iter_2,HFv1 MMLU,60.72,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_iter_2,HFv1 TruthfulQA,54.4,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_iter_2,HFv1 Winogrande,77.82,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_iter_3,HF OpenLLM v1,62.62,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_iter_3,HFv1 ARC,63.74,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_iter_3,HFv1 GSM8K,32.75,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_iter_3,HFv1 HellaSwag,85.58,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_iter_3,HFv1 MMLU,60.33,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_iter_3,HFv1 TruthfulQA,55.48,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_iter_3,HFv1 Winogrande,77.82,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_noreplacerej_iter_2,HF OpenLLM v1,61.77,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_noreplacerej_iter_2,HFv1 ARC,63.23,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_noreplacerej_iter_2,HFv1 GSM8K,30.33,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_noreplacerej_iter_2,HFv1 HellaSwag,85.42,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_noreplacerej_iter_2,HFv1 MMLU,60.66,,hf_open_llm_v1_240829_frozen.csv 
0_001_idpo_noreplacerej_iter_2,HFv1 TruthfulQA,52.85,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_noreplacerej_iter_2,HFv1 Winogrande,78.14,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_noreplacerej_iter_3,HF OpenLLM v1,62.45,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_noreplacerej_iter_3,HFv1 ARC,65.02,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_noreplacerej_iter_3,HFv1 GSM8K,31.69,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_noreplacerej_iter_3,HFv1 HellaSwag,85.55,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_noreplacerej_iter_3,HFv1 MMLU,60.48,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_noreplacerej_iter_3,HFv1 TruthfulQA,54.31,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_noreplacerej_iter_3,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_same_noreplacerej_declr_iter_2,HF OpenLLM v1,60.81,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_same_noreplacerej_declr_iter_2,HFv1 ARC,62.88,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_same_noreplacerej_declr_iter_2,HFv1 GSM8K,23.96,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_same_noreplacerej_declr_iter_2,HFv1 HellaSwag,85.21,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_same_noreplacerej_declr_iter_2,HFv1 MMLU,60.24,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_same_noreplacerej_declr_iter_2,HFv1 TruthfulQA,54.44,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_same_noreplacerej_declr_iter_2,HFv1 Winogrande,78.14,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_same_noreplacerej_declr_iter_3,HF OpenLLM v1,60.38,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_same_noreplacerej_declr_iter_3,HFv1 ARC,62.63,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_same_noreplacerej_declr_iter_3,HFv1 GSM8K,21.83,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_same_noreplacerej_declr_iter_3,HFv1 HellaSwag,85.26,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_same_noreplacerej_declr_iter_3,HFv1 MMLU,60.18,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_same_noreplacerej_declr_iter_3,HFv1 TruthfulQA,54.38,,hf_open_llm_v1_240829_frozen.csv 0_001_idpo_same_noreplacerej_declr_iter_3,HFv1 Winogrande,77.98,,hf_open_llm_v1_240829_frozen.csv 0_001_llama3_nodpo_3iters_bs128_531lr_iter_1,HF OpenLLM v1,68.77,,hf_open_llm_v1_240829_frozen.csv 0_001_llama3_nodpo_3iters_bs128_531lr_iter_1,HFv1 ARC,64.42,,hf_open_llm_v1_240829_frozen.csv 0_001_llama3_nodpo_3iters_bs128_531lr_iter_1,HFv1 GSM8K,67.93,,hf_open_llm_v1_240829_frozen.csv 0_001_llama3_nodpo_3iters_bs128_531lr_iter_1,HFv1 HellaSwag,81.25,,hf_open_llm_v1_240829_frozen.csv 0_001_llama3_nodpo_3iters_bs128_531lr_iter_1,HFv1 MMLU,68.38,,hf_open_llm_v1_240829_frozen.csv 0_001_llama3_nodpo_3iters_bs128_531lr_iter_1,HFv1 TruthfulQA,55.24,,hf_open_llm_v1_240829_frozen.csv 0_001_llama3_nodpo_3iters_bs128_531lr_iter_1,HFv1 Winogrande,75.37,,hf_open_llm_v1_240829_frozen.csv 0_001_llama3_nodpo_3iters_bs128_531lr_iter_2,HF OpenLLM v1,68.61,,hf_open_llm_v1_240829_frozen.csv 0_001_llama3_nodpo_3iters_bs128_531lr_iter_2,HFv1 ARC,64.42,,hf_open_llm_v1_240829_frozen.csv 0_001_llama3_nodpo_3iters_bs128_531lr_iter_2,HFv1 GSM8K,66.64,,hf_open_llm_v1_240829_frozen.csv 0_001_llama3_nodpo_3iters_bs128_531lr_iter_2,HFv1 HellaSwag,81.36,,hf_open_llm_v1_240829_frozen.csv 0_001_llama3_nodpo_3iters_bs128_531lr_iter_2,HFv1 MMLU,68.18,,hf_open_llm_v1_240829_frozen.csv 0_001_llama3_nodpo_3iters_bs128_531lr_iter_2,HFv1 TruthfulQA,56.47,,hf_open_llm_v1_240829_frozen.csv 0_001_llama3_nodpo_3iters_bs128_531lr_iter_2,HFv1 Winogrande,74.59,,hf_open_llm_v1_240829_frozen.csv 0_001_llama3_nodpo_3iters_bs128_531lr_iter_3,HF OpenLLM 
v1,68.67,,hf_open_llm_v1_240829_frozen.csv 0_001_llama3_nodpo_3iters_bs128_531lr_iter_3,HFv1 ARC,64.93,,hf_open_llm_v1_240829_frozen.csv 0_001_llama3_nodpo_3iters_bs128_531lr_iter_3,HFv1 GSM8K,66.49,,hf_open_llm_v1_240829_frozen.csv 0_001_llama3_nodpo_3iters_bs128_531lr_iter_3,HFv1 HellaSwag,81.4,,hf_open_llm_v1_240829_frozen.csv 0_001_llama3_nodpo_3iters_bs128_531lr_iter_3,HFv1 MMLU,68.16,,hf_open_llm_v1_240829_frozen.csv 0_001_llama3_nodpo_3iters_bs128_531lr_iter_3,HFv1 TruthfulQA,56.61,,hf_open_llm_v1_240829_frozen.csv 0_001_llama3_nodpo_3iters_bs128_531lr_iter_3,HFv1 Winogrande,74.43,,hf_open_llm_v1_240829_frozen.csv 0_001_zephyr_5551_4iters_bs256_iter_1,HF OpenLLM v1,60.85,,hf_open_llm_v1_240829_frozen.csv 0_001_zephyr_5551_4iters_bs256_iter_1,HFv1 ARC,61.77,,hf_open_llm_v1_240829_frozen.csv 0_001_zephyr_5551_4iters_bs256_iter_1,HFv1 GSM8K,33.97,,hf_open_llm_v1_240829_frozen.csv 0_001_zephyr_5551_4iters_bs256_iter_1,HFv1 HellaSwag,84.18,,hf_open_llm_v1_240829_frozen.csv 0_001_zephyr_5551_4iters_bs256_iter_1,HFv1 MMLU,61.41,,hf_open_llm_v1_240829_frozen.csv 0_001_zephyr_5551_4iters_bs256_iter_1,HFv1 TruthfulQA,45.45,,hf_open_llm_v1_240829_frozen.csv 0_001_zephyr_5551_4iters_bs256_iter_1,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv 0_001_zephyr_5551_4iters_bs256_iter_3,HF OpenLLM v1,59.24,,hf_open_llm_v1_240829_frozen.csv 0_001_zephyr_5551_4iters_bs256_iter_3,HFv1 ARC,61.6,,hf_open_llm_v1_240829_frozen.csv 0_001_zephyr_5551_4iters_bs256_iter_3,HFv1 GSM8K,18.42,,hf_open_llm_v1_240829_frozen.csv 0_001_zephyr_5551_4iters_bs256_iter_3,HFv1 HellaSwag,85.14,,hf_open_llm_v1_240829_frozen.csv 0_001_zephyr_5551_4iters_bs256_iter_3,HFv1 MMLU,60.11,,hf_open_llm_v1_240829_frozen.csv 0_001_zephyr_5551_4iters_bs256_iter_3,HFv1 TruthfulQA,52.75,,hf_open_llm_v1_240829_frozen.csv 0_001_zephyr_5551_4iters_bs256_iter_3,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv 0_001_zephyr_5551_4iters_bs256_iter_4,HF OpenLLM v1,58.86,,hf_open_llm_v1_240829_frozen.csv 0_001_zephyr_5551_4iters_bs256_iter_4,HFv1 ARC,61.18,,hf_open_llm_v1_240829_frozen.csv 0_001_zephyr_5551_4iters_bs256_iter_4,HFv1 GSM8K,17.06,,hf_open_llm_v1_240829_frozen.csv 0_001_zephyr_5551_4iters_bs256_iter_4,HFv1 HellaSwag,85.02,,hf_open_llm_v1_240829_frozen.csv 0_001_zephyr_5551_4iters_bs256_iter_4,HFv1 MMLU,60.01,,hf_open_llm_v1_240829_frozen.csv 0_001_zephyr_5551_4iters_bs256_iter_4,HFv1 TruthfulQA,52.25,,hf_open_llm_v1_240829_frozen.csv 0_001_zephyr_5551_4iters_bs256_iter_4,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_1,HF OpenLLM v1,60.81,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_1,HFv1 ARC,61.6,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_1,HFv1 GSM8K,34.19,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_1,HFv1 HellaSwag,84.08,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_1,HFv1 MMLU,61.54,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_1,HFv1 TruthfulQA,45.45,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_1,HFv1 Winogrande,77.98,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_2,HF OpenLLM v1,59.55,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_2,HFv1 ARC,61.69,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_2,HFv1 GSM8K,20.32,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_2,HFv1 
HellaSwag,84.82,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_2,HFv1 MMLU,60.54,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_2,HFv1 TruthfulQA,52.18,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_2,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_3,HF OpenLLM v1,58.95,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_3,HFv1 ARC,61.52,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_3,HFv1 GSM8K,16.22,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_3,HFv1 HellaSwag,85.07,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_3,HFv1 MMLU,60.47,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_3,HFv1 TruthfulQA,53.18,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_3,HFv1 Winogrande,77.27,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_4,HF OpenLLM v1,58.28,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_4,HFv1 ARC,60.75,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_4,HFv1 GSM8K,12.13,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_4,HFv1 HellaSwag,85.06,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_4,HFv1 MMLU,60.16,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_4,HFv1 TruthfulQA,54.17,,hf_open_llm_v1_240829_frozen.csv 0_0_ablation_sample1_4iters_bs256_iter_4,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv 0_0_llama_nodpo_3iters_bs128_531lr_iter_1,HF OpenLLM v1,68.8,,hf_open_llm_v1_240829_frozen.csv 0_0_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 ARC,64.33,,hf_open_llm_v1_240829_frozen.csv 0_0_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 GSM8K,68.16,,hf_open_llm_v1_240829_frozen.csv 0_0_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 HellaSwag,81.26,,hf_open_llm_v1_240829_frozen.csv 0_0_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 MMLU,68.34,,hf_open_llm_v1_240829_frozen.csv 0_0_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 TruthfulQA,55.25,,hf_open_llm_v1_240829_frozen.csv 0_0_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 Winogrande,75.45,,hf_open_llm_v1_240829_frozen.csv 0_0_llama_nodpo_3iters_bs128_531lr_iter_2,HF OpenLLM v1,68.63,,hf_open_llm_v1_240829_frozen.csv 0_0_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 ARC,64.42,,hf_open_llm_v1_240829_frozen.csv 0_0_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 GSM8K,66.64,,hf_open_llm_v1_240829_frozen.csv 0_0_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 HellaSwag,81.35,,hf_open_llm_v1_240829_frozen.csv 0_0_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 MMLU,68.21,,hf_open_llm_v1_240829_frozen.csv 0_0_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 TruthfulQA,56.4,,hf_open_llm_v1_240829_frozen.csv 0_0_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 Winogrande,74.74,,hf_open_llm_v1_240829_frozen.csv 0_0_llama_nodpo_3iters_bs128_531lr_iter_3,HF OpenLLM v1,68.66,,hf_open_llm_v1_240829_frozen.csv 0_0_llama_nodpo_3iters_bs128_531lr_iter_3,HFv1 ARC,64.68,,hf_open_llm_v1_240829_frozen.csv 0_0_llama_nodpo_3iters_bs128_531lr_iter_3,HFv1 GSM8K,66.49,,hf_open_llm_v1_240829_frozen.csv 0_0_llama_nodpo_3iters_bs128_531lr_iter_3,HFv1 HellaSwag,81.38,,hf_open_llm_v1_240829_frozen.csv 0_0_llama_nodpo_3iters_bs128_531lr_iter_3,HFv1 MMLU,68.12,,hf_open_llm_v1_240829_frozen.csv 0_0_llama_nodpo_3iters_bs128_531lr_iter_3,HFv1 TruthfulQA,56.54,,hf_open_llm_v1_240829_frozen.csv 
0_0_llama_nodpo_3iters_bs128_531lr_iter_3,HFv1 Winogrande,74.74,,hf_open_llm_v1_240829_frozen.csv 0_0_withdpo_4iters_bs256_531lr_iter_3,HF OpenLLM v1,58.72,,hf_open_llm_v1_240829_frozen.csv 0_0_withdpo_4iters_bs256_531lr_iter_3,HFv1 ARC,61.69,,hf_open_llm_v1_240829_frozen.csv 0_0_withdpo_4iters_bs256_531lr_iter_3,HFv1 GSM8K,15.77,,hf_open_llm_v1_240829_frozen.csv 0_0_withdpo_4iters_bs256_531lr_iter_3,HFv1 HellaSwag,84.92,,hf_open_llm_v1_240829_frozen.csv 0_0_withdpo_4iters_bs256_531lr_iter_3,HFv1 MMLU,60.28,,hf_open_llm_v1_240829_frozen.csv 0_0_withdpo_4iters_bs256_531lr_iter_3,HFv1 TruthfulQA,52.64,,hf_open_llm_v1_240829_frozen.csv 0_0_withdpo_4iters_bs256_531lr_iter_3,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv 0_0_withdpo_4iters_bs256_5551lr_iter_4,HF OpenLLM v1,58.69,,hf_open_llm_v1_240829_frozen.csv 0_0_withdpo_4iters_bs256_5551lr_iter_4,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv 0_0_withdpo_4iters_bs256_5551lr_iter_4,HFv1 GSM8K,14.4,,hf_open_llm_v1_240829_frozen.csv 0_0_withdpo_4iters_bs256_5551lr_iter_4,HFv1 HellaSwag,85.22,,hf_open_llm_v1_240829_frozen.csv 0_0_withdpo_4iters_bs256_5551lr_iter_4,HFv1 MMLU,60.4,,hf_open_llm_v1_240829_frozen.csv 0_0_withdpo_4iters_bs256_5551lr_iter_4,HFv1 TruthfulQA,53.88,,hf_open_llm_v1_240829_frozen.csv 0_0_withdpo_4iters_bs256_5551lr_iter_4,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv 0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_2,HF OpenLLM v1,59.98,,hf_open_llm_v1_240829_frozen.csv 0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_2,HFv1 ARC,62.29,,hf_open_llm_v1_240829_frozen.csv 0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_2,HFv1 GSM8K,22.21,,hf_open_llm_v1_240829_frozen.csv 0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_2,HFv1 HellaSwag,84.95,,hf_open_llm_v1_240829_frozen.csv 0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_2,HFv1 MMLU,60.43,,hf_open_llm_v1_240829_frozen.csv 0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_2,HFv1 TruthfulQA,52.35,,hf_open_llm_v1_240829_frozen.csv 0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_2,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv 0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_3,HF OpenLLM v1,59.17,,hf_open_llm_v1_240829_frozen.csv 0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_3,HFv1 ARC,61.69,,hf_open_llm_v1_240829_frozen.csv 0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_3,HFv1 GSM8K,16.91,,hf_open_llm_v1_240829_frozen.csv 0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_3,HFv1 HellaSwag,85.16,,hf_open_llm_v1_240829_frozen.csv 0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_3,HFv1 MMLU,60.23,,hf_open_llm_v1_240829_frozen.csv 0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_3,HFv1 TruthfulQA,53.42,,hf_open_llm_v1_240829_frozen.csv 0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_3,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv 0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_4,HF OpenLLM v1,58.69,,hf_open_llm_v1_240829_frozen.csv 0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_4,HFv1 ARC,61.01,,hf_open_llm_v1_240829_frozen.csv 0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_4,HFv1 GSM8K,15.54,,hf_open_llm_v1_240829_frozen.csv 0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_4,HFv1 HellaSwag,85.08,,hf_open_llm_v1_240829_frozen.csv 0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_4,HFv1 MMLU,60.12,,hf_open_llm_v1_240829_frozen.csv 0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_4,HFv1 TruthfulQA,52.67,,hf_open_llm_v1_240829_frozen.csv 0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_4,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv 10_7bx2_dpo_200,HF OpenLLM v1,73.83,,hf_open_llm_v1_240829_frozen.csv 10_7bx2_dpo_200,HFv1 
ARC,70.22,,hf_open_llm_v1_240829_frozen.csv 10_7bx2_dpo_200,HFv1 GSM8K,60.96,,hf_open_llm_v1_240829_frozen.csv 10_7bx2_dpo_200,HFv1 HellaSwag,88.23,,hf_open_llm_v1_240829_frozen.csv 10_7bx2_dpo_200,HFv1 MMLU,66.25,,hf_open_llm_v1_240829_frozen.csv 10_7bx2_dpo_200,HFv1 TruthfulQA,75.38,,hf_open_llm_v1_240829_frozen.csv 10_7bx2_dpo_200,HFv1 Winogrande,81.93,,hf_open_llm_v1_240829_frozen.csv 13b_thorns_l2,HF OpenLLM v1,54.72,,hf_open_llm_v1_240829_frozen.csv 13b_thorns_l2,HFv1 ARC,62.88,,hf_open_llm_v1_240829_frozen.csv 13b_thorns_l2,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv 13b_thorns_l2,HFv1 HellaSwag,83.57,,hf_open_llm_v1_240829_frozen.csv 13b_thorns_l2,HFv1 MMLU,56.95,,hf_open_llm_v1_240829_frozen.csv 13b_thorns_l2,HFv1 TruthfulQA,49.52,,hf_open_llm_v1_240829_frozen.csv 13b_thorns_l2,HFv1 Winogrande,74.51,,hf_open_llm_v1_240829_frozen.csv 22_neuro_model,HF OpenLLM v1,50.23,,hf_open_llm_v1_240829_frozen.csv 22_neuro_model,HFv1 ARC,49.15,,hf_open_llm_v1_240829_frozen.csv 22_neuro_model,HFv1 GSM8K,1.14,,hf_open_llm_v1_240829_frozen.csv 22_neuro_model,HFv1 HellaSwag,62.31,,hf_open_llm_v1_240829_frozen.csv 22_neuro_model,HFv1 MMLU,62.01,,hf_open_llm_v1_240829_frozen.csv 22_neuro_model,HFv1 TruthfulQA,60.23,,hf_open_llm_v1_240829_frozen.csv 22_neuro_model,HFv1 Winogrande,66.54,,hf_open_llm_v1_240829_frozen.csv 2x_lora_assemble_nova_13b,HF OpenLLM v1,57.26,,hf_open_llm_v1_240829_frozen.csv 2x_lora_assemble_nova_13b,HFv1 ARC,62.63,,hf_open_llm_v1_240829_frozen.csv 2x_lora_assemble_nova_13b,HFv1 GSM8K,10.24,,hf_open_llm_v1_240829_frozen.csv 2x_lora_assemble_nova_13b,HFv1 HellaSwag,83.24,,hf_open_llm_v1_240829_frozen.csv 2x_lora_assemble_nova_13b,HFv1 MMLU,58.64,,hf_open_llm_v1_240829_frozen.csv 2x_lora_assemble_nova_13b,HFv1 TruthfulQA,51.88,,hf_open_llm_v1_240829_frozen.csv 2x_lora_assemble_nova_13b,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv 2x_lora_assemble_platypus2_13b,HF OpenLLM v1,55.33,,hf_open_llm_v1_240829_frozen.csv 2x_lora_assemble_platypus2_13b,HFv1 ARC,60.58,,hf_open_llm_v1_240829_frozen.csv 2x_lora_assemble_platypus2_13b,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv 2x_lora_assemble_platypus2_13b,HFv1 HellaSwag,82.56,,hf_open_llm_v1_240829_frozen.csv 2x_lora_assemble_platypus2_13b,HFv1 MMLU,58.25,,hf_open_llm_v1_240829_frozen.csv 2x_lora_assemble_platypus2_13b,HFv1 TruthfulQA,54.77,,hf_open_llm_v1_240829_frozen.csv 2x_lora_assemble_platypus2_13b,HFv1 Winogrande,74.9,,hf_open_llm_v1_240829_frozen.csv 3b_redpajama_conditional_alpha,HF OpenLLM v1,36.88,,hf_open_llm_v1_240829_frozen.csv 3b_redpajama_conditional_alpha,HFv1 ARC,36.26,,hf_open_llm_v1_240829_frozen.csv 3b_redpajama_conditional_alpha,HFv1 GSM8K,0.61,,hf_open_llm_v1_240829_frozen.csv 3b_redpajama_conditional_alpha,HFv1 HellaSwag,61.9,,hf_open_llm_v1_240829_frozen.csv 3b_redpajama_conditional_alpha,HFv1 MMLU,25.42,,hf_open_llm_v1_240829_frozen.csv 3b_redpajama_conditional_alpha,HFv1 TruthfulQA,36.31,,hf_open_llm_v1_240829_frozen.csv 3b_redpajama_conditional_alpha,HFv1 Winogrande,60.77,,hf_open_llm_v1_240829_frozen.csv 42dot_llm_sft_1_3b,HF OpenLLM v1,36.61,,hf_open_llm_v1_240829_frozen.csv 42dot_llm_sft_1_3b,HFv1 ARC,36.09,,hf_open_llm_v1_240829_frozen.csv 42dot_llm_sft_1_3b,HFv1 GSM8K,0.68,,hf_open_llm_v1_240829_frozen.csv 42dot_llm_sft_1_3b,HFv1 HellaSwag,58.96,,hf_open_llm_v1_240829_frozen.csv 42dot_llm_sft_1_3b,HFv1 MMLU,25.51,,hf_open_llm_v1_240829_frozen.csv 42dot_llm_sft_1_3b,HFv1 TruthfulQA,39.98,,hf_open_llm_v1_240829_frozen.csv 42dot_llm_sft_1_3b,HFv1 
Winogrande,58.41,,hf_open_llm_v1_240829_frozen.csv 774m_03_09_2024,HF OpenLLM v1,33.22,,hf_open_llm_v1_240829_frozen.csv 774m_03_09_2024,HFv1 ARC,30.29,,hf_open_llm_v1_240829_frozen.csv 774m_03_09_2024,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv 774m_03_09_2024,HFv1 HellaSwag,53.88,,hf_open_llm_v1_240829_frozen.csv 774m_03_09_2024,HFv1 MMLU,25.33,,hf_open_llm_v1_240829_frozen.csv 774m_03_09_2024,HFv1 TruthfulQA,34.44,,hf_open_llm_v1_240829_frozen.csv 774m_03_09_2024,HFv1 Winogrande,55.09,,hf_open_llm_v1_240829_frozen.csv 7b_redpajama_conditional_alpha,HF OpenLLM v1,40.56,,hf_open_llm_v1_240829_frozen.csv 7b_redpajama_conditional_alpha,HFv1 ARC,42.58,,hf_open_llm_v1_240829_frozen.csv 7b_redpajama_conditional_alpha,HFv1 GSM8K,0.76,,hf_open_llm_v1_240829_frozen.csv 7b_redpajama_conditional_alpha,HFv1 HellaSwag,69.91,,hf_open_llm_v1_240829_frozen.csv 7b_redpajama_conditional_alpha,HFv1 MMLU,26.53,,hf_open_llm_v1_240829_frozen.csv 7b_redpajama_conditional_alpha,HFv1 TruthfulQA,36.42,,hf_open_llm_v1_240829_frozen.csv 7b_redpajama_conditional_alpha,HFv1 Winogrande,67.17,,hf_open_llm_v1_240829_frozen.csv 7bx4_dpo,HF OpenLLM v1,73.2,,hf_open_llm_v1_240829_frozen.csv 7bx4_dpo,HFv1 ARC,69.37,,hf_open_llm_v1_240829_frozen.csv 7bx4_dpo,HFv1 GSM8K,71.95,,hf_open_llm_v1_240829_frozen.csv 7bx4_dpo,HFv1 HellaSwag,86.89,,hf_open_llm_v1_240829_frozen.csv 7bx4_dpo,HFv1 MMLU,64.73,,hf_open_llm_v1_240829_frozen.csv 7bx4_dpo,HFv1 TruthfulQA,65.66,,hf_open_llm_v1_240829_frozen.csv 7bx4_dpo,HFv1 Winogrande,80.58,,hf_open_llm_v1_240829_frozen.csv 7bx4_dpo_2e,HF OpenLLM v1,72.99,,hf_open_llm_v1_240829_frozen.csv 7bx4_dpo_2e,HFv1 ARC,68.94,,hf_open_llm_v1_240829_frozen.csv 7bx4_dpo_2e,HFv1 GSM8K,71.34,,hf_open_llm_v1_240829_frozen.csv 7bx4_dpo_2e,HFv1 HellaSwag,86.8,,hf_open_llm_v1_240829_frozen.csv 7bx4_dpo_2e,HFv1 MMLU,64.5,,hf_open_llm_v1_240829_frozen.csv 7bx4_dpo_2e,HFv1 TruthfulQA,65.6,,hf_open_llm_v1_240829_frozen.csv 7bx4_dpo_2e,HFv1 Winogrande,80.74,,hf_open_llm_v1_240829_frozen.csv 7bx4_dpo_700,HF OpenLLM v1,70.85,,hf_open_llm_v1_240829_frozen.csv 7bx4_dpo_700,HFv1 ARC,64.68,,hf_open_llm_v1_240829_frozen.csv 7bx4_dpo_700,HFv1 GSM8K,63.38,,hf_open_llm_v1_240829_frozen.csv 7bx4_dpo_700,HFv1 HellaSwag,86.12,,hf_open_llm_v1_240829_frozen.csv 7bx4_dpo_700,HFv1 MMLU,62.23,,hf_open_llm_v1_240829_frozen.csv 7bx4_dpo_700,HFv1 TruthfulQA,68.99,,hf_open_llm_v1_240829_frozen.csv 7bx4_dpo_700,HFv1 Winogrande,79.72,,hf_open_llm_v1_240829_frozen.csv aanaphi2_v0_1,HF OpenLLM v1,63.28,,hf_open_llm_v1_240829_frozen.csv aanaphi2_v0_1,HFv1 ARC,63.91,,hf_open_llm_v1_240829_frozen.csv aanaphi2_v0_1,HFv1 GSM8K,54.89,,hf_open_llm_v1_240829_frozen.csv aanaphi2_v0_1,HFv1 HellaSwag,77.97,,hf_open_llm_v1_240829_frozen.csv aanaphi2_v0_1,HFv1 MMLU,57.73,,hf_open_llm_v1_240829_frozen.csv aanaphi2_v0_1,HFv1 TruthfulQA,51.56,,hf_open_llm_v1_240829_frozen.csv aanaphi2_v0_1,HFv1 Winogrande,73.64,,hf_open_llm_v1_240829_frozen.csv ablation_model_fineweb_v1,HF OpenLLM v1,36.76,,hf_open_llm_v1_240829_frozen.csv ablation_model_fineweb_v1,HFv1 ARC,35.41,,hf_open_llm_v1_240829_frozen.csv ablation_model_fineweb_v1,HFv1 GSM8K,1.52,,hf_open_llm_v1_240829_frozen.csv ablation_model_fineweb_v1,HFv1 HellaSwag,66.31,,hf_open_llm_v1_240829_frozen.csv ablation_model_fineweb_v1,HFv1 MMLU,25.66,,hf_open_llm_v1_240829_frozen.csv ablation_model_fineweb_v1,HFv1 TruthfulQA,30.18,,hf_open_llm_v1_240829_frozen.csv ablation_model_fineweb_v1,HFv1 Winogrande,61.48,,hf_open_llm_v1_240829_frozen.csv adelie_sft,HF OpenLLM v1,52.75,,hf_open_llm_v1_240829_frozen.csv 
adelie_sft,HFv1 ARC,54.1,,hf_open_llm_v1_240829_frozen.csv adelie_sft,HFv1 GSM8K,19.64,,hf_open_llm_v1_240829_frozen.csv adelie_sft,HFv1 HellaSwag,78.22,,hf_open_llm_v1_240829_frozen.csv adelie_sft,HFv1 MMLU,47.67,,hf_open_llm_v1_240829_frozen.csv adelie_sft,HFv1 TruthfulQA,42.75,,hf_open_llm_v1_240829_frozen.csv adelie_sft,HFv1 Winogrande,74.11,,hf_open_llm_v1_240829_frozen.csv aeonium_v1_baseweb_1b,HF OpenLLM v1,29.15,,hf_open_llm_v1_240829_frozen.csv aeonium_v1_baseweb_1b,HFv1 ARC,20.99,,hf_open_llm_v1_240829_frozen.csv aeonium_v1_baseweb_1b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv aeonium_v1_baseweb_1b,HFv1 HellaSwag,32.24,,hf_open_llm_v1_240829_frozen.csv aeonium_v1_baseweb_1b,HFv1 MMLU,26.15,,hf_open_llm_v1_240829_frozen.csv aeonium_v1_baseweb_1b,HFv1 TruthfulQA,46.17,,hf_open_llm_v1_240829_frozen.csv aeonium_v1_baseweb_1b,HFv1 Winogrande,49.33,,hf_open_llm_v1_240829_frozen.csv aether_7b_chat_v1_0,HF OpenLLM v1,59.05,,hf_open_llm_v1_240829_frozen.csv aether_7b_chat_v1_0,HFv1 ARC,57.76,,hf_open_llm_v1_240829_frozen.csv aether_7b_chat_v1_0,HFv1 GSM8K,32.98,,hf_open_llm_v1_240829_frozen.csv aether_7b_chat_v1_0,HFv1 HellaSwag,81.76,,hf_open_llm_v1_240829_frozen.csv aether_7b_chat_v1_0,HFv1 MMLU,60.53,,hf_open_llm_v1_240829_frozen.csv aether_7b_chat_v1_0,HFv1 TruthfulQA,44.21,,hf_open_llm_v1_240829_frozen.csv aether_7b_chat_v1_0,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv agiin_13_6b_v0_1,HF OpenLLM v1,68.4,,hf_open_llm_v1_240829_frozen.csv agiin_13_6b_v0_1,HFv1 ARC,69.45,,hf_open_llm_v1_240829_frozen.csv agiin_13_6b_v0_1,HFv1 GSM8K,46.47,,hf_open_llm_v1_240829_frozen.csv agiin_13_6b_v0_1,HFv1 HellaSwag,86.64,,hf_open_llm_v1_240829_frozen.csv agiin_13_6b_v0_1,HFv1 MMLU,61.15,,hf_open_llm_v1_240829_frozen.csv agiin_13_6b_v0_1,HFv1 TruthfulQA,67.97,,hf_open_llm_v1_240829_frozen.csv agiin_13_6b_v0_1,HFv1 Winogrande,78.69,,hf_open_llm_v1_240829_frozen.csv aira_2_1b1,HF OpenLLM v1,29.32,,hf_open_llm_v1_240829_frozen.csv aira_2_1b1,HFv1 ARC,23.21,,hf_open_llm_v1_240829_frozen.csv aira_2_1b1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv aira_2_1b1,HFv1 HellaSwag,26.97,,hf_open_llm_v1_240829_frozen.csv aira_2_1b1,HFv1 MMLU,24.86,,hf_open_llm_v1_240829_frozen.csv aira_2_1b1,HFv1 TruthfulQA,50.63,,hf_open_llm_v1_240829_frozen.csv aira_2_1b1,HFv1 Winogrande,50.28,,hf_open_llm_v1_240829_frozen.csv aira_2_355m,HF OpenLLM v1,31.0,,hf_open_llm_v1_240829_frozen.csv aira_2_355m,HFv1 ARC,27.56,,hf_open_llm_v1_240829_frozen.csv aira_2_355m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv aira_2_355m,HFv1 HellaSwag,38.92,,hf_open_llm_v1_240829_frozen.csv aira_2_355m,HFv1 MMLU,27.26,,hf_open_llm_v1_240829_frozen.csv aira_2_355m,HFv1 TruthfulQA,38.53,,hf_open_llm_v1_240829_frozen.csv aira_2_355m,HFv1 Winogrande,53.75,,hf_open_llm_v1_240829_frozen.csv aira_2_774m,HF OpenLLM v1,31.33,,hf_open_llm_v1_240829_frozen.csv aira_2_774m,HFv1 ARC,28.75,,hf_open_llm_v1_240829_frozen.csv aira_2_774m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv aira_2_774m,HFv1 HellaSwag,40.8,,hf_open_llm_v1_240829_frozen.csv aira_2_774m,HFv1 MMLU,25.1,,hf_open_llm_v1_240829_frozen.csv aira_2_774m,HFv1 TruthfulQA,41.33,,hf_open_llm_v1_240829_frozen.csv aira_2_774m,HFv1 Winogrande,52.01,,hf_open_llm_v1_240829_frozen.csv airboros2_1_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,53.15,,hf_open_llm_v1_240829_frozen.csv airboros2_1_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,58.96,,hf_open_llm_v1_240829_frozen.csv airboros2_1_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv 
airboros2_1_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,82.46,,hf_open_llm_v1_240829_frozen.csv airboros2_1_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,54.62,,hf_open_llm_v1_240829_frozen.csv airboros2_1_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,47.71,,hf_open_llm_v1_240829_frozen.csv airboros2_1_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,75.14,,hf_open_llm_v1_240829_frozen.csv airic_the_mistral,HF OpenLLM v1,59.95,,hf_open_llm_v1_240829_frozen.csv airic_the_mistral,HFv1 ARC,59.98,,hf_open_llm_v1_240829_frozen.csv airic_the_mistral,HFv1 GSM8K,30.86,,hf_open_llm_v1_240829_frozen.csv airic_the_mistral,HFv1 HellaSwag,82.98,,hf_open_llm_v1_240829_frozen.csv airic_the_mistral,HFv1 MMLU,60.67,,hf_open_llm_v1_240829_frozen.csv airic_the_mistral,HFv1 TruthfulQA,48.24,,hf_open_llm_v1_240829_frozen.csv airic_the_mistral,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv airoboros_2_1_llama_2_13b_qlora,HF OpenLLM v1,53.23,,hf_open_llm_v1_240829_frozen.csv airoboros_2_1_llama_2_13b_qlora,HFv1 ARC,59.73,,hf_open_llm_v1_240829_frozen.csv airoboros_2_1_llama_2_13b_qlora,HFv1 GSM8K,2.81,,hf_open_llm_v1_240829_frozen.csv airoboros_2_1_llama_2_13b_qlora,HFv1 HellaSwag,82.91,,hf_open_llm_v1_240829_frozen.csv airoboros_2_1_llama_2_13b_qlora,HFv1 MMLU,54.77,,hf_open_llm_v1_240829_frozen.csv airoboros_2_1_llama_2_13b_qlora,HFv1 TruthfulQA,45.14,,hf_open_llm_v1_240829_frozen.csv airoboros_2_1_llama_2_13b_qlora,HFv1 Winogrande,74.03,,hf_open_llm_v1_240829_frozen.csv airoboros_33b_2_1,HF OpenLLM v1,57.16,,hf_open_llm_v1_240829_frozen.csv airoboros_33b_2_1,HFv1 ARC,63.65,,hf_open_llm_v1_240829_frozen.csv airoboros_33b_2_1,HFv1 GSM8K,6.6,,hf_open_llm_v1_240829_frozen.csv airoboros_33b_2_1,HFv1 HellaSwag,84.97,,hf_open_llm_v1_240829_frozen.csv airoboros_33b_2_1,HFv1 MMLU,57.37,,hf_open_llm_v1_240829_frozen.csv airoboros_33b_2_1,HFv1 TruthfulQA,52.17,,hf_open_llm_v1_240829_frozen.csv airoboros_33b_2_1,HFv1 Winogrande,78.22,,hf_open_llm_v1_240829_frozen.csv airoboros_33b_gpt4_1_3,HF OpenLLM v1,57.43,,hf_open_llm_v1_240829_frozen.csv airoboros_33b_gpt4_1_3,HFv1 ARC,63.91,,hf_open_llm_v1_240829_frozen.csv airoboros_33b_gpt4_1_3,HFv1 GSM8K,13.04,,hf_open_llm_v1_240829_frozen.csv airoboros_33b_gpt4_1_3,HFv1 HellaSwag,85.04,,hf_open_llm_v1_240829_frozen.csv airoboros_33b_gpt4_1_3,HFv1 MMLU,58.53,,hf_open_llm_v1_240829_frozen.csv airoboros_33b_gpt4_1_3,HFv1 TruthfulQA,45.36,,hf_open_llm_v1_240829_frozen.csv airoboros_33b_gpt4_1_3,HFv1 Winogrande,78.69,,hf_open_llm_v1_240829_frozen.csv airoboros_c34b_2_1,HF OpenLLM v1,51.52,,hf_open_llm_v1_240829_frozen.csv airoboros_c34b_2_1,HFv1 ARC,54.69,,hf_open_llm_v1_240829_frozen.csv airoboros_c34b_2_1,HFv1 GSM8K,8.34,,hf_open_llm_v1_240829_frozen.csv airoboros_c34b_2_1,HFv1 HellaSwag,76.45,,hf_open_llm_v1_240829_frozen.csv airoboros_c34b_2_1,HFv1 MMLU,55.08,,hf_open_llm_v1_240829_frozen.csv airoboros_c34b_2_1,HFv1 TruthfulQA,46.15,,hf_open_llm_v1_240829_frozen.csv airoboros_c34b_2_1,HFv1 Winogrande,68.43,,hf_open_llm_v1_240829_frozen.csv airoboros_c34b_2_2_1,HF OpenLLM v1,55.15,,hf_open_llm_v1_240829_frozen.csv airoboros_c34b_2_2_1,HFv1 ARC,54.69,,hf_open_llm_v1_240829_frozen.csv airoboros_c34b_2_2_1,HFv1 GSM8K,20.02,,hf_open_llm_v1_240829_frozen.csv airoboros_c34b_2_2_1,HFv1 HellaSwag,76.84,,hf_open_llm_v1_240829_frozen.csv airoboros_c34b_2_2_1,HFv1 MMLU,55.43,,hf_open_llm_v1_240829_frozen.csv airoboros_c34b_2_2_1,HFv1 TruthfulQA,51.36,,hf_open_llm_v1_240829_frozen.csv airoboros_c34b_2_2_1,HFv1 Winogrande,72.53,,hf_open_llm_v1_240829_frozen.csv airoboros_l2_13b_2_2_1,HF 
OpenLLM v1,56.36,,hf_open_llm_v1_240829_frozen.csv airoboros_l2_13b_2_2_1,HFv1 ARC,60.92,,hf_open_llm_v1_240829_frozen.csv airoboros_l2_13b_2_2_1,HFv1 GSM8K,11.6,,hf_open_llm_v1_240829_frozen.csv airoboros_l2_13b_2_2_1,HFv1 HellaSwag,83.77,,hf_open_llm_v1_240829_frozen.csv airoboros_l2_13b_2_2_1,HFv1 MMLU,56.47,,hf_open_llm_v1_240829_frozen.csv airoboros_l2_13b_2_2_1,HFv1 TruthfulQA,49.42,,hf_open_llm_v1_240829_frozen.csv airoboros_l2_13b_2_2_1,HFv1 Winogrande,76.01,,hf_open_llm_v1_240829_frozen.csv airoboros_l2_70b_2_2_1,HF OpenLLM v1,69.13,,hf_open_llm_v1_240829_frozen.csv airoboros_l2_70b_2_2_1,HFv1 ARC,69.71,,hf_open_llm_v1_240829_frozen.csv airoboros_l2_70b_2_2_1,HFv1 GSM8K,44.88,,hf_open_llm_v1_240829_frozen.csv airoboros_l2_70b_2_2_1,HFv1 HellaSwag,87.95,,hf_open_llm_v1_240829_frozen.csv airoboros_l2_70b_2_2_1,HFv1 MMLU,69.79,,hf_open_llm_v1_240829_frozen.csv airoboros_l2_70b_2_2_1,HFv1 TruthfulQA,59.49,,hf_open_llm_v1_240829_frozen.csv airoboros_l2_70b_2_2_1,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv airoboros_l2_7b_2_2_1,HF OpenLLM v1,51.22,,hf_open_llm_v1_240829_frozen.csv airoboros_l2_7b_2_2_1,HFv1 ARC,55.03,,hf_open_llm_v1_240829_frozen.csv airoboros_l2_7b_2_2_1,HFv1 GSM8K,6.14,,hf_open_llm_v1_240829_frozen.csv airoboros_l2_7b_2_2_1,HFv1 HellaSwag,80.06,,hf_open_llm_v1_240829_frozen.csv airoboros_l2_7b_2_2_1,HFv1 MMLU,47.64,,hf_open_llm_v1_240829_frozen.csv airoboros_l2_7b_2_2_1,HFv1 TruthfulQA,44.65,,hf_open_llm_v1_240829_frozen.csv airoboros_l2_7b_2_2_1,HFv1 Winogrande,73.8,,hf_open_llm_v1_240829_frozen.csv airocoder_34b_2_1,HF OpenLLM v1,49.61,,hf_open_llm_v1_240829_frozen.csv airocoder_34b_2_1,HFv1 ARC,54.18,,hf_open_llm_v1_240829_frozen.csv airocoder_34b_2_1,HFv1 GSM8K,8.34,,hf_open_llm_v1_240829_frozen.csv airocoder_34b_2_1,HFv1 HellaSwag,73.84,,hf_open_llm_v1_240829_frozen.csv airocoder_34b_2_1,HFv1 MMLU,50.67,,hf_open_llm_v1_240829_frozen.csv airocoder_34b_2_1,HFv1 TruthfulQA,40.7,,hf_open_llm_v1_240829_frozen.csv airocoder_34b_2_1,HFv1 Winogrande,69.93,,hf_open_llm_v1_240829_frozen.csv aisquare_instruct_llama2_koen_13b_v0_9_24,HF OpenLLM v1,56.98,,hf_open_llm_v1_240829_frozen.csv aisquare_instruct_llama2_koen_13b_v0_9_24,HFv1 ARC,55.63,,hf_open_llm_v1_240829_frozen.csv aisquare_instruct_llama2_koen_13b_v0_9_24,HFv1 GSM8K,23.2,,hf_open_llm_v1_240829_frozen.csv aisquare_instruct_llama2_koen_13b_v0_9_24,HFv1 HellaSwag,81.35,,hf_open_llm_v1_240829_frozen.csv aisquare_instruct_llama2_koen_13b_v0_9_24,HFv1 MMLU,51.76,,hf_open_llm_v1_240829_frozen.csv aisquare_instruct_llama2_koen_13b_v0_9_24,HFv1 TruthfulQA,53.0,,hf_open_llm_v1_240829_frozen.csv aisquare_instruct_llama2_koen_13b_v0_9_24,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv aisquare_instruct_solar_10_7b_v0_5_31,HF OpenLLM v1,61.05,,hf_open_llm_v1_240829_frozen.csv aisquare_instruct_solar_10_7b_v0_5_31,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv aisquare_instruct_solar_10_7b_v0_5_31,HFv1 GSM8K,34.27,,hf_open_llm_v1_240829_frozen.csv aisquare_instruct_solar_10_7b_v0_5_31,HFv1 HellaSwag,84.2,,hf_open_llm_v1_240829_frozen.csv aisquare_instruct_solar_10_7b_v0_5_31,HFv1 MMLU,52.86,,hf_open_llm_v1_240829_frozen.csv aisquare_instruct_solar_10_7b_v0_5_31,HFv1 TruthfulQA,51.35,,hf_open_llm_v1_240829_frozen.csv aisquare_instruct_solar_10_7b_v0_5_31,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv aisquare_instruct_solar_10_7b_v0_5_32,HF OpenLLM v1,59.79,,hf_open_llm_v1_240829_frozen.csv aisquare_instruct_solar_10_7b_v0_5_32,HFv1 ARC,61.86,,hf_open_llm_v1_240829_frozen.csv 
aisquare_instruct_solar_10_7b_v0_5_32,HFv1 GSM8K,15.09,,hf_open_llm_v1_240829_frozen.csv aisquare_instruct_solar_10_7b_v0_5_32,HFv1 HellaSwag,84.66,,hf_open_llm_v1_240829_frozen.csv aisquare_instruct_solar_10_7b_v0_5_32,HFv1 MMLU,63.13,,hf_open_llm_v1_240829_frozen.csv aisquare_instruct_solar_10_7b_v0_5_32,HFv1 TruthfulQA,51.19,,hf_open_llm_v1_240829_frozen.csv aisquare_instruct_solar_10_7b_v0_5_32,HFv1 Winogrande,82.79,,hf_open_llm_v1_240829_frozen.csv algae_550m_base,HF OpenLLM v1,28.97,,hf_open_llm_v1_240829_frozen.csv algae_550m_base,HFv1 ARC,22.53,,hf_open_llm_v1_240829_frozen.csv algae_550m_base,HFv1 GSM8K,0.76,,hf_open_llm_v1_240829_frozen.csv algae_550m_base,HFv1 HellaSwag,28.32,,hf_open_llm_v1_240829_frozen.csv algae_550m_base,HFv1 MMLU,25.83,,hf_open_llm_v1_240829_frozen.csv algae_550m_base,HFv1 TruthfulQA,45.54,,hf_open_llm_v1_240829_frozen.csv algae_550m_base,HFv1 Winogrande,50.83,,hf_open_llm_v1_240829_frozen.csv alma_13b_r,HF OpenLLM v1,49.32,,hf_open_llm_v1_240829_frozen.csv alma_13b_r,HFv1 ARC,55.55,,hf_open_llm_v1_240829_frozen.csv alma_13b_r,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv alma_13b_r,HFv1 HellaSwag,79.45,,hf_open_llm_v1_240829_frozen.csv alma_13b_r,HFv1 MMLU,49.52,,hf_open_llm_v1_240829_frozen.csv alma_13b_r,HFv1 TruthfulQA,36.09,,hf_open_llm_v1_240829_frozen.csv alma_13b_r,HFv1 Winogrande,75.3,,hf_open_llm_v1_240829_frozen.csv alooowso,HF OpenLLM v1,65.63,,hf_open_llm_v1_240829_frozen.csv alooowso,HFv1 ARC,62.97,,hf_open_llm_v1_240829_frozen.csv alooowso,HFv1 GSM8K,39.58,,hf_open_llm_v1_240829_frozen.csv alooowso,HFv1 HellaSwag,84.87,,hf_open_llm_v1_240829_frozen.csv alooowso,HFv1 MMLU,60.78,,hf_open_llm_v1_240829_frozen.csv alooowso,HFv1 TruthfulQA,68.18,,hf_open_llm_v1_240829_frozen.csv alooowso,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv alpagasus_2_13b_qlora_merged,HF OpenLLM v1,54.2,,hf_open_llm_v1_240829_frozen.csv alpagasus_2_13b_qlora_merged,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv alpagasus_2_13b_qlora_merged,HFv1 GSM8K,10.84,,hf_open_llm_v1_240829_frozen.csv alpagasus_2_13b_qlora_merged,HFv1 HellaSwag,82.43,,hf_open_llm_v1_240829_frozen.csv alpagasus_2_13b_qlora_merged,HFv1 MMLU,55.55,,hf_open_llm_v1_240829_frozen.csv alpagasus_2_13b_qlora_merged,HFv1 TruthfulQA,38.65,,hf_open_llm_v1_240829_frozen.csv alpagasus_2_13b_qlora_merged,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv alphamonarch_7b,HF OpenLLM v1,75.99,,hf_open_llm_v1_240829_frozen.csv alphamonarch_7b,HFv1 ARC,73.04,,hf_open_llm_v1_240829_frozen.csv alphamonarch_7b,HFv1 GSM8K,66.72,,hf_open_llm_v1_240829_frozen.csv alphamonarch_7b,HFv1 HellaSwag,89.18,,hf_open_llm_v1_240829_frozen.csv alphamonarch_7b,HFv1 MMLU,64.4,,hf_open_llm_v1_240829_frozen.csv alphamonarch_7b,HFv1 TruthfulQA,77.91,,hf_open_llm_v1_240829_frozen.csv alphamonarch_7b,HFv1 Winogrande,84.69,,hf_open_llm_v1_240829_frozen.csv alphamonarch_daser,HF OpenLLM v1,75.94,,hf_open_llm_v1_240829_frozen.csv alphamonarch_daser,HFv1 ARC,73.04,,hf_open_llm_v1_240829_frozen.csv alphamonarch_daser,HFv1 GSM8K,66.26,,hf_open_llm_v1_240829_frozen.csv alphamonarch_daser,HFv1 HellaSwag,89.23,,hf_open_llm_v1_240829_frozen.csv alphamonarch_daser,HFv1 MMLU,64.43,,hf_open_llm_v1_240829_frozen.csv alphamonarch_daser,HFv1 TruthfulQA,78.01,,hf_open_llm_v1_240829_frozen.csv alphamonarch_daser,HFv1 Winogrande,84.69,,hf_open_llm_v1_240829_frozen.csv alphamonarch_dora,HF OpenLLM v1,75.86,,hf_open_llm_v1_240829_frozen.csv alphamonarch_dora,HFv1 ARC,73.21,,hf_open_llm_v1_240829_frozen.csv alphamonarch_dora,HFv1 
GSM8K,65.73,,hf_open_llm_v1_240829_frozen.csv alphamonarch_dora,HFv1 HellaSwag,89.26,,hf_open_llm_v1_240829_frozen.csv alphamonarch_dora,HFv1 MMLU,64.47,,hf_open_llm_v1_240829_frozen.csv alphamonarch_dora,HFv1 TruthfulQA,78.02,,hf_open_llm_v1_240829_frozen.csv alphamonarch_dora,HFv1 Winogrande,84.45,,hf_open_llm_v1_240829_frozen.csv alphamonarch_laser,HF OpenLLM v1,76.0,,hf_open_llm_v1_240829_frozen.csv alphamonarch_laser,HFv1 ARC,73.12,,hf_open_llm_v1_240829_frozen.csv alphamonarch_laser,HFv1 GSM8K,66.72,,hf_open_llm_v1_240829_frozen.csv alphamonarch_laser,HFv1 HellaSwag,89.21,,hf_open_llm_v1_240829_frozen.csv alphamonarch_laser,HFv1 MMLU,64.43,,hf_open_llm_v1_240829_frozen.csv alphamonarch_laser,HFv1 TruthfulQA,77.9,,hf_open_llm_v1_240829_frozen.csv alphamonarch_laser,HFv1 Winogrande,84.61,,hf_open_llm_v1_240829_frozen.csv amber,HF OpenLLM v1,40.97,,hf_open_llm_v1_240829_frozen.csv amber,HFv1 ARC,40.96,,hf_open_llm_v1_240829_frozen.csv amber,HFv1 GSM8K,2.81,,hf_open_llm_v1_240829_frozen.csv amber,HFv1 HellaSwag,73.79,,hf_open_llm_v1_240829_frozen.csv amber,HFv1 MMLU,26.84,,hf_open_llm_v1_240829_frozen.csv amber,HFv1 TruthfulQA,33.56,,hf_open_llm_v1_240829_frozen.csv amber,HFv1 Winogrande,67.88,,hf_open_llm_v1_240829_frozen.csv anfeng_v3_avocet,HF OpenLLM v1,73.96,,hf_open_llm_v1_240829_frozen.csv anfeng_v3_avocet,HFv1 ARC,67.75,,hf_open_llm_v1_240829_frozen.csv anfeng_v3_avocet,HFv1 GSM8K,74.0,,hf_open_llm_v1_240829_frozen.csv anfeng_v3_avocet,HFv1 HellaSwag,87.43,,hf_open_llm_v1_240829_frozen.csv anfeng_v3_avocet,HFv1 MMLU,74.79,,hf_open_llm_v1_240829_frozen.csv anfeng_v3_avocet,HFv1 TruthfulQA,58.63,,hf_open_llm_v1_240829_frozen.csv anfeng_v3_avocet,HFv1 Winogrande,81.14,,hf_open_llm_v1_240829_frozen.csv apollo_7b_orpo_experimental,HF OpenLLM v1,71.81,,hf_open_llm_v1_240829_frozen.csv apollo_7b_orpo_experimental,HFv1 ARC,64.85,,hf_open_llm_v1_240829_frozen.csv apollo_7b_orpo_experimental,HFv1 GSM8K,68.99,,hf_open_llm_v1_240829_frozen.csv apollo_7b_orpo_experimental,HFv1 HellaSwag,85.5,,hf_open_llm_v1_240829_frozen.csv apollo_7b_orpo_experimental,HFv1 MMLU,63.93,,hf_open_llm_v1_240829_frozen.csv apollo_7b_orpo_experimental,HFv1 TruthfulQA,63.52,,hf_open_llm_v1_240829_frozen.csv apollo_7b_orpo_experimental,HFv1 Winogrande,84.06,,hf_open_llm_v1_240829_frozen.csv aquila2_34b,HF OpenLLM v1,54.5,,hf_open_llm_v1_240829_frozen.csv aquila2_34b,HFv1 ARC,52.65,,hf_open_llm_v1_240829_frozen.csv aquila2_34b,HFv1 GSM8K,0.45,,hf_open_llm_v1_240829_frozen.csv aquila2_34b,HFv1 HellaSwag,81.99,,hf_open_llm_v1_240829_frozen.csv aquila2_34b,HFv1 MMLU,76.02,,hf_open_llm_v1_240829_frozen.csv aquila2_34b,HFv1 TruthfulQA,40.8,,hf_open_llm_v1_240829_frozen.csv aquila2_34b,HFv1 Winogrande,75.06,,hf_open_llm_v1_240829_frozen.csv arc1,HF OpenLLM v1,66.69,,hf_open_llm_v1_240829_frozen.csv arc1,HFv1 ARC,58.79,,hf_open_llm_v1_240829_frozen.csv arc1,HFv1 GSM8K,69.83,,hf_open_llm_v1_240829_frozen.csv arc1,HFv1 HellaSwag,76.41,,hf_open_llm_v1_240829_frozen.csv arc1,HFv1 MMLU,65.73,,hf_open_llm_v1_240829_frozen.csv arc1,HFv1 TruthfulQA,52.73,,hf_open_llm_v1_240829_frozen.csv arc1,HFv1 Winogrande,76.64,,hf_open_llm_v1_240829_frozen.csv archangel_sft_kto_llama13b,HF OpenLLM v1,52.87,,hf_open_llm_v1_240829_frozen.csv archangel_sft_kto_llama13b,HFv1 ARC,56.14,,hf_open_llm_v1_240829_frozen.csv archangel_sft_kto_llama13b,HFv1 GSM8K,16.83,,hf_open_llm_v1_240829_frozen.csv archangel_sft_kto_llama13b,HFv1 HellaSwag,80.8,,hf_open_llm_v1_240829_frozen.csv archangel_sft_kto_llama13b,HFv1 MMLU,47.84,,hf_open_llm_v1_240829_frozen.csv 
archangel_sft_kto_llama13b,HFv1 TruthfulQA,39.42,,hf_open_llm_v1_240829_frozen.csv archangel_sft_kto_llama13b,HFv1 Winogrande,76.16,,hf_open_llm_v1_240829_frozen.csv asclepius_llama2_13b,HF OpenLLM v1,50.25,,hf_open_llm_v1_240829_frozen.csv asclepius_llama2_13b,HFv1 ARC,55.89,,hf_open_llm_v1_240829_frozen.csv asclepius_llama2_13b,HFv1 GSM8K,0.15,,hf_open_llm_v1_240829_frozen.csv asclepius_llama2_13b,HFv1 HellaSwag,79.66,,hf_open_llm_v1_240829_frozen.csv asclepius_llama2_13b,HFv1 MMLU,52.38,,hf_open_llm_v1_240829_frozen.csv asclepius_llama2_13b,HFv1 TruthfulQA,40.76,,hf_open_llm_v1_240829_frozen.csv asclepius_llama2_13b,HFv1 Winogrande,72.69,,hf_open_llm_v1_240829_frozen.csv asclepius_llama2_7b,HF OpenLLM v1,47.15,,hf_open_llm_v1_240829_frozen.csv asclepius_llama2_7b,HFv1 ARC,50.85,,hf_open_llm_v1_240829_frozen.csv asclepius_llama2_7b,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv asclepius_llama2_7b,HFv1 HellaSwag,76.53,,hf_open_llm_v1_240829_frozen.csv asclepius_llama2_7b,HFv1 MMLU,43.61,,hf_open_llm_v1_240829_frozen.csv asclepius_llama2_7b,HFv1 TruthfulQA,43.31,,hf_open_llm_v1_240829_frozen.csv asclepius_llama2_7b,HFv1 Winogrande,68.27,,hf_open_llm_v1_240829_frozen.csv athena_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,53.16,,hf_open_llm_v1_240829_frozen.csv athena_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,56.66,,hf_open_llm_v1_240829_frozen.csv athena_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv athena_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,80.56,,hf_open_llm_v1_240829_frozen.csv athena_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,55.43,,hf_open_llm_v1_240829_frozen.csv athena_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,53.62,,hf_open_llm_v1_240829_frozen.csv athena_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,72.61,,hf_open_llm_v1_240829_frozen.csv athena_zephyr_7b,HF OpenLLM v1,59.34,,hf_open_llm_v1_240829_frozen.csv athena_zephyr_7b,HFv1 ARC,56.14,,hf_open_llm_v1_240829_frozen.csv athena_zephyr_7b,HFv1 GSM8K,28.13,,hf_open_llm_v1_240829_frozen.csv athena_zephyr_7b,HFv1 HellaSwag,81.63,,hf_open_llm_v1_240829_frozen.csv athena_zephyr_7b,HFv1 MMLU,59.8,,hf_open_llm_v1_240829_frozen.csv athena_zephyr_7b,HFv1 TruthfulQA,54.2,,hf_open_llm_v1_240829_frozen.csv athena_zephyr_7b,HFv1 Winogrande,76.16,,hf_open_llm_v1_240829_frozen.csv autotrain_llama3_70b_orpo_v2,HF OpenLLM v1,78.17,,hf_open_llm_v1_240829_frozen.csv autotrain_llama3_70b_orpo_v2,HFv1 ARC,70.9,,hf_open_llm_v1_240829_frozen.csv autotrain_llama3_70b_orpo_v2,HFv1 GSM8K,84.23,,hf_open_llm_v1_240829_frozen.csv autotrain_llama3_70b_orpo_v2,HFv1 HellaSwag,86.09,,hf_open_llm_v1_240829_frozen.csv autotrain_llama3_70b_orpo_v2,HFv1 MMLU,80.07,,hf_open_llm_v1_240829_frozen.csv autotrain_llama3_70b_orpo_v2,HFv1 TruthfulQA,62.82,,hf_open_llm_v1_240829_frozen.csv autotrain_llama3_70b_orpo_v2,HFv1 Winogrande,84.93,,hf_open_llm_v1_240829_frozen.csv average_dolphin_8x7b,HF OpenLLM v1,69.64,,hf_open_llm_v1_240829_frozen.csv average_dolphin_8x7b,HFv1 ARC,68.6,,hf_open_llm_v1_240829_frozen.csv average_dolphin_8x7b,HFv1 GSM8K,56.56,,hf_open_llm_v1_240829_frozen.csv average_dolphin_8x7b,HFv1 HellaSwag,85.99,,hf_open_llm_v1_240829_frozen.csv average_dolphin_8x7b,HFv1 MMLU,70.84,,hf_open_llm_v1_240829_frozen.csv average_dolphin_8x7b,HFv1 TruthfulQA,54.51,,hf_open_llm_v1_240829_frozen.csv average_dolphin_8x7b,HFv1 Winogrande,81.37,,hf_open_llm_v1_240829_frozen.csv awanllm_llama3_8b_dolfin_v0_3_dpo,HF OpenLLM v1,53.96,,hf_open_llm_v1_240829_frozen.csv awanllm_llama3_8b_dolfin_v0_3_dpo,HFv1 
ARC,56.48,,hf_open_llm_v1_240829_frozen.csv awanllm_llama3_8b_dolfin_v0_3_dpo,HFv1 GSM8K,15.47,,hf_open_llm_v1_240829_frozen.csv awanllm_llama3_8b_dolfin_v0_3_dpo,HFv1 HellaSwag,75.43,,hf_open_llm_v1_240829_frozen.csv awanllm_llama3_8b_dolfin_v0_3_dpo,HFv1 MMLU,49.05,,hf_open_llm_v1_240829_frozen.csv awanllm_llama3_8b_dolfin_v0_3_dpo,HFv1 TruthfulQA,57.27,,hf_open_llm_v1_240829_frozen.csv awanllm_llama3_8b_dolfin_v0_3_dpo,HFv1 Winogrande,70.09,,hf_open_llm_v1_240829_frozen.csv awanllm_llama3_8b_instruct_dpo_v0_2,HF OpenLLM v1,58.12,,hf_open_llm_v1_240829_frozen.csv awanllm_llama3_8b_instruct_dpo_v0_2,HFv1 ARC,57.42,,hf_open_llm_v1_240829_frozen.csv awanllm_llama3_8b_instruct_dpo_v0_2,HFv1 GSM8K,25.93,,hf_open_llm_v1_240829_frozen.csv awanllm_llama3_8b_instruct_dpo_v0_2,HFv1 HellaSwag,77.21,,hf_open_llm_v1_240829_frozen.csv awanllm_llama3_8b_instruct_dpo_v0_2,HFv1 MMLU,52.31,,hf_open_llm_v1_240829_frozen.csv awanllm_llama3_8b_instruct_dpo_v0_2,HFv1 TruthfulQA,58.4,,hf_open_llm_v1_240829_frozen.csv awanllm_llama3_8b_instruct_dpo_v0_2,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv babyllama_v0_6,HF OpenLLM v1,36.92,,hf_open_llm_v1_240829_frozen.csv babyllama_v0_6,HFv1 ARC,36.09,,hf_open_llm_v1_240829_frozen.csv babyllama_v0_6,HFv1 GSM8K,1.59,,hf_open_llm_v1_240829_frozen.csv babyllama_v0_6,HFv1 HellaSwag,61.59,,hf_open_llm_v1_240829_frozen.csv babyllama_v0_6,HFv1 MMLU,25.37,,hf_open_llm_v1_240829_frozen.csv babyllama_v0_6,HFv1 TruthfulQA,35.84,,hf_open_llm_v1_240829_frozen.csv babyllama_v0_6,HFv1 Winogrande,61.01,,hf_open_llm_v1_240829_frozen.csv bagel_7b_v0_4,HF OpenLLM v1,64.82,,hf_open_llm_v1_240829_frozen.csv bagel_7b_v0_4,HFv1 ARC,63.57,,hf_open_llm_v1_240829_frozen.csv bagel_7b_v0_4,HFv1 GSM8K,47.31,,hf_open_llm_v1_240829_frozen.csv bagel_7b_v0_4,HFv1 HellaSwag,82.67,,hf_open_llm_v1_240829_frozen.csv bagel_7b_v0_4,HFv1 MMLU,62.25,,hf_open_llm_v1_240829_frozen.csv bagel_7b_v0_4,HFv1 TruthfulQA,54.2,,hf_open_llm_v1_240829_frozen.csv bagel_7b_v0_4,HFv1 Winogrande,78.93,,hf_open_llm_v1_240829_frozen.csv bagel_dpo_34b_v0_2,HF OpenLLM v1,74.69,,hf_open_llm_v1_240829_frozen.csv bagel_dpo_34b_v0_2,HFv1 ARC,71.93,,hf_open_llm_v1_240829_frozen.csv bagel_dpo_34b_v0_2,HFv1 GSM8K,60.96,,hf_open_llm_v1_240829_frozen.csv bagel_dpo_34b_v0_2,HFv1 HellaSwag,85.25,,hf_open_llm_v1_240829_frozen.csv bagel_dpo_34b_v0_2,HFv1 MMLU,76.58,,hf_open_llm_v1_240829_frozen.csv bagel_dpo_34b_v0_2,HFv1 TruthfulQA,70.05,,hf_open_llm_v1_240829_frozen.csv bagel_dpo_34b_v0_2,HFv1 Winogrande,83.35,,hf_open_llm_v1_240829_frozen.csv bagel_dpo_7b_v0_4,HF OpenLLM v1,67.13,,hf_open_llm_v1_240829_frozen.csv bagel_dpo_7b_v0_4,HFv1 ARC,67.58,,hf_open_llm_v1_240829_frozen.csv bagel_dpo_7b_v0_4,HFv1 GSM8K,46.85,,hf_open_llm_v1_240829_frozen.csv bagel_dpo_7b_v0_4,HFv1 HellaSwag,84.3,,hf_open_llm_v1_240829_frozen.csv bagel_dpo_7b_v0_4,HFv1 MMLU,61.95,,hf_open_llm_v1_240829_frozen.csv bagel_dpo_7b_v0_4,HFv1 TruthfulQA,63.94,,hf_open_llm_v1_240829_frozen.csv bagel_dpo_7b_v0_4,HFv1 Winogrande,78.14,,hf_open_llm_v1_240829_frozen.csv bagel_dpo_7b_v0_5,HF OpenLLM v1,68.84,,hf_open_llm_v1_240829_frozen.csv bagel_dpo_7b_v0_5,HFv1 ARC,66.3,,hf_open_llm_v1_240829_frozen.csv bagel_dpo_7b_v0_5,HFv1 GSM8K,53.37,,hf_open_llm_v1_240829_frozen.csv bagel_dpo_7b_v0_5,HFv1 HellaSwag,84.22,,hf_open_llm_v1_240829_frozen.csv bagel_dpo_7b_v0_5,HFv1 MMLU,65.27,,hf_open_llm_v1_240829_frozen.csv bagel_dpo_7b_v0_5,HFv1 TruthfulQA,62.41,,hf_open_llm_v1_240829_frozen.csv bagel_dpo_7b_v0_5,HFv1 Winogrande,81.45,,hf_open_llm_v1_240829_frozen.csv 
bagellake_7b_slerp,HF OpenLLM v1,70.41,,hf_open_llm_v1_240829_frozen.csv bagellake_7b_slerp,HFv1 ARC,68.26,,hf_open_llm_v1_240829_frozen.csv bagellake_7b_slerp,HFv1 GSM8K,57.39,,hf_open_llm_v1_240829_frozen.csv bagellake_7b_slerp,HFv1 HellaSwag,85.07,,hf_open_llm_v1_240829_frozen.csv bagellake_7b_slerp,HFv1 MMLU,64.3,,hf_open_llm_v1_240829_frozen.csv bagellake_7b_slerp,HFv1 TruthfulQA,63.76,,hf_open_llm_v1_240829_frozen.csv bagellake_7b_slerp,HFv1 Winogrande,83.66,,hf_open_llm_v1_240829_frozen.csv bageltoppylake_7b_slerp,HF OpenLLM v1,69.22,,hf_open_llm_v1_240829_frozen.csv bageltoppylake_7b_slerp,HFv1 ARC,67.15,,hf_open_llm_v1_240829_frozen.csv bageltoppylake_7b_slerp,HFv1 GSM8K,55.04,,hf_open_llm_v1_240829_frozen.csv bageltoppylake_7b_slerp,HFv1 HellaSwag,84.79,,hf_open_llm_v1_240829_frozen.csv bageltoppylake_7b_slerp,HFv1 MMLU,64.31,,hf_open_llm_v1_240829_frozen.csv bageltoppylake_7b_slerp,HFv1 TruthfulQA,62.15,,hf_open_llm_v1_240829_frozen.csv bageltoppylake_7b_slerp,HFv1 Winogrande,81.85,,hf_open_llm_v1_240829_frozen.csv bageluccine_2_7b_slerp,HF OpenLLM v1,67.05,,hf_open_llm_v1_240829_frozen.csv bageluccine_2_7b_slerp,HFv1 ARC,66.38,,hf_open_llm_v1_240829_frozen.csv bageluccine_2_7b_slerp,HFv1 GSM8K,45.72,,hf_open_llm_v1_240829_frozen.csv bageluccine_2_7b_slerp,HFv1 HellaSwag,85.51,,hf_open_llm_v1_240829_frozen.csv bageluccine_2_7b_slerp,HFv1 MMLU,62.23,,hf_open_llm_v1_240829_frozen.csv bageluccine_2_7b_slerp,HFv1 TruthfulQA,65.57,,hf_open_llm_v1_240829_frozen.csv bageluccine_2_7b_slerp,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv bageluccine_7b_slerp,HF OpenLLM v1,65.97,,hf_open_llm_v1_240829_frozen.csv bageluccine_7b_slerp,HFv1 ARC,65.1,,hf_open_llm_v1_240829_frozen.csv bageluccine_7b_slerp,HFv1 GSM8K,46.25,,hf_open_llm_v1_240829_frozen.csv bageluccine_7b_slerp,HFv1 HellaSwag,85.06,,hf_open_llm_v1_240829_frozen.csv bageluccine_7b_slerp,HFv1 MMLU,61.75,,hf_open_llm_v1_240829_frozen.csv bageluccine_7b_slerp,HFv1 TruthfulQA,60.33,,hf_open_llm_v1_240829_frozen.csv bageluccine_7b_slerp,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv baize_healthcare_lora_7b,HF OpenLLM v1,47.62,,hf_open_llm_v1_240829_frozen.csv baize_healthcare_lora_7b,HFv1 ARC,54.1,,hf_open_llm_v1_240829_frozen.csv baize_healthcare_lora_7b,HFv1 GSM8K,4.4,,hf_open_llm_v1_240829_frozen.csv baize_healthcare_lora_7b,HFv1 HellaSwag,77.32,,hf_open_llm_v1_240829_frozen.csv baize_healthcare_lora_7b,HFv1 MMLU,37.09,,hf_open_llm_v1_240829_frozen.csv baize_healthcare_lora_7b,HFv1 TruthfulQA,39.96,,hf_open_llm_v1_240829_frozen.csv baize_healthcare_lora_7b,HFv1 Winogrande,72.85,,hf_open_llm_v1_240829_frozen.csv beyonder_4x7b_random_lora,HF OpenLLM v1,73.91,,hf_open_llm_v1_240829_frozen.csv beyonder_4x7b_random_lora,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv beyonder_4x7b_random_lora,HFv1 GSM8K,67.4,,hf_open_llm_v1_240829_frozen.csv beyonder_4x7b_random_lora,HFv1 HellaSwag,87.4,,hf_open_llm_v1_240829_frozen.csv beyonder_4x7b_random_lora,HFv1 MMLU,64.78,,hf_open_llm_v1_240829_frozen.csv beyonder_4x7b_random_lora,HFv1 TruthfulQA,70.49,,hf_open_llm_v1_240829_frozen.csv beyonder_4x7b_random_lora,HFv1 Winogrande,82.16,,hf_open_llm_v1_240829_frozen.csv bggpt_7b_instruct_v0_1,HF OpenLLM v1,64.82,,hf_open_llm_v1_240829_frozen.csv bggpt_7b_instruct_v0_1,HFv1 ARC,60.24,,hf_open_llm_v1_240829_frozen.csv bggpt_7b_instruct_v0_1,HFv1 GSM8K,56.71,,hf_open_llm_v1_240829_frozen.csv bggpt_7b_instruct_v0_1,HFv1 HellaSwag,81.6,,hf_open_llm_v1_240829_frozen.csv bggpt_7b_instruct_v0_1,HFv1 
MMLU,59.66,,hf_open_llm_v1_240829_frozen.csv bggpt_7b_instruct_v0_1,HFv1 TruthfulQA,53.68,,hf_open_llm_v1_240829_frozen.csv bggpt_7b_instruct_v0_1,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv bggpt_7b_instruct_v0_2,HF OpenLLM v1,63.08,,hf_open_llm_v1_240829_frozen.csv bggpt_7b_instruct_v0_2,HFv1 ARC,60.58,,hf_open_llm_v1_240829_frozen.csv bggpt_7b_instruct_v0_2,HFv1 GSM8K,44.12,,hf_open_llm_v1_240829_frozen.csv bggpt_7b_instruct_v0_2,HFv1 HellaSwag,82.18,,hf_open_llm_v1_240829_frozen.csv bggpt_7b_instruct_v0_2,HFv1 MMLU,60.5,,hf_open_llm_v1_240829_frozen.csv bggpt_7b_instruct_v0_2,HFv1 TruthfulQA,54.63,,hf_open_llm_v1_240829_frozen.csv bggpt_7b_instruct_v0_2,HFv1 Winogrande,76.48,,hf_open_llm_v1_240829_frozen.csv bielik_7b_instruct_v0_1,HF OpenLLM v1,51.26,,hf_open_llm_v1_240829_frozen.csv bielik_7b_instruct_v0_1,HFv1 ARC,47.53,,hf_open_llm_v1_240829_frozen.csv bielik_7b_instruct_v0_1,HFv1 GSM8K,30.25,,hf_open_llm_v1_240829_frozen.csv bielik_7b_instruct_v0_1,HFv1 HellaSwag,68.91,,hf_open_llm_v1_240829_frozen.csv bielik_7b_instruct_v0_1,HFv1 MMLU,49.47,,hf_open_llm_v1_240829_frozen.csv bielik_7b_instruct_v0_1,HFv1 TruthfulQA,46.18,,hf_open_llm_v1_240829_frozen.csv bielik_7b_instruct_v0_1,HFv1 Winogrande,65.59,,hf_open_llm_v1_240829_frozen.csv bigstral_12b_32k,HF OpenLLM v1,62.17,,hf_open_llm_v1_240829_frozen.csv bigstral_12b_32k,HFv1 ARC,59.98,,hf_open_llm_v1_240829_frozen.csv bigstral_12b_32k,HFv1 GSM8K,26.91,,hf_open_llm_v1_240829_frozen.csv bigstral_12b_32k,HFv1 HellaSwag,84.1,,hf_open_llm_v1_240829_frozen.csv bigstral_12b_32k,HFv1 MMLU,59.14,,hf_open_llm_v1_240829_frozen.csv bigstral_12b_32k,HFv1 TruthfulQA,68.21,,hf_open_llm_v1_240829_frozen.csv bigstral_12b_32k,HFv1 Winogrande,74.66,,hf_open_llm_v1_240829_frozen.csv bigyi_15b,HF OpenLLM v1,54.29,,hf_open_llm_v1_240829_frozen.csv bigyi_15b,HFv1 ARC,56.06,,hf_open_llm_v1_240829_frozen.csv bigyi_15b,HFv1 GSM8K,21.61,,hf_open_llm_v1_240829_frozen.csv bigyi_15b,HFv1 HellaSwag,75.9,,hf_open_llm_v1_240829_frozen.csv bigyi_15b,HFv1 MMLU,64.6,,hf_open_llm_v1_240829_frozen.csv bigyi_15b,HFv1 TruthfulQA,37.33,,hf_open_llm_v1_240829_frozen.csv bigyi_15b,HFv1 Winogrande,70.24,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b,HF OpenLLM v1,32.14,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b,HFv1 ARC,29.18,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b,HFv1 HellaSwag,43.73,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b,HFv1 MMLU,23.1,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b,HFv1 TruthfulQA,45.0,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b,HFv1 Winogrande,51.85,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b_8k,HF OpenLLM v1,32.23,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b_8k,HFv1 ARC,28.58,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b_8k,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b_8k,HFv1 HellaSwag,43.94,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b_8k,HFv1 MMLU,25.38,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b_8k,HFv1 TruthfulQA,47.48,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b_8k,HFv1 Winogrande,47.99,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b_instruction_ppo,HF OpenLLM v1,32.5,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b_instruction_ppo,HFv1 ARC,28.24,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b_instruction_ppo,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv 
bilingual_gpt_neox_4b_instruction_ppo,HFv1 HellaSwag,47.9,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b_instruction_ppo,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b_instruction_ppo,HFv1 TruthfulQA,43.5,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b_instruction_ppo,HFv1 Winogrande,52.25,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b_instruction_sft,HF OpenLLM v1,32.46,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b_instruction_sft,HFv1 ARC,28.07,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b_instruction_sft,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b_instruction_sft,HFv1 HellaSwag,47.5,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b_instruction_sft,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b_instruction_sft,HFv1 TruthfulQA,43.76,,hf_open_llm_v1_240829_frozen.csv bilingual_gpt_neox_4b_instruction_sft,HFv1 Winogrande,52.33,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_a,HF OpenLLM v1,38.73,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_a,HFv1 ARC,38.14,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_a,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_a,HFv1 HellaSwag,66.56,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_a,HFv1 MMLU,25.75,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_a,HFv1 TruthfulQA,37.46,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_a,HFv1 Winogrande,63.93,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_b,HF OpenLLM v1,38.49,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_b,HFv1 ARC,37.63,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_b,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_b,HFv1 HellaSwag,66.72,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_b,HFv1 MMLU,25.68,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_b,HFv1 TruthfulQA,37.09,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_b,HFv1 Winogrande,63.77,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_c,HF OpenLLM v1,39.01,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_c,HFv1 ARC,38.74,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_c,HFv1 GSM8K,0.68,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_c,HFv1 HellaSwag,66.83,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_c,HFv1 MMLU,26.57,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_c,HFv1 TruthfulQA,36.54,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_c,HFv1 Winogrande,64.72,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_d,HF OpenLLM v1,38.57,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_d,HFv1 ARC,37.8,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_d,HFv1 GSM8K,0.38,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_d,HFv1 HellaSwag,66.5,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_d,HFv1 MMLU,26.64,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_d,HFv1 TruthfulQA,36.46,,hf_open_llm_v1_240829_frozen.csv black_goo_recipe_d,HFv1 Winogrande,63.61,,hf_open_llm_v1_240829_frozen.csv bleagle_7b_v0_1_test,HF OpenLLM v1,73.89,,hf_open_llm_v1_240829_frozen.csv bleagle_7b_v0_1_test,HFv1 ARC,72.27,,hf_open_llm_v1_240829_frozen.csv bleagle_7b_v0_1_test,HFv1 GSM8K,65.13,,hf_open_llm_v1_240829_frozen.csv bleagle_7b_v0_1_test,HFv1 HellaSwag,88.24,,hf_open_llm_v1_240829_frozen.csv bleagle_7b_v0_1_test,HFv1 MMLU,64.37,,hf_open_llm_v1_240829_frozen.csv bleagle_7b_v0_1_test,HFv1 TruthfulQA,67.83,,hf_open_llm_v1_240829_frozen.csv bleagle_7b_v0_1_test,HFv1 Winogrande,85.48,,hf_open_llm_v1_240829_frozen.csv bloom,HF OpenLLM 
v1,46.07,,hf_open_llm_v1_240829_frozen.csv bloom,HFv1 ARC,50.43,,hf_open_llm_v1_240829_frozen.csv bloom,HFv1 GSM8K,6.9,,hf_open_llm_v1_240829_frozen.csv bloom,HFv1 HellaSwag,76.41,,hf_open_llm_v1_240829_frozen.csv bloom,HFv1 MMLU,30.85,,hf_open_llm_v1_240829_frozen.csv bloom,HFv1 TruthfulQA,39.76,,hf_open_llm_v1_240829_frozen.csv bloom,HFv1 Winogrande,72.06,,hf_open_llm_v1_240829_frozen.csv bloom_1b1_rlhf,HF OpenLLM v1,30.14,,hf_open_llm_v1_240829_frozen.csv bloom_1b1_rlhf,HFv1 ARC,27.99,,hf_open_llm_v1_240829_frozen.csv bloom_1b1_rlhf,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv bloom_1b1_rlhf,HFv1 HellaSwag,26.19,,hf_open_llm_v1_240829_frozen.csv bloom_1b1_rlhf,HFv1 MMLU,26.86,,hf_open_llm_v1_240829_frozen.csv bloom_1b1_rlhf,HFv1 TruthfulQA,48.88,,hf_open_llm_v1_240829_frozen.csv bloom_1b1_rlhf,HFv1 Winogrande,50.91,,hf_open_llm_v1_240829_frozen.csv bloom_560m_rlhf,HF OpenLLM v1,29.86,,hf_open_llm_v1_240829_frozen.csv bloom_560m_rlhf,HFv1 ARC,24.4,,hf_open_llm_v1_240829_frozen.csv bloom_560m_rlhf,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv bloom_560m_rlhf,HFv1 HellaSwag,36.96,,hf_open_llm_v1_240829_frozen.csv bloom_560m_rlhf,HFv1 MMLU,23.63,,hf_open_llm_v1_240829_frozen.csv bloom_560m_rlhf,HFv1 TruthfulQA,40.76,,hf_open_llm_v1_240829_frozen.csv bloom_560m_rlhf,HFv1 Winogrande,53.12,,hf_open_llm_v1_240829_frozen.csv bloom_560m_rlhf_v2,HF OpenLLM v1,30.43,,hf_open_llm_v1_240829_frozen.csv bloom_560m_rlhf_v2,HFv1 ARC,26.45,,hf_open_llm_v1_240829_frozen.csv bloom_560m_rlhf_v2,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv bloom_560m_rlhf_v2,HFv1 HellaSwag,37.67,,hf_open_llm_v1_240829_frozen.csv bloom_560m_rlhf_v2,HFv1 MMLU,23.95,,hf_open_llm_v1_240829_frozen.csv bloom_560m_rlhf_v2,HFv1 TruthfulQA,43.51,,hf_open_llm_v1_240829_frozen.csv bloom_560m_rlhf_v2,HFv1 Winogrande,50.91,,hf_open_llm_v1_240829_frozen.csv bloom_7b1,HF OpenLLM v1,39.18,,hf_open_llm_v1_240829_frozen.csv bloom_7b1,HFv1 ARC,41.13,,hf_open_llm_v1_240829_frozen.csv bloom_7b1,HFv1 GSM8K,1.36,,hf_open_llm_v1_240829_frozen.csv bloom_7b1,HFv1 HellaSwag,62.0,,hf_open_llm_v1_240829_frozen.csv bloom_7b1,HFv1 MMLU,26.25,,hf_open_llm_v1_240829_frozen.csv bloom_7b1,HFv1 TruthfulQA,38.9,,hf_open_llm_v1_240829_frozen.csv bloom_7b1,HFv1 Winogrande,65.43,,hf_open_llm_v1_240829_frozen.csv blossom_v2_llama2_7b,HF OpenLLM v1,51.71,,hf_open_llm_v1_240829_frozen.csv blossom_v2_llama2_7b,HFv1 ARC,54.1,,hf_open_llm_v1_240829_frozen.csv blossom_v2_llama2_7b,HFv1 GSM8K,4.78,,hf_open_llm_v1_240829_frozen.csv blossom_v2_llama2_7b,HFv1 HellaSwag,78.57,,hf_open_llm_v1_240829_frozen.csv blossom_v2_llama2_7b,HFv1 MMLU,51.66,,hf_open_llm_v1_240829_frozen.csv blossom_v2_llama2_7b,HFv1 TruthfulQA,46.84,,hf_open_llm_v1_240829_frozen.csv blossom_v2_llama2_7b,HFv1 Winogrande,74.35,,hf_open_llm_v1_240829_frozen.csv blossom_v4_qwen1_5_14b,HF OpenLLM v1,66.74,,hf_open_llm_v1_240829_frozen.csv blossom_v4_qwen1_5_14b,HFv1 ARC,57.34,,hf_open_llm_v1_240829_frozen.csv blossom_v4_qwen1_5_14b,HFv1 GSM8K,66.49,,hf_open_llm_v1_240829_frozen.csv blossom_v4_qwen1_5_14b,HFv1 HellaSwag,79.84,,hf_open_llm_v1_240829_frozen.csv blossom_v4_qwen1_5_14b,HFv1 MMLU,67.92,,hf_open_llm_v1_240829_frozen.csv blossom_v4_qwen1_5_14b,HFv1 TruthfulQA,55.21,,hf_open_llm_v1_240829_frozen.csv blossom_v4_qwen1_5_14b,HFv1 Winogrande,73.64,,hf_open_llm_v1_240829_frozen.csv blossom_v4_qwen1_5_4b,HF OpenLLM v1,56.34,,hf_open_llm_v1_240829_frozen.csv blossom_v4_qwen1_5_4b,HFv1 ARC,46.08,,hf_open_llm_v1_240829_frozen.csv blossom_v4_qwen1_5_4b,HFv1 
GSM8K,51.1,,hf_open_llm_v1_240829_frozen.csv blossom_v4_qwen1_5_4b,HFv1 HellaSwag,70.8,,hf_open_llm_v1_240829_frozen.csv blossom_v4_qwen1_5_4b,HFv1 MMLU,55.11,,hf_open_llm_v1_240829_frozen.csv blossom_v4_qwen1_5_4b,HFv1 TruthfulQA,47.29,,hf_open_llm_v1_240829_frozen.csv blossom_v4_qwen1_5_4b,HFv1 Winogrande,67.64,,hf_open_llm_v1_240829_frozen.csv blossom_v4_qwen1_5_7b,HF OpenLLM v1,62.11,,hf_open_llm_v1_240829_frozen.csv blossom_v4_qwen1_5_7b,HFv1 ARC,54.44,,hf_open_llm_v1_240829_frozen.csv blossom_v4_qwen1_5_7b,HFv1 GSM8K,56.71,,hf_open_llm_v1_240829_frozen.csv blossom_v4_qwen1_5_7b,HFv1 HellaSwag,76.11,,hf_open_llm_v1_240829_frozen.csv blossom_v4_qwen1_5_7b,HFv1 MMLU,60.43,,hf_open_llm_v1_240829_frozen.csv blossom_v4_qwen1_5_7b,HFv1 TruthfulQA,53.69,,hf_open_llm_v1_240829_frozen.csv blossom_v4_qwen1_5_7b,HFv1 Winogrande,71.27,,hf_open_llm_v1_240829_frozen.csv blossom_v4_yi_34b,HF OpenLLM v1,71.67,,hf_open_llm_v1_240829_frozen.csv blossom_v4_yi_34b,HFv1 ARC,66.81,,hf_open_llm_v1_240829_frozen.csv blossom_v4_yi_34b,HFv1 GSM8K,64.14,,hf_open_llm_v1_240829_frozen.csv blossom_v4_yi_34b,HFv1 HellaSwag,84.44,,hf_open_llm_v1_240829_frozen.csv blossom_v4_yi_34b,HFv1 MMLU,74.34,,hf_open_llm_v1_240829_frozen.csv blossom_v4_yi_34b,HFv1 TruthfulQA,57.89,,hf_open_llm_v1_240829_frozen.csv blossom_v4_yi_34b,HFv1 Winogrande,82.4,,hf_open_llm_v1_240829_frozen.csv blossom_v5_14b,HF OpenLLM v1,67.57,,hf_open_llm_v1_240829_frozen.csv blossom_v5_14b,HFv1 ARC,58.45,,hf_open_llm_v1_240829_frozen.csv blossom_v5_14b,HFv1 GSM8K,67.78,,hf_open_llm_v1_240829_frozen.csv blossom_v5_14b,HFv1 HellaSwag,80.72,,hf_open_llm_v1_240829_frozen.csv blossom_v5_14b,HFv1 MMLU,68.45,,hf_open_llm_v1_240829_frozen.csv blossom_v5_14b,HFv1 TruthfulQA,54.89,,hf_open_llm_v1_240829_frozen.csv blossom_v5_14b,HFv1 Winogrande,75.14,,hf_open_llm_v1_240829_frozen.csv blossom_v5_32b,HF OpenLLM v1,72.04,,hf_open_llm_v1_240829_frozen.csv blossom_v5_32b,HFv1 ARC,63.82,,hf_open_llm_v1_240829_frozen.csv blossom_v5_32b,HFv1 GSM8K,70.66,,hf_open_llm_v1_240829_frozen.csv blossom_v5_32b,HFv1 HellaSwag,83.54,,hf_open_llm_v1_240829_frozen.csv blossom_v5_32b,HFv1 MMLU,74.27,,hf_open_llm_v1_240829_frozen.csv blossom_v5_32b,HFv1 TruthfulQA,58.24,,hf_open_llm_v1_240829_frozen.csv blossom_v5_32b,HFv1 Winogrande,81.69,,hf_open_llm_v1_240829_frozen.csv blossom_v5_34b,HF OpenLLM v1,72.65,,hf_open_llm_v1_240829_frozen.csv blossom_v5_34b,HFv1 ARC,66.98,,hf_open_llm_v1_240829_frozen.csv blossom_v5_34b,HFv1 GSM8K,62.02,,hf_open_llm_v1_240829_frozen.csv blossom_v5_34b,HFv1 HellaSwag,84.79,,hf_open_llm_v1_240829_frozen.csv blossom_v5_34b,HFv1 MMLU,76.0,,hf_open_llm_v1_240829_frozen.csv blossom_v5_34b,HFv1 TruthfulQA,62.68,,hf_open_llm_v1_240829_frozen.csv blossom_v5_34b,HFv1 Winogrande,83.43,,hf_open_llm_v1_240829_frozen.csv blossom_v5_4b,HF OpenLLM v1,56.16,,hf_open_llm_v1_240829_frozen.csv blossom_v5_4b,HFv1 ARC,46.76,,hf_open_llm_v1_240829_frozen.csv blossom_v5_4b,HFv1 GSM8K,48.37,,hf_open_llm_v1_240829_frozen.csv blossom_v5_4b,HFv1 HellaSwag,71.87,,hf_open_llm_v1_240829_frozen.csv blossom_v5_4b,HFv1 MMLU,55.04,,hf_open_llm_v1_240829_frozen.csv blossom_v5_4b,HFv1 TruthfulQA,47.51,,hf_open_llm_v1_240829_frozen.csv blossom_v5_4b,HFv1 Winogrande,67.4,,hf_open_llm_v1_240829_frozen.csv blossom_v5_7b,HF OpenLLM v1,63.57,,hf_open_llm_v1_240829_frozen.csv blossom_v5_7b,HFv1 ARC,56.06,,hf_open_llm_v1_240829_frozen.csv blossom_v5_7b,HFv1 GSM8K,60.05,,hf_open_llm_v1_240829_frozen.csv blossom_v5_7b,HFv1 HellaSwag,77.36,,hf_open_llm_v1_240829_frozen.csv blossom_v5_7b,HFv1 
MMLU,61.29,,hf_open_llm_v1_240829_frozen.csv blossom_v5_7b,HFv1 TruthfulQA,54.29,,hf_open_llm_v1_240829_frozen.csv blossom_v5_7b,HFv1 Winogrande,72.38,,hf_open_llm_v1_240829_frozen.csv blossom_v5_9b,HF OpenLLM v1,64.69,,hf_open_llm_v1_240829_frozen.csv blossom_v5_9b,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv blossom_v5_9b,HFv1 GSM8K,48.37,,hf_open_llm_v1_240829_frozen.csv blossom_v5_9b,HFv1 HellaSwag,78.41,,hf_open_llm_v1_240829_frozen.csv blossom_v5_9b,HFv1 MMLU,69.81,,hf_open_llm_v1_240829_frozen.csv blossom_v5_9b,HFv1 TruthfulQA,52.78,,hf_open_llm_v1_240829_frozen.csv blossom_v5_9b,HFv1 Winogrande,76.32,,hf_open_llm_v1_240829_frozen.csv blossom_v5_llama3_8b,HF OpenLLM v1,64.21,,hf_open_llm_v1_240829_frozen.csv blossom_v5_llama3_8b,HFv1 ARC,56.83,,hf_open_llm_v1_240829_frozen.csv blossom_v5_llama3_8b,HFv1 GSM8K,44.43,,hf_open_llm_v1_240829_frozen.csv blossom_v5_llama3_8b,HFv1 HellaSwag,83.05,,hf_open_llm_v1_240829_frozen.csv blossom_v5_llama3_8b,HFv1 MMLU,65.48,,hf_open_llm_v1_240829_frozen.csv blossom_v5_llama3_8b,HFv1 TruthfulQA,57.12,,hf_open_llm_v1_240829_frozen.csv blossom_v5_llama3_8b,HFv1 Winogrande,78.37,,hf_open_llm_v1_240829_frozen.csv blossom_v5_mistral_7b,HF OpenLLM v1,61.88,,hf_open_llm_v1_240829_frozen.csv blossom_v5_mistral_7b,HFv1 ARC,62.63,,hf_open_llm_v1_240829_frozen.csv blossom_v5_mistral_7b,HFv1 GSM8K,31.84,,hf_open_llm_v1_240829_frozen.csv blossom_v5_mistral_7b,HFv1 HellaSwag,84.26,,hf_open_llm_v1_240829_frozen.csv blossom_v5_mistral_7b,HFv1 MMLU,62.45,,hf_open_llm_v1_240829_frozen.csv blossom_v5_mistral_7b,HFv1 TruthfulQA,51.83,,hf_open_llm_v1_240829_frozen.csv blossom_v5_mistral_7b,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv blur_7b_slerp_v1_46,HF OpenLLM v1,76.26,,hf_open_llm_v1_240829_frozen.csv blur_7b_slerp_v1_46,HFv1 ARC,73.29,,hf_open_llm_v1_240829_frozen.csv blur_7b_slerp_v1_46,HFv1 GSM8K,69.67,,hf_open_llm_v1_240829_frozen.csv blur_7b_slerp_v1_46,HFv1 HellaSwag,89.07,,hf_open_llm_v1_240829_frozen.csv blur_7b_slerp_v1_46,HFv1 MMLU,64.37,,hf_open_llm_v1_240829_frozen.csv blur_7b_slerp_v1_46,HFv1 TruthfulQA,76.61,,hf_open_llm_v1_240829_frozen.csv blur_7b_slerp_v1_46,HFv1 Winogrande,84.53,,hf_open_llm_v1_240829_frozen.csv blur_7b_v1_2,HF OpenLLM v1,67.74,,hf_open_llm_v1_240829_frozen.csv blur_7b_v1_2,HFv1 ARC,65.36,,hf_open_llm_v1_240829_frozen.csv blur_7b_v1_2,HFv1 GSM8K,52.84,,hf_open_llm_v1_240829_frozen.csv blur_7b_v1_2,HFv1 HellaSwag,83.88,,hf_open_llm_v1_240829_frozen.csv blur_7b_v1_2,HFv1 MMLU,63.45,,hf_open_llm_v1_240829_frozen.csv blur_7b_v1_2,HFv1 TruthfulQA,60.3,,hf_open_llm_v1_240829_frozen.csv blur_7b_v1_2,HFv1 Winogrande,80.58,,hf_open_llm_v1_240829_frozen.csv blur_7b_v1_21,HF OpenLLM v1,74.18,,hf_open_llm_v1_240829_frozen.csv blur_7b_v1_21,HFv1 ARC,70.82,,hf_open_llm_v1_240829_frozen.csv blur_7b_v1_21,HFv1 GSM8K,69.52,,hf_open_llm_v1_240829_frozen.csv blur_7b_v1_21,HFv1 HellaSwag,88.07,,hf_open_llm_v1_240829_frozen.csv blur_7b_v1_21,HFv1 MMLU,64.85,,hf_open_llm_v1_240829_frozen.csv blur_7b_v1_21,HFv1 TruthfulQA,67.99,,hf_open_llm_v1_240829_frozen.csv blur_7b_v1_21,HFv1 Winogrande,83.82,,hf_open_llm_v1_240829_frozen.csv blur_7b_v1_22,HF OpenLLM v1,63.35,,hf_open_llm_v1_240829_frozen.csv blur_7b_v1_22,HFv1 ARC,62.29,,hf_open_llm_v1_240829_frozen.csv blur_7b_v1_22,HFv1 GSM8K,31.16,,hf_open_llm_v1_240829_frozen.csv blur_7b_v1_22,HFv1 HellaSwag,82.0,,hf_open_llm_v1_240829_frozen.csv blur_7b_v1_22,HFv1 MMLU,58.03,,hf_open_llm_v1_240829_frozen.csv blur_7b_v1_22,HFv1 TruthfulQA,68.01,,hf_open_llm_v1_240829_frozen.csv blur_7b_v1_22,HFv1 
Winogrande,78.61,,hf_open_llm_v1_240829_frozen.csv blured_ties_7b,HF OpenLLM v1,65.92,,hf_open_llm_v1_240829_frozen.csv blured_ties_7b,HFv1 ARC,63.99,,hf_open_llm_v1_240829_frozen.csv blured_ties_7b,HFv1 GSM8K,46.93,,hf_open_llm_v1_240829_frozen.csv blured_ties_7b,HFv1 HellaSwag,83.56,,hf_open_llm_v1_240829_frozen.csv blured_ties_7b,HFv1 MMLU,63.19,,hf_open_llm_v1_240829_frozen.csv blured_ties_7b,HFv1 TruthfulQA,58.12,,hf_open_llm_v1_240829_frozen.csv blured_ties_7b,HFv1 Winogrande,79.72,,hf_open_llm_v1_240829_frozen.csv blurred_beagle_7b_slerp,HF OpenLLM v1,74.8,,hf_open_llm_v1_240829_frozen.csv blurred_beagle_7b_slerp,HFv1 ARC,72.78,,hf_open_llm_v1_240829_frozen.csv blurred_beagle_7b_slerp,HFv1 GSM8K,69.9,,hf_open_llm_v1_240829_frozen.csv blurred_beagle_7b_slerp,HFv1 HellaSwag,88.58,,hf_open_llm_v1_240829_frozen.csv blurred_beagle_7b_slerp,HFv1 MMLU,64.95,,hf_open_llm_v1_240829_frozen.csv blurred_beagle_7b_slerp,HFv1 TruthfulQA,69.39,,hf_open_llm_v1_240829_frozen.csv blurred_beagle_7b_slerp,HFv1 Winogrande,83.19,,hf_open_llm_v1_240829_frozen.csv blurstral_7b_slerp,HF OpenLLM v1,69.08,,hf_open_llm_v1_240829_frozen.csv blurstral_7b_slerp,HFv1 ARC,66.3,,hf_open_llm_v1_240829_frozen.csv blurstral_7b_slerp,HFv1 GSM8K,62.85,,hf_open_llm_v1_240829_frozen.csv blurstral_7b_slerp,HFv1 HellaSwag,85.38,,hf_open_llm_v1_240829_frozen.csv blurstral_7b_slerp,HFv1 MMLU,65.18,,hf_open_llm_v1_240829_frozen.csv blurstral_7b_slerp,HFv1 TruthfulQA,53.4,,hf_open_llm_v1_240829_frozen.csv blurstral_7b_slerp,HFv1 Winogrande,81.37,,hf_open_llm_v1_240829_frozen.csv bookworm_10_7b_v0_4_dpo,HF OpenLLM v1,66.59,,hf_open_llm_v1_240829_frozen.csv bookworm_10_7b_v0_4_dpo,HFv1 ARC,64.76,,hf_open_llm_v1_240829_frozen.csv bookworm_10_7b_v0_4_dpo,HFv1 GSM8K,52.24,,hf_open_llm_v1_240829_frozen.csv bookworm_10_7b_v0_4_dpo,HFv1 HellaSwag,84.4,,hf_open_llm_v1_240829_frozen.csv bookworm_10_7b_v0_4_dpo,HFv1 MMLU,64.96,,hf_open_llm_v1_240829_frozen.csv bookworm_10_7b_v0_4_dpo,HFv1 TruthfulQA,52.31,,hf_open_llm_v1_240829_frozen.csv bookworm_10_7b_v0_4_dpo,HFv1 Winogrande,80.9,,hf_open_llm_v1_240829_frozen.csv boomer_1b,HF OpenLLM v1,28.44,,hf_open_llm_v1_240829_frozen.csv boomer_1b,HFv1 ARC,22.78,,hf_open_llm_v1_240829_frozen.csv boomer_1b,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv boomer_1b,HFv1 HellaSwag,31.58,,hf_open_llm_v1_240829_frozen.csv boomer_1b,HFv1 MMLU,25.66,,hf_open_llm_v1_240829_frozen.csv boomer_1b,HFv1 TruthfulQA,39.17,,hf_open_llm_v1_240829_frozen.csv boomer_1b,HFv1 Winogrande,50.51,,hf_open_llm_v1_240829_frozen.csv brocae_area_7b_slerp,HF OpenLLM v1,75.86,,hf_open_llm_v1_240829_frozen.csv brocae_area_7b_slerp,HFv1 ARC,73.81,,hf_open_llm_v1_240829_frozen.csv brocae_area_7b_slerp,HFv1 GSM8K,68.61,,hf_open_llm_v1_240829_frozen.csv brocae_area_7b_slerp,HFv1 HellaSwag,88.98,,hf_open_llm_v1_240829_frozen.csv brocae_area_7b_slerp,HFv1 MMLU,64.55,,hf_open_llm_v1_240829_frozen.csv brocae_area_7b_slerp,HFv1 TruthfulQA,74.13,,hf_open_llm_v1_240829_frozen.csv brocae_area_7b_slerp,HFv1 Winogrande,85.08,,hf_open_llm_v1_240829_frozen.csv brokenkeyboard,HF OpenLLM v1,74.08,,hf_open_llm_v1_240829_frozen.csv brokenkeyboard,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv brokenkeyboard,HFv1 GSM8K,64.29,,hf_open_llm_v1_240829_frozen.csv brokenkeyboard,HFv1 HellaSwag,88.34,,hf_open_llm_v1_240829_frozen.csv brokenkeyboard,HFv1 MMLU,66.04,,hf_open_llm_v1_240829_frozen.csv brokenkeyboard,HFv1 TruthfulQA,71.36,,hf_open_llm_v1_240829_frozen.csv brokenkeyboard,HFv1 Winogrande,83.19,,hf_open_llm_v1_240829_frozen.csv 
brokenkeyboardmerge,HF OpenLLM v1,59.33,,hf_open_llm_v1_240829_frozen.csv brokenkeyboardmerge,HFv1 ARC,59.73,,hf_open_llm_v1_240829_frozen.csv brokenkeyboardmerge,HFv1 GSM8K,25.93,,hf_open_llm_v1_240829_frozen.csv brokenkeyboardmerge,HFv1 HellaSwag,81.25,,hf_open_llm_v1_240829_frozen.csv brokenkeyboardmerge,HFv1 MMLU,58.36,,hf_open_llm_v1_240829_frozen.csv brokenkeyboardmerge,HFv1 TruthfulQA,52.0,,hf_open_llm_v1_240829_frozen.csv brokenkeyboardmerge,HFv1 Winogrande,78.69,,hf_open_llm_v1_240829_frozen.csv brurrydog_7b_v0_1,HF OpenLLM v1,74.24,,hf_open_llm_v1_240829_frozen.csv brurrydog_7b_v0_1,HFv1 ARC,72.53,,hf_open_llm_v1_240829_frozen.csv brurrydog_7b_v0_1,HFv1 GSM8K,66.87,,hf_open_llm_v1_240829_frozen.csv brurrydog_7b_v0_1,HFv1 HellaSwag,88.37,,hf_open_llm_v1_240829_frozen.csv brurrydog_7b_v0_1,HFv1 MMLU,64.74,,hf_open_llm_v1_240829_frozen.csv brurrydog_7b_v0_1,HFv1 TruthfulQA,70.05,,hf_open_llm_v1_240829_frozen.csv brurrydog_7b_v0_1,HFv1 Winogrande,82.87,,hf_open_llm_v1_240829_frozen.csv btlm_v1_7b_base_v0_1,HF OpenLLM v1,50.04,,hf_open_llm_v1_240829_frozen.csv btlm_v1_7b_base_v0_1,HFv1 ARC,52.73,,hf_open_llm_v1_240829_frozen.csv btlm_v1_7b_base_v0_1,HFv1 GSM8K,8.95,,hf_open_llm_v1_240829_frozen.csv btlm_v1_7b_base_v0_1,HFv1 HellaSwag,79.48,,hf_open_llm_v1_240829_frozen.csv btlm_v1_7b_base_v0_1,HFv1 MMLU,49.93,,hf_open_llm_v1_240829_frozen.csv btlm_v1_7b_base_v0_1,HFv1 TruthfulQA,37.2,,hf_open_llm_v1_240829_frozen.csv btlm_v1_7b_base_v0_1,HFv1 Winogrande,71.98,,hf_open_llm_v1_240829_frozen.csv buddhi_128k_chat_7b,HF OpenLLM v1,64.42,,hf_open_llm_v1_240829_frozen.csv buddhi_128k_chat_7b,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv buddhi_128k_chat_7b,HFv1 GSM8K,38.29,,hf_open_llm_v1_240829_frozen.csv buddhi_128k_chat_7b,HFv1 HellaSwag,84.0,,hf_open_llm_v1_240829_frozen.csv buddhi_128k_chat_7b,HFv1 MMLU,60.42,,hf_open_llm_v1_240829_frozen.csv buddhi_128k_chat_7b,HFv1 TruthfulQA,65.72,,hf_open_llm_v1_240829_frozen.csv buddhi_128k_chat_7b,HFv1 Winogrande,77.27,,hf_open_llm_v1_240829_frozen.csv bulgakovlm_3b,HF OpenLLM v1,29.72,,hf_open_llm_v1_240829_frozen.csv bulgakovlm_3b,HFv1 ARC,28.33,,hf_open_llm_v1_240829_frozen.csv bulgakovlm_3b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv bulgakovlm_3b,HFv1 HellaSwag,26.57,,hf_open_llm_v1_240829_frozen.csv bulgakovlm_3b,HFv1 MMLU,24.99,,hf_open_llm_v1_240829_frozen.csv bulgakovlm_3b,HFv1 TruthfulQA,47.93,,hf_open_llm_v1_240829_frozen.csv bulgakovlm_3b,HFv1 Winogrande,50.51,,hf_open_llm_v1_240829_frozen.csv buzz_8b_large_v0_5,HF OpenLLM v1,65.33,,hf_open_llm_v1_240829_frozen.csv buzz_8b_large_v0_5,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv buzz_8b_large_v0_5,HFv1 GSM8K,57.62,,hf_open_llm_v1_240829_frozen.csv buzz_8b_large_v0_5,HFv1 HellaSwag,81.9,,hf_open_llm_v1_240829_frozen.csv buzz_8b_large_v0_5,HFv1 MMLU,64.0,,hf_open_llm_v1_240829_frozen.csv buzz_8b_large_v0_5,HFv1 TruthfulQA,50.54,,hf_open_llm_v1_240829_frozen.csv buzz_8b_large_v0_5,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv c4ai_command_r_plus,HF OpenLLM v1,74.62,,hf_open_llm_v1_240829_frozen.csv c4ai_command_r_plus,HFv1 ARC,70.99,,hf_open_llm_v1_240829_frozen.csv c4ai_command_r_plus,HFv1 GSM8K,70.74,,hf_open_llm_v1_240829_frozen.csv c4ai_command_r_plus,HFv1 HellaSwag,88.56,,hf_open_llm_v1_240829_frozen.csv c4ai_command_r_plus,HFv1 MMLU,75.73,,hf_open_llm_v1_240829_frozen.csv c4ai_command_r_plus,HFv1 TruthfulQA,56.95,,hf_open_llm_v1_240829_frozen.csv c4ai_command_r_plus,HFv1 Winogrande,85.4,,hf_open_llm_v1_240829_frozen.csv c4ai_command_r_v0_1,HF OpenLLM 
v1,68.54,,hf_open_llm_v1_240829_frozen.csv c4ai_command_r_v0_1,HFv1 ARC,65.53,,hf_open_llm_v1_240829_frozen.csv c4ai_command_r_v0_1,HFv1 GSM8K,56.63,,hf_open_llm_v1_240829_frozen.csv c4ai_command_r_v0_1,HFv1 HellaSwag,87.0,,hf_open_llm_v1_240829_frozen.csv c4ai_command_r_v0_1,HFv1 MMLU,68.2,,hf_open_llm_v1_240829_frozen.csv c4ai_command_r_v0_1,HFv1 TruthfulQA,52.32,,hf_open_llm_v1_240829_frozen.csv c4ai_command_r_v0_1,HFv1 Winogrande,81.53,,hf_open_llm_v1_240829_frozen.csv c4ai_command_r_v0_1_japanese_instruct,HF OpenLLM v1,68.85,,hf_open_llm_v1_240829_frozen.csv c4ai_command_r_v0_1_japanese_instruct,HFv1 ARC,65.87,,hf_open_llm_v1_240829_frozen.csv c4ai_command_r_v0_1_japanese_instruct,HFv1 GSM8K,60.05,,hf_open_llm_v1_240829_frozen.csv c4ai_command_r_v0_1_japanese_instruct,HFv1 HellaSwag,85.62,,hf_open_llm_v1_240829_frozen.csv c4ai_command_r_v0_1_japanese_instruct,HFv1 MMLU,67.61,,hf_open_llm_v1_240829_frozen.csv c4ai_command_r_v0_1_japanese_instruct,HFv1 TruthfulQA,51.01,,hf_open_llm_v1_240829_frozen.csv c4ai_command_r_v0_1_japanese_instruct,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv calm2_7b_chat,HF OpenLLM v1,43.27,,hf_open_llm_v1_240829_frozen.csv calm2_7b_chat,HFv1 ARC,40.27,,hf_open_llm_v1_240829_frozen.csv calm2_7b_chat,HFv1 GSM8K,4.93,,hf_open_llm_v1_240829_frozen.csv calm2_7b_chat,HFv1 HellaSwag,68.12,,hf_open_llm_v1_240829_frozen.csv calm2_7b_chat,HFv1 MMLU,39.39,,hf_open_llm_v1_240829_frozen.csv calm2_7b_chat,HFv1 TruthfulQA,41.96,,hf_open_llm_v1_240829_frozen.csv calm2_7b_chat,HFv1 Winogrande,64.96,,hf_open_llm_v1_240829_frozen.csv calm2_7b_chat_dpo_experimental,HF OpenLLM v1,44.03,,hf_open_llm_v1_240829_frozen.csv calm2_7b_chat_dpo_experimental,HFv1 ARC,41.04,,hf_open_llm_v1_240829_frozen.csv calm2_7b_chat_dpo_experimental,HFv1 GSM8K,5.53,,hf_open_llm_v1_240829_frozen.csv calm2_7b_chat_dpo_experimental,HFv1 HellaSwag,68.99,,hf_open_llm_v1_240829_frozen.csv calm2_7b_chat_dpo_experimental,HFv1 MMLU,39.82,,hf_open_llm_v1_240829_frozen.csv calm2_7b_chat_dpo_experimental,HFv1 TruthfulQA,43.13,,hf_open_llm_v1_240829_frozen.csv calm2_7b_chat_dpo_experimental,HFv1 Winogrande,65.67,,hf_open_llm_v1_240829_frozen.csv camel_platypus2_70b,HF OpenLLM v1,65.39,,hf_open_llm_v1_240829_frozen.csv camel_platypus2_70b,HFv1 ARC,70.14,,hf_open_llm_v1_240829_frozen.csv camel_platypus2_70b,HFv1 GSM8K,23.96,,hf_open_llm_v1_240829_frozen.csv camel_platypus2_70b,HFv1 HellaSwag,87.71,,hf_open_llm_v1_240829_frozen.csv camel_platypus2_70b,HFv1 MMLU,69.83,,hf_open_llm_v1_240829_frozen.csv camel_platypus2_70b,HFv1 TruthfulQA,57.77,,hf_open_llm_v1_240829_frozen.csv camel_platypus2_70b,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv camelidae_8x13b,HF OpenLLM v1,59.4,,hf_open_llm_v1_240829_frozen.csv camelidae_8x13b,HFv1 ARC,61.18,,hf_open_llm_v1_240829_frozen.csv camelidae_8x13b,HFv1 GSM8K,34.57,,hf_open_llm_v1_240829_frozen.csv camelidae_8x13b,HFv1 HellaSwag,82.73,,hf_open_llm_v1_240829_frozen.csv camelidae_8x13b,HFv1 MMLU,57.21,,hf_open_llm_v1_240829_frozen.csv camelidae_8x13b,HFv1 TruthfulQA,43.37,,hf_open_llm_v1_240829_frozen.csv camelidae_8x13b,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv camelidae_8x7b,HF OpenLLM v1,54.47,,hf_open_llm_v1_240829_frozen.csv camelidae_8x7b,HFv1 ARC,55.63,,hf_open_llm_v1_240829_frozen.csv camelidae_8x7b,HFv1 GSM8K,22.82,,hf_open_llm_v1_240829_frozen.csv camelidae_8x7b,HFv1 HellaSwag,79.18,,hf_open_llm_v1_240829_frozen.csv camelidae_8x7b,HFv1 MMLU,50.1,,hf_open_llm_v1_240829_frozen.csv camelidae_8x7b,HFv1 
TruthfulQA,42.86,,hf_open_llm_v1_240829_frozen.csv camelidae_8x7b,HFv1 Winogrande,76.24,,hf_open_llm_v1_240829_frozen.csv cantonesellm_6b_preview202402,HF OpenLLM v1,56.93,,hf_open_llm_v1_240829_frozen.csv cantonesellm_6b_preview202402,HFv1 ARC,55.63,,hf_open_llm_v1_240829_frozen.csv cantonesellm_6b_preview202402,HFv1 GSM8K,30.71,,hf_open_llm_v1_240829_frozen.csv cantonesellm_6b_preview202402,HFv1 HellaSwag,75.8,,hf_open_llm_v1_240829_frozen.csv cantonesellm_6b_preview202402,HFv1 MMLU,63.07,,hf_open_llm_v1_240829_frozen.csv cantonesellm_6b_preview202402,HFv1 TruthfulQA,42.26,,hf_open_llm_v1_240829_frozen.csv cantonesellm_6b_preview202402,HFv1 Winogrande,74.11,,hf_open_llm_v1_240829_frozen.csv cantonesellm_cpt_202405,HF OpenLLM v1,60.2,,hf_open_llm_v1_240829_frozen.csv cantonesellm_cpt_202405,HFv1 ARC,55.2,,hf_open_llm_v1_240829_frozen.csv cantonesellm_cpt_202405,HFv1 GSM8K,46.55,,hf_open_llm_v1_240829_frozen.csv cantonesellm_cpt_202405,HFv1 HellaSwag,77.05,,hf_open_llm_v1_240829_frozen.csv cantonesellm_cpt_202405,HFv1 MMLU,63.83,,hf_open_llm_v1_240829_frozen.csv cantonesellm_cpt_202405,HFv1 TruthfulQA,43.58,,hf_open_llm_v1_240829_frozen.csv cantonesellm_cpt_202405,HFv1 Winogrande,74.98,,hf_open_llm_v1_240829_frozen.csv caplattessdolxaboros_yi_34b_200k_dare_ties,HF OpenLLM v1,68.57,,hf_open_llm_v1_240829_frozen.csv caplattessdolxaboros_yi_34b_200k_dare_ties,HFv1 ARC,64.93,,hf_open_llm_v1_240829_frozen.csv caplattessdolxaboros_yi_34b_200k_dare_ties,HFv1 GSM8K,54.06,,hf_open_llm_v1_240829_frozen.csv caplattessdolxaboros_yi_34b_200k_dare_ties,HFv1 HellaSwag,84.99,,hf_open_llm_v1_240829_frozen.csv caplattessdolxaboros_yi_34b_200k_dare_ties,HFv1 MMLU,75.37,,hf_open_llm_v1_240829_frozen.csv caplattessdolxaboros_yi_34b_200k_dare_ties,HFv1 TruthfulQA,52.84,,hf_open_llm_v1_240829_frozen.csv caplattessdolxaboros_yi_34b_200k_dare_ties,HFv1 Winogrande,79.24,,hf_open_llm_v1_240829_frozen.csv caplattessdolxaboros_yi_34b_200k_dare_ties_extremedensity,HF OpenLLM v1,71.57,,hf_open_llm_v1_240829_frozen.csv caplattessdolxaboros_yi_34b_200k_dare_ties_extremedensity,HFv1 ARC,66.89,,hf_open_llm_v1_240829_frozen.csv caplattessdolxaboros_yi_34b_200k_dare_ties_extremedensity,HFv1 GSM8K,59.82,,hf_open_llm_v1_240829_frozen.csv caplattessdolxaboros_yi_34b_200k_dare_ties_extremedensity,HFv1 HellaSwag,85.69,,hf_open_llm_v1_240829_frozen.csv caplattessdolxaboros_yi_34b_200k_dare_ties_extremedensity,HFv1 MMLU,77.35,,hf_open_llm_v1_240829_frozen.csv caplattessdolxaboros_yi_34b_200k_dare_ties_extremedensity,HFv1 TruthfulQA,57.63,,hf_open_llm_v1_240829_frozen.csv caplattessdolxaboros_yi_34b_200k_dare_ties_extremedensity,HFv1 Winogrande,82.0,,hf_open_llm_v1_240829_frozen.csv caplattessdolxaboros_yi_34b_200k_dare_ties_highdensity,HF OpenLLM v1,72.15,,hf_open_llm_v1_240829_frozen.csv caplattessdolxaboros_yi_34b_200k_dare_ties_highdensity,HFv1 ARC,67.41,,hf_open_llm_v1_240829_frozen.csv caplattessdolxaboros_yi_34b_200k_dare_ties_highdensity,HFv1 GSM8K,61.33,,hf_open_llm_v1_240829_frozen.csv caplattessdolxaboros_yi_34b_200k_dare_ties_highdensity,HFv1 HellaSwag,85.77,,hf_open_llm_v1_240829_frozen.csv caplattessdolxaboros_yi_34b_200k_dare_ties_highdensity,HFv1 MMLU,77.44,,hf_open_llm_v1_240829_frozen.csv caplattessdolxaboros_yi_34b_200k_dare_ties_highdensity,HFv1 TruthfulQA,57.84,,hf_open_llm_v1_240829_frozen.csv caplattessdolxaboros_yi_34b_200k_dare_ties_highdensity,HFv1 Winogrande,83.11,,hf_open_llm_v1_240829_frozen.csv capybara_tess_yi_34b_200k,HF OpenLLM v1,70.57,,hf_open_llm_v1_240829_frozen.csv capybara_tess_yi_34b_200k,HFv1 
ARC,66.13,,hf_open_llm_v1_240829_frozen.csv capybara_tess_yi_34b_200k,HFv1 GSM8K,57.39,,hf_open_llm_v1_240829_frozen.csv capybara_tess_yi_34b_200k,HFv1 HellaSwag,86.24,,hf_open_llm_v1_240829_frozen.csv capybara_tess_yi_34b_200k,HFv1 MMLU,74.89,,hf_open_llm_v1_240829_frozen.csv capybara_tess_yi_34b_200k,HFv1 TruthfulQA,56.37,,hf_open_llm_v1_240829_frozen.csv capybara_tess_yi_34b_200k,HFv1 Winogrande,82.4,,hf_open_llm_v1_240829_frozen.csv capybarahermes_2_5_mistral_7b,HF OpenLLM v1,68.14,,hf_open_llm_v1_240829_frozen.csv capybarahermes_2_5_mistral_7b,HFv1 ARC,65.78,,hf_open_llm_v1_240829_frozen.csv capybarahermes_2_5_mistral_7b,HFv1 GSM8K,59.29,,hf_open_llm_v1_240829_frozen.csv capybarahermes_2_5_mistral_7b,HFv1 HellaSwag,85.45,,hf_open_llm_v1_240829_frozen.csv capybarahermes_2_5_mistral_7b,HFv1 MMLU,63.13,,hf_open_llm_v1_240829_frozen.csv capybarahermes_2_5_mistral_7b,HFv1 TruthfulQA,56.91,,hf_open_llm_v1_240829_frozen.csv capybarahermes_2_5_mistral_7b,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv carbonbeagle_11b_truthy,HF OpenLLM v1,76.1,,hf_open_llm_v1_240829_frozen.csv carbonbeagle_11b_truthy,HFv1 ARC,72.27,,hf_open_llm_v1_240829_frozen.csv carbonbeagle_11b_truthy,HFv1 GSM8K,66.11,,hf_open_llm_v1_240829_frozen.csv carbonbeagle_11b_truthy,HFv1 HellaSwag,89.31,,hf_open_llm_v1_240829_frozen.csv carbonbeagle_11b_truthy,HFv1 MMLU,66.55,,hf_open_llm_v1_240829_frozen.csv carbonbeagle_11b_truthy,HFv1 TruthfulQA,78.55,,hf_open_llm_v1_240829_frozen.csv carbonbeagle_11b_truthy,HFv1 Winogrande,83.82,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v1,HF OpenLLM v1,74.28,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v1,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v1,HFv1 GSM8K,64.29,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v1,HFv1 HellaSwag,88.46,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v1,HFv1 MMLU,66.42,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v1,HFv1 TruthfulQA,71.98,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v1,HFv1 Winogrande,83.27,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v2,HF OpenLLM v1,74.42,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v2,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v2,HFv1 GSM8K,65.28,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v2,HFv1 HellaSwag,88.4,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v2,HFv1 MMLU,66.31,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v2,HFv1 TruthfulQA,71.94,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v2,HFv1 Winogrande,83.35,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v3,HF OpenLLM v1,74.41,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v3,HFv1 ARC,70.99,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v3,HFv1 GSM8K,65.2,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v3,HFv1 HellaSwag,88.48,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v3,HFv1 MMLU,66.34,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v3,HFv1 TruthfulQA,71.84,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v3,HFv1 Winogrande,83.58,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v4,HF OpenLLM v1,74.52,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v4,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v4,HFv1 GSM8K,65.58,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v4,HFv1 HellaSwag,88.48,,hf_open_llm_v1_240829_frozen.csv 
carbonvillain_en_10_7b_v4,HFv1 MMLU,66.27,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v4,HFv1 TruthfulQA,71.95,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v4,HFv1 Winogrande,83.58,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v5,HF OpenLLM v1,74.31,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v5,HFv1 ARC,71.16,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v5,HFv1 GSM8K,64.44,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v5,HFv1 HellaSwag,88.51,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v5,HFv1 MMLU,66.44,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v5,HFv1 TruthfulQA,71.97,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_10_7b_v5,HFv1 Winogrande,83.35,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_13b_v1,HF OpenLLM v1,74.28,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_13b_v1,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_13b_v1,HFv1 GSM8K,64.29,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_13b_v1,HFv1 HellaSwag,88.46,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_13b_v1,HFv1 MMLU,66.42,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_13b_v1,HFv1 TruthfulQA,71.98,,hf_open_llm_v1_240829_frozen.csv carbonvillain_en_13b_v1,HFv1 Winogrande,83.27,,hf_open_llm_v1_240829_frozen.csv catppt_base,HF OpenLLM v1,72.25,,hf_open_llm_v1_240829_frozen.csv catppt_base,HFv1 ARC,67.92,,hf_open_llm_v1_240829_frozen.csv catppt_base,HFv1 GSM8K,70.66,,hf_open_llm_v1_240829_frozen.csv catppt_base,HFv1 HellaSwag,86.64,,hf_open_llm_v1_240829_frozen.csv catppt_base,HFv1 MMLU,65.26,,hf_open_llm_v1_240829_frozen.csv catppt_base,HFv1 TruthfulQA,61.72,,hf_open_llm_v1_240829_frozen.csv catppt_base,HFv1 Winogrande,81.29,,hf_open_llm_v1_240829_frozen.csv catunalaserpi_dpo,HF OpenLLM v1,74.7,,hf_open_llm_v1_240829_frozen.csv catunalaserpi_dpo,HFv1 ARC,72.95,,hf_open_llm_v1_240829_frozen.csv catunalaserpi_dpo,HFv1 GSM8K,69.29,,hf_open_llm_v1_240829_frozen.csv catunalaserpi_dpo,HFv1 HellaSwag,88.33,,hf_open_llm_v1_240829_frozen.csv catunalaserpi_dpo,HFv1 MMLU,64.95,,hf_open_llm_v1_240829_frozen.csv catunalaserpi_dpo,HFv1 TruthfulQA,70.01,,hf_open_llm_v1_240829_frozen.csv catunalaserpi_dpo,HFv1 Winogrande,82.64,,hf_open_llm_v1_240829_frozen.csv causallm_platypus_14b,HF OpenLLM v1,63.8,,hf_open_llm_v1_240829_frozen.csv causallm_platypus_14b,HFv1 ARC,56.91,,hf_open_llm_v1_240829_frozen.csv causallm_platypus_14b,HFv1 GSM8K,57.24,,hf_open_llm_v1_240829_frozen.csv causallm_platypus_14b,HFv1 HellaSwag,80.06,,hf_open_llm_v1_240829_frozen.csv causallm_platypus_14b,HFv1 MMLU,64.98,,hf_open_llm_v1_240829_frozen.csv causallm_platypus_14b,HFv1 TruthfulQA,47.57,,hf_open_llm_v1_240829_frozen.csv causallm_platypus_14b,HFv1 Winogrande,76.01,,hf_open_llm_v1_240829_frozen.csv cerberus_7b_model_stock,HF OpenLLM v1,69.66,,hf_open_llm_v1_240829_frozen.csv cerberus_7b_model_stock,HFv1 ARC,67.92,,hf_open_llm_v1_240829_frozen.csv cerberus_7b_model_stock,HFv1 GSM8K,61.79,,hf_open_llm_v1_240829_frozen.csv cerberus_7b_model_stock,HFv1 HellaSwag,85.2,,hf_open_llm_v1_240829_frozen.csv cerberus_7b_model_stock,HFv1 MMLU,64.52,,hf_open_llm_v1_240829_frozen.csv cerberus_7b_model_stock,HFv1 TruthfulQA,58.8,,hf_open_llm_v1_240829_frozen.csv cerberus_7b_model_stock,HFv1 Winogrande,79.72,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_111m,HF OpenLLM v1,27.75,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_111m,HFv1 ARC,20.22,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_111m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv 
cerebras_gpt_111m,HFv1 HellaSwag,26.73,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_111m,HFv1 MMLU,25.51,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_111m,HFv1 TruthfulQA,46.31,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_111m,HFv1 Winogrande,47.75,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_13b,HF OpenLLM v1,37.4,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_13b,HFv1 ARC,38.14,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_13b,HFv1 GSM8K,1.29,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_13b,HFv1 HellaSwag,60.01,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_13b,HFv1 MMLU,25.92,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_13b,HFv1 TruthfulQA,39.19,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_13b,HFv1 Winogrande,59.83,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_1_3b,HF OpenLLM v1,31.3,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_1_3b,HFv1 ARC,26.28,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_1_3b,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_1_3b,HFv1 HellaSwag,38.54,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_1_3b,HFv1 MMLU,26.59,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_1_3b,HFv1 TruthfulQA,42.7,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_1_3b,HFv1 Winogrande,53.43,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_256m,HF OpenLLM v1,29.38,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_256m,HFv1 ARC,22.01,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_256m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_256m,HFv1 HellaSwag,28.99,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_256m,HFv1 MMLU,26.83,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_256m,HFv1 TruthfulQA,45.98,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_256m,HFv1 Winogrande,52.49,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_2_7b,HF OpenLLM v1,33.25,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_2_7b,HFv1 ARC,29.1,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_2_7b,HFv1 GSM8K,0.45,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_2_7b,HFv1 HellaSwag,49.29,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_2_7b,HFv1 MMLU,25.17,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_2_7b,HFv1 TruthfulQA,41.37,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_2_7b,HFv1 Winogrande,54.14,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_6_7b,HF OpenLLM v1,36.27,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_6_7b,HFv1 ARC,35.07,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_6_7b,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_6_7b,HFv1 HellaSwag,59.36,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_6_7b,HFv1 MMLU,25.93,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_6_7b,HFv1 TruthfulQA,38.02,,hf_open_llm_v1_240829_frozen.csv cerebras_gpt_6_7b,HFv1 Winogrande,58.72,,hf_open_llm_v1_240829_frozen.csv changpt_bart,HF OpenLLM v1,29.27,,hf_open_llm_v1_240829_frozen.csv changpt_bart,HFv1 ARC,28.67,,hf_open_llm_v1_240829_frozen.csv changpt_bart,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv changpt_bart,HFv1 HellaSwag,26.41,,hf_open_llm_v1_240829_frozen.csv changpt_bart,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv changpt_bart,HFv1 TruthfulQA,47.94,,hf_open_llm_v1_240829_frozen.csv changpt_bart,HFv1 Winogrande,49.49,,hf_open_llm_v1_240829_frozen.csv chat_ayb_nova_13b,HF OpenLLM v1,57.84,,hf_open_llm_v1_240829_frozen.csv chat_ayb_nova_13b,HFv1 ARC,62.97,,hf_open_llm_v1_240829_frozen.csv chat_ayb_nova_13b,HFv1 GSM8K,12.36,,hf_open_llm_v1_240829_frozen.csv chat_ayb_nova_13b,HFv1 HellaSwag,84.28,,hf_open_llm_v1_240829_frozen.csv chat_ayb_nova_13b,HFv1 
MMLU,58.58,,hf_open_llm_v1_240829_frozen.csv chat_ayb_nova_13b,HFv1 TruthfulQA,51.28,,hf_open_llm_v1_240829_frozen.csv chat_ayb_nova_13b,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv chat_ayb_platypus2_13b,HF OpenLLM v1,55.93,,hf_open_llm_v1_240829_frozen.csv chat_ayb_platypus2_13b,HFv1 ARC,60.49,,hf_open_llm_v1_240829_frozen.csv chat_ayb_platypus2_13b,HFv1 GSM8K,2.96,,hf_open_llm_v1_240829_frozen.csv chat_ayb_platypus2_13b,HFv1 HellaSwag,84.03,,hf_open_llm_v1_240829_frozen.csv chat_ayb_platypus2_13b,HFv1 MMLU,57.83,,hf_open_llm_v1_240829_frozen.csv chat_ayb_platypus2_13b,HFv1 TruthfulQA,54.52,,hf_open_llm_v1_240829_frozen.csv chat_ayb_platypus2_13b,HFv1 Winogrande,75.77,,hf_open_llm_v1_240829_frozen.csv chatayt_lora_assamble_marcoroni,HF OpenLLM v1,57.76,,hf_open_llm_v1_240829_frozen.csv chatayt_lora_assamble_marcoroni,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv chatayt_lora_assamble_marcoroni,HFv1 GSM8K,8.87,,hf_open_llm_v1_240829_frozen.csv chatayt_lora_assamble_marcoroni,HFv1 HellaSwag,83.05,,hf_open_llm_v1_240829_frozen.csv chatayt_lora_assamble_marcoroni,HFv1 MMLU,58.72,,hf_open_llm_v1_240829_frozen.csv chatayt_lora_assamble_marcoroni,HFv1 TruthfulQA,56.12,,hf_open_llm_v1_240829_frozen.csv chatayt_lora_assamble_marcoroni,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv chathercules_2_5_mistral_7b_dpo,HF OpenLLM v1,69.73,,hf_open_llm_v1_240829_frozen.csv chathercules_2_5_mistral_7b_dpo,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv chathercules_2_5_mistral_7b_dpo,HFv1 GSM8K,67.55,,hf_open_llm_v1_240829_frozen.csv chathercules_2_5_mistral_7b_dpo,HFv1 HellaSwag,85.4,,hf_open_llm_v1_240829_frozen.csv chathercules_2_5_mistral_7b_dpo,HFv1 MMLU,65.17,,hf_open_llm_v1_240829_frozen.csv chathercules_2_5_mistral_7b_dpo,HFv1 TruthfulQA,52.3,,hf_open_llm_v1_240829_frozen.csv chathercules_2_5_mistral_7b_dpo,HFv1 Winogrande,81.93,,hf_open_llm_v1_240829_frozen.csv chimera_7b_slerp,HF OpenLLM v1,72.84,,hf_open_llm_v1_240829_frozen.csv chimera_7b_slerp,HFv1 ARC,70.22,,hf_open_llm_v1_240829_frozen.csv chimera_7b_slerp,HFv1 GSM8K,68.46,,hf_open_llm_v1_240829_frozen.csv chimera_7b_slerp,HFv1 HellaSwag,87.09,,hf_open_llm_v1_240829_frozen.csv chimera_7b_slerp,HFv1 MMLU,64.84,,hf_open_llm_v1_240829_frozen.csv chimera_7b_slerp,HFv1 TruthfulQA,64.43,,hf_open_llm_v1_240829_frozen.csv chimera_7b_slerp,HFv1 Winogrande,82.0,,hf_open_llm_v1_240829_frozen.csv chimera_7b_ties,HF OpenLLM v1,62.46,,hf_open_llm_v1_240829_frozen.csv chimera_7b_ties,HFv1 ARC,67.06,,hf_open_llm_v1_240829_frozen.csv chimera_7b_ties,HFv1 GSM8K,29.34,,hf_open_llm_v1_240829_frozen.csv chimera_7b_ties,HFv1 HellaSwag,81.55,,hf_open_llm_v1_240829_frozen.csv chimera_7b_ties,HFv1 MMLU,53.46,,hf_open_llm_v1_240829_frozen.csv chimera_7b_ties,HFv1 TruthfulQA,64.34,,hf_open_llm_v1_240829_frozen.csv chimera_7b_ties,HFv1 Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv chimerallama3_8b,HF OpenLLM v1,68.13,,hf_open_llm_v1_240829_frozen.csv chimerallama3_8b,HFv1 ARC,62.37,,hf_open_llm_v1_240829_frozen.csv chimerallama3_8b,HFv1 GSM8K,67.55,,hf_open_llm_v1_240829_frozen.csv chimerallama3_8b,HFv1 HellaSwag,81.19,,hf_open_llm_v1_240829_frozen.csv chimerallama3_8b,HFv1 MMLU,67.62,,hf_open_llm_v1_240829_frozen.csv chimerallama3_8b,HFv1 TruthfulQA,52.4,,hf_open_llm_v1_240829_frozen.csv chimerallama3_8b,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_13b,HF OpenLLM v1,57.41,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_13b,HFv1 ARC,58.7,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_13b,HFv1 
GSM8K,25.02,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_13b,HFv1 HellaSwag,79.76,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_13b,HFv1 MMLU,55.12,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_13b,HFv1 TruthfulQA,50.22,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_13b,HFv1 Winogrande,75.61,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_13b_16k,HF OpenLLM v1,54.12,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_13b_16k,HFv1 ARC,55.03,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_13b_16k,HFv1 GSM8K,21.08,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_13b_16k,HFv1 HellaSwag,77.41,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_13b_16k,HFv1 MMLU,51.28,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_13b_16k,HFv1 TruthfulQA,46.5,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_13b_16k,HFv1 Winogrande,73.4,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_1_3b,HF OpenLLM v1,29.34,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_1_3b,HFv1 ARC,24.49,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_1_3b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_1_3b,HFv1 HellaSwag,30.17,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_1_3b,HFv1 MMLU,25.88,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_1_3b,HFv1 TruthfulQA,44.6,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_1_3b,HFv1 Winogrande,50.91,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_1_3b_rlhf,HF OpenLLM v1,29.39,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_1_3b_rlhf,HFv1 ARC,23.89,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_1_3b_rlhf,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_1_3b_rlhf,HFv1 HellaSwag,30.01,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_1_3b_rlhf,HFv1 MMLU,26.53,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_1_3b_rlhf,HFv1 TruthfulQA,45.06,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_1_3b_rlhf,HFv1 Winogrande,50.83,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_7b,HF OpenLLM v1,50.21,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_7b,HFv1 ARC,49.57,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_7b,HFv1 GSM8K,13.72,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_7b,HFv1 HellaSwag,72.64,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_7b,HFv1 MMLU,46.55,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_7b,HFv1 TruthfulQA,48.63,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_7b,HFv1 Winogrande,70.17,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_7b_16k,HF OpenLLM v1,48.02,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_7b_16k,HFv1 ARC,48.46,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_7b_16k,HFv1 GSM8K,9.33,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_7b_16k,HFv1 HellaSwag,70.3,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_7b_16k,HFv1 MMLU,42.94,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_7b_16k,HFv1 TruthfulQA,48.59,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_7b_16k,HFv1 Winogrande,68.51,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_7b_rlhf,HF OpenLLM v1,50.92,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_7b_rlhf,HFv1 ARC,49.49,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_7b_rlhf,HFv1 GSM8K,15.01,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_7b_rlhf,HFv1 HellaSwag,72.61,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_7b_rlhf,HFv1 MMLU,46.29,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_7b_rlhf,HFv1 TruthfulQA,51.19,,hf_open_llm_v1_240829_frozen.csv chinese_alpaca_2_7b_rlhf,HFv1 
Winogrande,70.96,,hf_open_llm_v1_240829_frozen.csv chinese_mixtral,HF OpenLLM v1,58.57,,hf_open_llm_v1_240829_frozen.csv chinese_mixtral,HFv1 ARC,67.49,,hf_open_llm_v1_240829_frozen.csv chinese_mixtral,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv chinese_mixtral,HFv1 HellaSwag,85.25,,hf_open_llm_v1_240829_frozen.csv chinese_mixtral,HFv1 MMLU,70.31,,hf_open_llm_v1_240829_frozen.csv chinese_mixtral,HFv1 TruthfulQA,46.75,,hf_open_llm_v1_240829_frozen.csv chinese_mixtral,HFv1 Winogrande,81.61,,hf_open_llm_v1_240829_frozen.csv chinese_mixtral_8x7b,HF OpenLLM v1,66.69,,hf_open_llm_v1_240829_frozen.csv chinese_mixtral_8x7b,HFv1 ARC,63.57,,hf_open_llm_v1_240829_frozen.csv chinese_mixtral_8x7b,HFv1 GSM8K,51.71,,hf_open_llm_v1_240829_frozen.csv chinese_mixtral_8x7b,HFv1 HellaSwag,85.98,,hf_open_llm_v1_240829_frozen.csv chinese_mixtral_8x7b,HFv1 MMLU,70.95,,hf_open_llm_v1_240829_frozen.csv chinese_mixtral_8x7b,HFv1 TruthfulQA,45.86,,hf_open_llm_v1_240829_frozen.csv chinese_mixtral_8x7b,HFv1 Winogrande,82.08,,hf_open_llm_v1_240829_frozen.csv chronorctypus_limarobormes_13b,HF OpenLLM v1,55.22,,hf_open_llm_v1_240829_frozen.csv chronorctypus_limarobormes_13b,HFv1 ARC,59.9,,hf_open_llm_v1_240829_frozen.csv chronorctypus_limarobormes_13b,HFv1 GSM8K,3.87,,hf_open_llm_v1_240829_frozen.csv chronorctypus_limarobormes_13b,HFv1 HellaSwag,82.75,,hf_open_llm_v1_240829_frozen.csv chronorctypus_limarobormes_13b,HFv1 MMLU,58.45,,hf_open_llm_v1_240829_frozen.csv chronorctypus_limarobormes_13b,HFv1 TruthfulQA,51.9,,hf_open_llm_v1_240829_frozen.csv chronorctypus_limarobormes_13b,HFv1 Winogrande,74.43,,hf_open_llm_v1_240829_frozen.csv chronos007_70b,HF OpenLLM v1,68.25,,hf_open_llm_v1_240829_frozen.csv chronos007_70b,HFv1 ARC,70.14,,hf_open_llm_v1_240829_frozen.csv chronos007_70b,HFv1 GSM8K,42.61,,hf_open_llm_v1_240829_frozen.csv chronos007_70b,HFv1 HellaSwag,87.52,,hf_open_llm_v1_240829_frozen.csv chronos007_70b,HFv1 MMLU,69.33,,hf_open_llm_v1_240829_frozen.csv chronos007_70b,HFv1 TruthfulQA,57.65,,hf_open_llm_v1_240829_frozen.csv chronos007_70b,HFv1 Winogrande,82.24,,hf_open_llm_v1_240829_frozen.csv code_millenials_34b,HF OpenLLM v1,53.51,,hf_open_llm_v1_240829_frozen.csv code_millenials_34b,HFv1 ARC,49.83,,hf_open_llm_v1_240829_frozen.csv code_millenials_34b,HFv1 GSM8K,32.45,,hf_open_llm_v1_240829_frozen.csv code_millenials_34b,HFv1 HellaSwag,75.09,,hf_open_llm_v1_240829_frozen.csv code_millenials_34b,HFv1 MMLU,49.28,,hf_open_llm_v1_240829_frozen.csv code_millenials_34b,HFv1 TruthfulQA,45.37,,hf_open_llm_v1_240829_frozen.csv code_millenials_34b,HFv1 Winogrande,69.06,,hf_open_llm_v1_240829_frozen.csv codegen_16b_nl,HF OpenLLM v1,42.59,,hf_open_llm_v1_240829_frozen.csv codegen_16b_nl,HFv1 ARC,46.76,,hf_open_llm_v1_240829_frozen.csv codegen_16b_nl,HFv1 GSM8K,2.65,,hf_open_llm_v1_240829_frozen.csv codegen_16b_nl,HFv1 HellaSwag,71.87,,hf_open_llm_v1_240829_frozen.csv codegen_16b_nl,HFv1 MMLU,32.35,,hf_open_llm_v1_240829_frozen.csv codegen_16b_nl,HFv1 TruthfulQA,33.95,,hf_open_llm_v1_240829_frozen.csv codegen_16b_nl,HFv1 Winogrande,67.96,,hf_open_llm_v1_240829_frozen.csv codegen_6b_multi,HF OpenLLM v1,32.43,,hf_open_llm_v1_240829_frozen.csv codegen_6b_multi,HFv1 ARC,27.22,,hf_open_llm_v1_240829_frozen.csv codegen_6b_multi,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv codegen_6b_multi,HFv1 HellaSwag,41.11,,hf_open_llm_v1_240829_frozen.csv codegen_6b_multi,HFv1 MMLU,25.71,,hf_open_llm_v1_240829_frozen.csv codegen_6b_multi,HFv1 TruthfulQA,45.65,,hf_open_llm_v1_240829_frozen.csv codegen_6b_multi,HFv1 
Winogrande,53.91,,hf_open_llm_v1_240829_frozen.csv codegen_6b_nl,HF OpenLLM v1,40.0,,hf_open_llm_v1_240829_frozen.csv codegen_6b_nl,HFv1 ARC,42.32,,hf_open_llm_v1_240829_frozen.csv codegen_6b_nl,HFv1 GSM8K,2.2,,hf_open_llm_v1_240829_frozen.csv codegen_6b_nl,HFv1 HellaSwag,68.59,,hf_open_llm_v1_240829_frozen.csv codegen_6b_nl,HFv1 MMLU,25.93,,hf_open_llm_v1_240829_frozen.csv codegen_6b_nl,HFv1 TruthfulQA,34.47,,hf_open_llm_v1_240829_frozen.csv codegen_6b_nl,HFv1 Winogrande,66.46,,hf_open_llm_v1_240829_frozen.csv codellama34b,HF OpenLLM v1,55.28,,hf_open_llm_v1_240829_frozen.csv codellama34b,HFv1 ARC,54.18,,hf_open_llm_v1_240829_frozen.csv codellama34b,HFv1 GSM8K,34.34,,hf_open_llm_v1_240829_frozen.csv codellama34b,HFv1 HellaSwag,75.82,,hf_open_llm_v1_240829_frozen.csv codellama34b,HFv1 MMLU,54.92,,hf_open_llm_v1_240829_frozen.csv codellama34b,HFv1 TruthfulQA,39.11,,hf_open_llm_v1_240829_frozen.csv codellama34b,HFv1 Winogrande,73.32,,hf_open_llm_v1_240829_frozen.csv codellama34b_instruct,HF OpenLLM v1,44.33,,hf_open_llm_v1_240829_frozen.csv codellama34b_instruct,HFv1 ARC,40.78,,hf_open_llm_v1_240829_frozen.csv codellama34b_instruct,HFv1 GSM8K,31.01,,hf_open_llm_v1_240829_frozen.csv codellama34b_instruct,HFv1 HellaSwag,35.66,,hf_open_llm_v1_240829_frozen.csv codellama34b_instruct,HFv1 MMLU,39.72,,hf_open_llm_v1_240829_frozen.csv codellama34b_instruct,HFv1 TruthfulQA,44.29,,hf_open_llm_v1_240829_frozen.csv codellama34b_instruct,HFv1 Winogrande,74.51,,hf_open_llm_v1_240829_frozen.csv codellama34b_instruct_fp16,HF OpenLLM v1,43.0,,hf_open_llm_v1_240829_frozen.csv codellama34b_instruct_fp16,HFv1 ARC,40.78,,hf_open_llm_v1_240829_frozen.csv codellama34b_instruct_fp16,HFv1 GSM8K,23.05,,hf_open_llm_v1_240829_frozen.csv codellama34b_instruct_fp16,HFv1 HellaSwag,35.66,,hf_open_llm_v1_240829_frozen.csv codellama34b_instruct_fp16,HFv1 MMLU,39.72,,hf_open_llm_v1_240829_frozen.csv codellama34b_instruct_fp16,HFv1 TruthfulQA,44.29,,hf_open_llm_v1_240829_frozen.csv codellama34b_instruct_fp16,HFv1 Winogrande,74.51,,hf_open_llm_v1_240829_frozen.csv codellama34b_python,HF OpenLLM v1,40.27,,hf_open_llm_v1_240829_frozen.csv codellama34b_python,HFv1 ARC,40.19,,hf_open_llm_v1_240829_frozen.csv codellama34b_python,HFv1 GSM8K,14.33,,hf_open_llm_v1_240829_frozen.csv codellama34b_python,HFv1 HellaSwag,36.82,,hf_open_llm_v1_240829_frozen.csv codellama34b_python,HFv1 MMLU,34.79,,hf_open_llm_v1_240829_frozen.csv codellama34b_python,HFv1 TruthfulQA,44.28,,hf_open_llm_v1_240829_frozen.csv codellama34b_python,HFv1 Winogrande,71.19,,hf_open_llm_v1_240829_frozen.csv codellama34b_python_fp16,HF OpenLLM v1,40.27,,hf_open_llm_v1_240829_frozen.csv codellama34b_python_fp16,HFv1 ARC,38.14,,hf_open_llm_v1_240829_frozen.csv codellama34b_python_fp16,HFv1 GSM8K,20.02,,hf_open_llm_v1_240829_frozen.csv codellama34b_python_fp16,HFv1 HellaSwag,34.8,,hf_open_llm_v1_240829_frozen.csv codellama34b_python_fp16,HFv1 MMLU,32.95,,hf_open_llm_v1_240829_frozen.csv codellama34b_python_fp16,HFv1 TruthfulQA,43.57,,hf_open_llm_v1_240829_frozen.csv codellama34b_python_fp16,HFv1 Winogrande,72.14,,hf_open_llm_v1_240829_frozen.csv codellama_13b,HF OpenLLM v1,43.35,,hf_open_llm_v1_240829_frozen.csv codellama_13b,HFv1 ARC,40.87,,hf_open_llm_v1_240829_frozen.csv codellama_13b,HFv1 GSM8K,12.13,,hf_open_llm_v1_240829_frozen.csv codellama_13b,HFv1 HellaSwag,63.35,,hf_open_llm_v1_240829_frozen.csv codellama_13b,HFv1 MMLU,32.81,,hf_open_llm_v1_240829_frozen.csv codellama_13b,HFv1 TruthfulQA,43.79,,hf_open_llm_v1_240829_frozen.csv codellama_13b,HFv1 
Winogrande,67.17,,hf_open_llm_v1_240829_frozen.csv codellama_13b_instruct,HF OpenLLM v1,45.82,,hf_open_llm_v1_240829_frozen.csv codellama_13b_instruct,HFv1 ARC,44.54,,hf_open_llm_v1_240829_frozen.csv codellama_13b_instruct,HFv1 GSM8K,12.66,,hf_open_llm_v1_240829_frozen.csv codellama_13b_instruct,HFv1 HellaSwag,64.93,,hf_open_llm_v1_240829_frozen.csv codellama_13b_instruct,HFv1 MMLU,38.89,,hf_open_llm_v1_240829_frozen.csv codellama_13b_instruct,HFv1 TruthfulQA,45.88,,hf_open_llm_v1_240829_frozen.csv codellama_13b_instruct,HFv1 Winogrande,68.03,,hf_open_llm_v1_240829_frozen.csv codellama_13b_instruct_fp16,HF OpenLLM v1,45.82,,hf_open_llm_v1_240829_frozen.csv codellama_13b_instruct_fp16,HFv1 ARC,44.62,,hf_open_llm_v1_240829_frozen.csv codellama_13b_instruct_fp16,HFv1 GSM8K,12.66,,hf_open_llm_v1_240829_frozen.csv codellama_13b_instruct_fp16,HFv1 HellaSwag,64.94,,hf_open_llm_v1_240829_frozen.csv codellama_13b_instruct_fp16,HFv1 MMLU,38.77,,hf_open_llm_v1_240829_frozen.csv codellama_13b_instruct_fp16,HFv1 TruthfulQA,45.88,,hf_open_llm_v1_240829_frozen.csv codellama_13b_instruct_fp16,HFv1 Winogrande,68.03,,hf_open_llm_v1_240829_frozen.csv codellama_13b_oasst_sft_v10,HF OpenLLM v1,44.85,,hf_open_llm_v1_240829_frozen.csv codellama_13b_oasst_sft_v10,HFv1 ARC,45.39,,hf_open_llm_v1_240829_frozen.csv codellama_13b_oasst_sft_v10,HFv1 GSM8K,13.19,,hf_open_llm_v1_240829_frozen.csv codellama_13b_oasst_sft_v10,HFv1 HellaSwag,62.36,,hf_open_llm_v1_240829_frozen.csv codellama_13b_oasst_sft_v10,HFv1 MMLU,35.36,,hf_open_llm_v1_240829_frozen.csv codellama_13b_oasst_sft_v10,HFv1 TruthfulQA,45.02,,hf_open_llm_v1_240829_frozen.csv codellama_13b_oasst_sft_v10,HFv1 Winogrande,67.8,,hf_open_llm_v1_240829_frozen.csv codellama_13b_python,HF OpenLLM v1,37.0,,hf_open_llm_v1_240829_frozen.csv codellama_13b_python,HFv1 ARC,32.59,,hf_open_llm_v1_240829_frozen.csv codellama_13b_python,HFv1 GSM8K,8.64,,hf_open_llm_v1_240829_frozen.csv codellama_13b_python,HFv1 HellaSwag,43.94,,hf_open_llm_v1_240829_frozen.csv codellama_13b_python,HFv1 MMLU,27.23,,hf_open_llm_v1_240829_frozen.csv codellama_13b_python,HFv1 TruthfulQA,44.59,,hf_open_llm_v1_240829_frozen.csv codellama_13b_python,HFv1 Winogrande,65.04,,hf_open_llm_v1_240829_frozen.csv codellama_70b,HF OpenLLM v1,58.93,,hf_open_llm_v1_240829_frozen.csv codellama_70b,HFv1 ARC,56.74,,hf_open_llm_v1_240829_frozen.csv codellama_70b,HFv1 GSM8K,43.97,,hf_open_llm_v1_240829_frozen.csv codellama_70b,HFv1 HellaSwag,78.21,,hf_open_llm_v1_240829_frozen.csv codellama_70b,HFv1 MMLU,59.67,,hf_open_llm_v1_240829_frozen.csv codellama_70b,HFv1 TruthfulQA,39.79,,hf_open_llm_v1_240829_frozen.csv codellama_70b,HFv1 Winogrande,75.22,,hf_open_llm_v1_240829_frozen.csv codellama_70b_instruct,HF OpenLLM v1,59.98,,hf_open_llm_v1_240829_frozen.csv codellama_70b_instruct,HFv1 ARC,55.03,,hf_open_llm_v1_240829_frozen.csv codellama_70b_instruct,HFv1 GSM8K,46.25,,hf_open_llm_v1_240829_frozen.csv codellama_70b_instruct,HFv1 HellaSwag,77.24,,hf_open_llm_v1_240829_frozen.csv codellama_70b_instruct,HFv1 MMLU,56.4,,hf_open_llm_v1_240829_frozen.csv codellama_70b_instruct,HFv1 TruthfulQA,50.44,,hf_open_llm_v1_240829_frozen.csv codellama_70b_instruct,HFv1 Winogrande,74.51,,hf_open_llm_v1_240829_frozen.csv codellama_70b_python,HF OpenLLM v1,58.0,,hf_open_llm_v1_240829_frozen.csv codellama_70b_python,HFv1 ARC,55.12,,hf_open_llm_v1_240829_frozen.csv codellama_70b_python,HFv1 GSM8K,43.44,,hf_open_llm_v1_240829_frozen.csv codellama_70b_python,HFv1 HellaSwag,78.48,,hf_open_llm_v1_240829_frozen.csv codellama_70b_python,HFv1 
MMLU,56.17,,hf_open_llm_v1_240829_frozen.csv codellama_70b_python,HFv1 TruthfulQA,41.78,,hf_open_llm_v1_240829_frozen.csv codellama_70b_python,HFv1 Winogrande,73.01,,hf_open_llm_v1_240829_frozen.csv codellama_7b,HF OpenLLM v1,39.81,,hf_open_llm_v1_240829_frozen.csv codellama_7b,HFv1 ARC,39.93,,hf_open_llm_v1_240829_frozen.csv codellama_7b,HFv1 GSM8K,5.16,,hf_open_llm_v1_240829_frozen.csv codellama_7b,HFv1 HellaSwag,60.8,,hf_open_llm_v1_240829_frozen.csv codellama_7b,HFv1 MMLU,31.12,,hf_open_llm_v1_240829_frozen.csv codellama_7b,HFv1 TruthfulQA,37.82,,hf_open_llm_v1_240829_frozen.csv codellama_7b,HFv1 Winogrande,64.01,,hf_open_llm_v1_240829_frozen.csv codellama_7b_instruct,HF OpenLLM v1,40.05,,hf_open_llm_v1_240829_frozen.csv codellama_7b_instruct,HFv1 ARC,36.52,,hf_open_llm_v1_240829_frozen.csv codellama_7b_instruct,HFv1 GSM8K,7.96,,hf_open_llm_v1_240829_frozen.csv codellama_7b_instruct,HFv1 HellaSwag,55.44,,hf_open_llm_v1_240829_frozen.csv codellama_7b_instruct,HFv1 MMLU,34.54,,hf_open_llm_v1_240829_frozen.csv codellama_7b_instruct,HFv1 TruthfulQA,41.25,,hf_open_llm_v1_240829_frozen.csv codellama_7b_instruct,HFv1 Winogrande,64.56,,hf_open_llm_v1_240829_frozen.csv codellama_7b_python,HF OpenLLM v1,36.89,,hf_open_llm_v1_240829_frozen.csv codellama_7b_python,HFv1 ARC,31.31,,hf_open_llm_v1_240829_frozen.csv codellama_7b_python,HFv1 GSM8K,5.16,,hf_open_llm_v1_240829_frozen.csv codellama_7b_python,HFv1 HellaSwag,52.86,,hf_open_llm_v1_240829_frozen.csv codellama_7b_python,HFv1 MMLU,28.37,,hf_open_llm_v1_240829_frozen.csv codellama_7b_python,HFv1 TruthfulQA,42.21,,hf_open_llm_v1_240829_frozen.csv codellama_7b_python,HFv1 Winogrande,64.01,,hf_open_llm_v1_240829_frozen.csv codeparrot,HF OpenLLM v1,29.48,,hf_open_llm_v1_240829_frozen.csv codeparrot,HFv1 ARC,21.67,,hf_open_llm_v1_240829_frozen.csv codeparrot,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv codeparrot,HFv1 HellaSwag,28.34,,hf_open_llm_v1_240829_frozen.csv codeparrot,HFv1 MMLU,25.55,,hf_open_llm_v1_240829_frozen.csv codeparrot,HFv1 TruthfulQA,50.87,,hf_open_llm_v1_240829_frozen.csv codeparrot,HFv1 Winogrande,50.2,,hf_open_llm_v1_240829_frozen.csv codestral_22b_v0_1,HF OpenLLM v1,66.7,,hf_open_llm_v1_240829_frozen.csv codestral_22b_v0_1,HFv1 ARC,62.54,,hf_open_llm_v1_240829_frozen.csv codestral_22b_v0_1,HFv1 GSM8K,62.02,,hf_open_llm_v1_240829_frozen.csv codestral_22b_v0_1,HFv1 HellaSwag,81.76,,hf_open_llm_v1_240829_frozen.csv codestral_22b_v0_1,HFv1 MMLU,62.21,,hf_open_llm_v1_240829_frozen.csv codestral_22b_v0_1,HFv1 TruthfulQA,56.7,,hf_open_llm_v1_240829_frozen.csv codestral_22b_v0_1,HFv1 Winogrande,74.98,,hf_open_llm_v1_240829_frozen.csv cognate_7b_slerp,HF OpenLLM v1,72.96,,hf_open_llm_v1_240829_frozen.csv cognate_7b_slerp,HFv1 ARC,70.48,,hf_open_llm_v1_240829_frozen.csv cognate_7b_slerp,HFv1 GSM8K,67.4,,hf_open_llm_v1_240829_frozen.csv cognate_7b_slerp,HFv1 HellaSwag,87.33,,hf_open_llm_v1_240829_frozen.csv cognate_7b_slerp,HFv1 MMLU,64.85,,hf_open_llm_v1_240829_frozen.csv cognate_7b_slerp,HFv1 TruthfulQA,65.16,,hf_open_llm_v1_240829_frozen.csv cognate_7b_slerp,HFv1 Winogrande,82.56,,hf_open_llm_v1_240829_frozen.csv cokal_v1_70b,HF OpenLLM v1,71.87,,hf_open_llm_v1_240829_frozen.csv cokal_v1_70b,HFv1 ARC,87.46,,hf_open_llm_v1_240829_frozen.csv cokal_v1_70b,HFv1 GSM8K,39.27,,hf_open_llm_v1_240829_frozen.csv cokal_v1_70b,HFv1 HellaSwag,83.29,,hf_open_llm_v1_240829_frozen.csv cokal_v1_70b,HFv1 MMLU,68.13,,hf_open_llm_v1_240829_frozen.csv cokal_v1_70b,HFv1 TruthfulQA,72.79,,hf_open_llm_v1_240829_frozen.csv cokal_v1_70b,HFv1 
Winogrande,80.27,,hf_open_llm_v1_240829_frozen.csv collectivecognition_v1_1_mistral_7b,HF OpenLLM v1,62.92,,hf_open_llm_v1_240829_frozen.csv collectivecognition_v1_1_mistral_7b,HFv1 ARC,62.12,,hf_open_llm_v1_240829_frozen.csv collectivecognition_v1_1_mistral_7b,HFv1 GSM8K,35.86,,hf_open_llm_v1_240829_frozen.csv collectivecognition_v1_1_mistral_7b,HFv1 HellaSwag,84.17,,hf_open_llm_v1_240829_frozen.csv collectivecognition_v1_1_mistral_7b,HFv1 MMLU,62.35,,hf_open_llm_v1_240829_frozen.csv collectivecognition_v1_1_mistral_7b,HFv1 TruthfulQA,57.62,,hf_open_llm_v1_240829_frozen.csv collectivecognition_v1_1_mistral_7b,HFv1 Winogrande,75.37,,hf_open_llm_v1_240829_frozen.csv collectivecognition_v1_mistral_7b,HF OpenLLM v1,60.1,,hf_open_llm_v1_240829_frozen.csv collectivecognition_v1_mistral_7b,HFv1 ARC,62.37,,hf_open_llm_v1_240829_frozen.csv collectivecognition_v1_mistral_7b,HFv1 GSM8K,17.89,,hf_open_llm_v1_240829_frozen.csv collectivecognition_v1_mistral_7b,HFv1 HellaSwag,85.5,,hf_open_llm_v1_240829_frozen.csv collectivecognition_v1_mistral_7b,HFv1 MMLU,62.76,,hf_open_llm_v1_240829_frozen.csv collectivecognition_v1_mistral_7b,HFv1 TruthfulQA,54.48,,hf_open_llm_v1_240829_frozen.csv collectivecognition_v1_mistral_7b,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv complect_7b_slerp,HF OpenLLM v1,74.91,,hf_open_llm_v1_240829_frozen.csv complect_7b_slerp,HFv1 ARC,72.27,,hf_open_llm_v1_240829_frozen.csv complect_7b_slerp,HFv1 GSM8K,68.46,,hf_open_llm_v1_240829_frozen.csv complect_7b_slerp,HFv1 HellaSwag,88.19,,hf_open_llm_v1_240829_frozen.csv complect_7b_slerp,HFv1 MMLU,64.89,,hf_open_llm_v1_240829_frozen.csv complect_7b_slerp,HFv1 TruthfulQA,71.14,,hf_open_llm_v1_240829_frozen.csv complect_7b_slerp,HFv1 Winogrande,84.53,,hf_open_llm_v1_240829_frozen.csv complectmaid_7b_slerp,HF OpenLLM v1,72.63,,hf_open_llm_v1_240829_frozen.csv complectmaid_7b_slerp,HFv1 ARC,69.97,,hf_open_llm_v1_240829_frozen.csv complectmaid_7b_slerp,HFv1 GSM8K,65.88,,hf_open_llm_v1_240829_frozen.csv complectmaid_7b_slerp,HFv1 HellaSwag,87.34,,hf_open_llm_v1_240829_frozen.csv complectmaid_7b_slerp,HFv1 MMLU,64.62,,hf_open_llm_v1_240829_frozen.csv complectmaid_7b_slerp,HFv1 TruthfulQA,65.88,,hf_open_llm_v1_240829_frozen.csv complectmaid_7b_slerp,HFv1 Winogrande,82.08,,hf_open_llm_v1_240829_frozen.csv configurable_hermes_2_pro_llama3_8b,HF OpenLLM v1,70.1,,hf_open_llm_v1_240829_frozen.csv configurable_hermes_2_pro_llama3_8b,HFv1 ARC,63.82,,hf_open_llm_v1_240829_frozen.csv configurable_hermes_2_pro_llama3_8b,HFv1 GSM8K,68.46,,hf_open_llm_v1_240829_frozen.csv configurable_hermes_2_pro_llama3_8b,HFv1 HellaSwag,83.49,,hf_open_llm_v1_240829_frozen.csv configurable_hermes_2_pro_llama3_8b,HFv1 MMLU,65.23,,hf_open_llm_v1_240829_frozen.csv configurable_hermes_2_pro_llama3_8b,HFv1 TruthfulQA,62.51,,hf_open_llm_v1_240829_frozen.csv configurable_hermes_2_pro_llama3_8b,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv configurable_llama3_8b_v0_1,HF OpenLLM v1,68.3,,hf_open_llm_v1_240829_frozen.csv configurable_llama3_8b_v0_1,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv configurable_llama3_8b_v0_1,HFv1 GSM8K,69.52,,hf_open_llm_v1_240829_frozen.csv configurable_llama3_8b_v0_1,HFv1 HellaSwag,79.51,,hf_open_llm_v1_240829_frozen.csv configurable_llama3_8b_v0_1,HFv1 MMLU,67.18,,hf_open_llm_v1_240829_frozen.csv configurable_llama3_8b_v0_1,HFv1 TruthfulQA,56.16,,hf_open_llm_v1_240829_frozen.csv configurable_llama3_8b_v0_1,HFv1 Winogrande,74.98,,hf_open_llm_v1_240829_frozen.csv configurable_llama3_8b_v0_2,HF OpenLLM 
v1,68.58,,hf_open_llm_v1_240829_frozen.csv configurable_llama3_8b_v0_2,HFv1 ARC,62.54,,hf_open_llm_v1_240829_frozen.csv configurable_llama3_8b_v0_2,HFv1 GSM8K,70.2,,hf_open_llm_v1_240829_frozen.csv configurable_llama3_8b_v0_2,HFv1 HellaSwag,79.77,,hf_open_llm_v1_240829_frozen.csv configurable_llama3_8b_v0_2,HFv1 MMLU,67.02,,hf_open_llm_v1_240829_frozen.csv configurable_llama3_8b_v0_2,HFv1 TruthfulQA,56.79,,hf_open_llm_v1_240829_frozen.csv configurable_llama3_8b_v0_2,HFv1 Winogrande,75.14,,hf_open_llm_v1_240829_frozen.csv configurable_mistral_22b,HF OpenLLM v1,53.91,,hf_open_llm_v1_240829_frozen.csv configurable_mistral_22b,HFv1 ARC,54.01,,hf_open_llm_v1_240829_frozen.csv configurable_mistral_22b,HFv1 GSM8K,12.05,,hf_open_llm_v1_240829_frozen.csv configurable_mistral_22b,HFv1 HellaSwag,75.31,,hf_open_llm_v1_240829_frozen.csv configurable_mistral_22b,HFv1 MMLU,51.07,,hf_open_llm_v1_240829_frozen.csv configurable_mistral_22b,HFv1 TruthfulQA,55.66,,hf_open_llm_v1_240829_frozen.csv configurable_mistral_22b,HFv1 Winogrande,75.37,,hf_open_llm_v1_240829_frozen.csv configurable_yi_1_5_9b_chat,HF OpenLLM v1,70.5,,hf_open_llm_v1_240829_frozen.csv configurable_yi_1_5_9b_chat,HFv1 ARC,64.16,,hf_open_llm_v1_240829_frozen.csv configurable_yi_1_5_9b_chat,HFv1 GSM8K,70.58,,hf_open_llm_v1_240829_frozen.csv configurable_yi_1_5_9b_chat,HFv1 HellaSwag,81.7,,hf_open_llm_v1_240829_frozen.csv configurable_yi_1_5_9b_chat,HFv1 MMLU,70.99,,hf_open_llm_v1_240829_frozen.csv configurable_yi_1_5_9b_chat,HFv1 TruthfulQA,58.75,,hf_open_llm_v1_240829_frozen.csv configurable_yi_1_5_9b_chat,HFv1 Winogrande,76.8,,hf_open_llm_v1_240829_frozen.csv configurablebeagle_11b,HF OpenLLM v1,75.4,,hf_open_llm_v1_240829_frozen.csv configurablebeagle_11b,HFv1 ARC,72.53,,hf_open_llm_v1_240829_frozen.csv configurablebeagle_11b,HFv1 GSM8K,63.91,,hf_open_llm_v1_240829_frozen.csv configurablebeagle_11b,HFv1 HellaSwag,88.85,,hf_open_llm_v1_240829_frozen.csv configurablebeagle_11b,HFv1 MMLU,66.71,,hf_open_llm_v1_240829_frozen.csv configurablebeagle_11b,HFv1 TruthfulQA,77.13,,hf_open_llm_v1_240829_frozen.csv configurablebeagle_11b,HFv1 Winogrande,83.27,,hf_open_llm_v1_240829_frozen.csv configurablehermes_7b,HF OpenLLM v1,68.89,,hf_open_llm_v1_240829_frozen.csv configurablehermes_7b,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv configurablehermes_7b,HFv1 GSM8K,61.41,,hf_open_llm_v1_240829_frozen.csv configurablehermes_7b,HFv1 HellaSwag,84.31,,hf_open_llm_v1_240829_frozen.csv configurablehermes_7b,HFv1 MMLU,62.44,,hf_open_llm_v1_240829_frozen.csv configurablehermes_7b,HFv1 TruthfulQA,61.71,,hf_open_llm_v1_240829_frozen.csv configurablehermes_7b,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv configurablesolar_10_7b,HF OpenLLM v1,73.94,,hf_open_llm_v1_240829_frozen.csv configurablesolar_10_7b,HFv1 ARC,70.39,,hf_open_llm_v1_240829_frozen.csv configurablesolar_10_7b,HFv1 GSM8K,63.38,,hf_open_llm_v1_240829_frozen.csv configurablesolar_10_7b,HFv1 HellaSwag,88.03,,hf_open_llm_v1_240829_frozen.csv configurablesolar_10_7b,HFv1 MMLU,66.44,,hf_open_llm_v1_240829_frozen.csv configurablesolar_10_7b,HFv1 TruthfulQA,72.34,,hf_open_llm_v1_240829_frozen.csv configurablesolar_10_7b,HFv1 Winogrande,83.03,,hf_open_llm_v1_240829_frozen.csv connate_7b_slerp,HF OpenLLM v1,74.8,,hf_open_llm_v1_240829_frozen.csv connate_7b_slerp,HFv1 ARC,72.1,,hf_open_llm_v1_240829_frozen.csv connate_7b_slerp,HFv1 GSM8K,67.63,,hf_open_llm_v1_240829_frozen.csv connate_7b_slerp,HFv1 HellaSwag,88.37,,hf_open_llm_v1_240829_frozen.csv connate_7b_slerp,HFv1 
MMLU,64.96,,hf_open_llm_v1_240829_frozen.csv connate_7b_slerp,HFv1 TruthfulQA,71.16,,hf_open_llm_v1_240829_frozen.csv connate_7b_slerp,HFv1 Winogrande,84.61,,hf_open_llm_v1_240829_frozen.csv contaminated_proof_7b_v1_0,HF OpenLLM v1,81.14,,hf_open_llm_v1_240829_frozen.csv contaminated_proof_7b_v1_0,HFv1 ARC,78.07,,hf_open_llm_v1_240829_frozen.csv contaminated_proof_7b_v1_0,HFv1 GSM8K,69.14,,hf_open_llm_v1_240829_frozen.csv contaminated_proof_7b_v1_0,HFv1 HellaSwag,90.22,,hf_open_llm_v1_240829_frozen.csv contaminated_proof_7b_v1_0,HFv1 MMLU,78.92,,hf_open_llm_v1_240829_frozen.csv contaminated_proof_7b_v1_0,HFv1 TruthfulQA,82.29,,hf_open_llm_v1_240829_frozen.csv contaminated_proof_7b_v1_0,HFv1 Winogrande,88.16,,hf_open_llm_v1_240829_frozen.csv contaminated_proof_7b_v1_0_safetensor,HF OpenLLM v1,81.14,,hf_open_llm_v1_240829_frozen.csv contaminated_proof_7b_v1_0_safetensor,HFv1 ARC,78.07,,hf_open_llm_v1_240829_frozen.csv contaminated_proof_7b_v1_0_safetensor,HFv1 GSM8K,69.14,,hf_open_llm_v1_240829_frozen.csv contaminated_proof_7b_v1_0_safetensor,HFv1 HellaSwag,90.22,,hf_open_llm_v1_240829_frozen.csv contaminated_proof_7b_v1_0_safetensor,HFv1 MMLU,78.92,,hf_open_llm_v1_240829_frozen.csv contaminated_proof_7b_v1_0_safetensor,HFv1 TruthfulQA,82.29,,hf_open_llm_v1_240829_frozen.csv contaminated_proof_7b_v1_0_safetensor,HFv1 Winogrande,88.16,,hf_open_llm_v1_240829_frozen.csv contextual_kto_mistral_pairrm,HF OpenLLM v1,65.26,,hf_open_llm_v1_240829_frozen.csv contextual_kto_mistral_pairrm,HFv1 ARC,64.76,,hf_open_llm_v1_240829_frozen.csv contextual_kto_mistral_pairrm,HFv1 GSM8K,33.81,,hf_open_llm_v1_240829_frozen.csv contextual_kto_mistral_pairrm,HFv1 HellaSwag,85.52,,hf_open_llm_v1_240829_frozen.csv contextual_kto_mistral_pairrm,HFv1 MMLU,60.28,,hf_open_llm_v1_240829_frozen.csv contextual_kto_mistral_pairrm,HFv1 TruthfulQA,71.67,,hf_open_llm_v1_240829_frozen.csv contextual_kto_mistral_pairrm,HFv1 Winogrande,75.53,,hf_open_llm_v1_240829_frozen.csv cosmo_1b,HF OpenLLM v1,36.59,,hf_open_llm_v1_240829_frozen.csv cosmo_1b,HFv1 ARC,38.57,,hf_open_llm_v1_240829_frozen.csv cosmo_1b,HFv1 GSM8K,5.53,,hf_open_llm_v1_240829_frozen.csv cosmo_1b,HFv1 HellaSwag,55.13,,hf_open_llm_v1_240829_frozen.csv cosmo_1b,HFv1 MMLU,26.69,,hf_open_llm_v1_240829_frozen.csv cosmo_1b,HFv1 TruthfulQA,38.26,,hf_open_llm_v1_240829_frozen.csv cosmo_1b,HFv1 Winogrande,55.49,,hf_open_llm_v1_240829_frozen.csv coven_7b_128k_orpo_alpha,HF OpenLLM v1,71.06,,hf_open_llm_v1_240829_frozen.csv coven_7b_128k_orpo_alpha,HFv1 ARC,67.41,,hf_open_llm_v1_240829_frozen.csv coven_7b_128k_orpo_alpha,HFv1 GSM8K,68.39,,hf_open_llm_v1_240829_frozen.csv coven_7b_128k_orpo_alpha,HFv1 HellaSwag,85.53,,hf_open_llm_v1_240829_frozen.csv coven_7b_128k_orpo_alpha,HFv1 MMLU,65.76,,hf_open_llm_v1_240829_frozen.csv coven_7b_128k_orpo_alpha,HFv1 TruthfulQA,57.73,,hf_open_llm_v1_240829_frozen.csv coven_7b_128k_orpo_alpha,HFv1 Winogrande,81.53,,hf_open_llm_v1_240829_frozen.csv coven_tiny_1_1b_32k_orpo_alpha,HF OpenLLM v1,40.71,,hf_open_llm_v1_240829_frozen.csv coven_tiny_1_1b_32k_orpo_alpha,HFv1 ARC,37.2,,hf_open_llm_v1_240829_frozen.csv coven_tiny_1_1b_32k_orpo_alpha,HFv1 GSM8K,14.03,,hf_open_llm_v1_240829_frozen.csv coven_tiny_1_1b_32k_orpo_alpha,HFv1 HellaSwag,53.71,,hf_open_llm_v1_240829_frozen.csv coven_tiny_1_1b_32k_orpo_alpha,HFv1 MMLU,38.53,,hf_open_llm_v1_240829_frozen.csv coven_tiny_1_1b_32k_orpo_alpha,HFv1 TruthfulQA,42.2,,hf_open_llm_v1_240829_frozen.csv coven_tiny_1_1b_32k_orpo_alpha,HFv1 Winogrande,58.56,,hf_open_llm_v1_240829_frozen.csv cr_model_v1,HF 
OpenLLM v1,77.32,,hf_open_llm_v1_240829_frozen.csv cr_model_v1,HFv1 ARC,70.65,,hf_open_llm_v1_240829_frozen.csv cr_model_v1,HFv1 GSM8K,66.57,,hf_open_llm_v1_240829_frozen.csv cr_model_v1,HFv1 HellaSwag,87.85,,hf_open_llm_v1_240829_frozen.csv cr_model_v1,HFv1 MMLU,74.73,,hf_open_llm_v1_240829_frozen.csv cr_model_v1,HFv1 TruthfulQA,80.47,,hf_open_llm_v1_240829_frozen.csv cr_model_v1,HFv1 Winogrande,83.66,,hf_open_llm_v1_240829_frozen.csv cria_llama2_7b_v1_3,HF OpenLLM v1,50.93,,hf_open_llm_v1_240829_frozen.csv cria_llama2_7b_v1_3,HFv1 ARC,52.73,,hf_open_llm_v1_240829_frozen.csv cria_llama2_7b_v1_3,HFv1 GSM8K,8.49,,hf_open_llm_v1_240829_frozen.csv cria_llama2_7b_v1_3,HFv1 HellaSwag,78.58,,hf_open_llm_v1_240829_frozen.csv cria_llama2_7b_v1_3,HFv1 MMLU,48.3,,hf_open_llm_v1_240829_frozen.csv cria_llama2_7b_v1_3,HFv1 TruthfulQA,45.58,,hf_open_llm_v1_240829_frozen.csv cria_llama2_7b_v1_3,HFv1 Winogrande,71.9,,hf_open_llm_v1_240829_frozen.csv cria_llama2_7b_v1_3_peft,HF OpenLLM v1,49.72,,hf_open_llm_v1_240829_frozen.csv cria_llama2_7b_v1_3_peft,HFv1 ARC,51.45,,hf_open_llm_v1_240829_frozen.csv cria_llama2_7b_v1_3_peft,HFv1 GSM8K,6.75,,hf_open_llm_v1_240829_frozen.csv cria_llama2_7b_v1_3_peft,HFv1 HellaSwag,77.35,,hf_open_llm_v1_240829_frozen.csv cria_llama2_7b_v1_3_peft,HFv1 MMLU,46.47,,hf_open_llm_v1_240829_frozen.csv cria_llama2_7b_v1_3_peft,HFv1 TruthfulQA,45.52,,hf_open_llm_v1_240829_frozen.csv cria_llama2_7b_v1_3_peft,HFv1 Winogrande,70.8,,hf_open_llm_v1_240829_frozen.csv croissantcool_v0_2,HF OpenLLM v1,34.45,,hf_open_llm_v1_240829_frozen.csv croissantcool_v0_2,HFv1 ARC,31.83,,hf_open_llm_v1_240829_frozen.csv croissantcool_v0_2,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv croissantcool_v0_2,HFv1 HellaSwag,54.58,,hf_open_llm_v1_240829_frozen.csv croissantcool_v0_2,HFv1 MMLU,24.54,,hf_open_llm_v1_240829_frozen.csv croissantcool_v0_2,HFv1 TruthfulQA,39.34,,hf_open_llm_v1_240829_frozen.csv croissantcool_v0_2,HFv1 Winogrande,56.43,,hf_open_llm_v1_240829_frozen.csv croissantllmbase,HF OpenLLM v1,34.41,,hf_open_llm_v1_240829_frozen.csv croissantllmbase,HFv1 ARC,31.57,,hf_open_llm_v1_240829_frozen.csv croissantllmbase,HFv1 GSM8K,0.83,,hf_open_llm_v1_240829_frozen.csv croissantllmbase,HFv1 HellaSwag,54.18,,hf_open_llm_v1_240829_frozen.csv croissantllmbase,HFv1 MMLU,25.72,,hf_open_llm_v1_240829_frozen.csv croissantllmbase,HFv1 TruthfulQA,37.49,,hf_open_llm_v1_240829_frozen.csv croissantllmbase,HFv1 Winogrande,57.46,,hf_open_llm_v1_240829_frozen.csv cross_lingual_epoch2,HF OpenLLM v1,38.97,,hf_open_llm_v1_240829_frozen.csv cross_lingual_epoch2,HFv1 ARC,39.25,,hf_open_llm_v1_240829_frozen.csv cross_lingual_epoch2,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv cross_lingual_epoch2,HFv1 HellaSwag,47.92,,hf_open_llm_v1_240829_frozen.csv cross_lingual_epoch2,HFv1 MMLU,36.66,,hf_open_llm_v1_240829_frozen.csv cross_lingual_epoch2,HFv1 TruthfulQA,47.9,,hf_open_llm_v1_240829_frozen.csv cross_lingual_epoch2,HFv1 Winogrande,62.12,,hf_open_llm_v1_240829_frozen.csv crow_1b,HF OpenLLM v1,29.12,,hf_open_llm_v1_240829_frozen.csv crow_1b,HFv1 ARC,25.51,,hf_open_llm_v1_240829_frozen.csv crow_1b,HFv1 GSM8K,0.83,,hf_open_llm_v1_240829_frozen.csv crow_1b,HFv1 HellaSwag,25.87,,hf_open_llm_v1_240829_frozen.csv crow_1b,HFv1 MMLU,24.8,,hf_open_llm_v1_240829_frozen.csv crow_1b,HFv1 TruthfulQA,48.28,,hf_open_llm_v1_240829_frozen.csv crow_1b,HFv1 Winogrande,49.41,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b,HF OpenLLM v1,37.78,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b,HFv1 ARC,37.71,,hf_open_llm_v1_240829_frozen.csv 
csg_wukong_1b,HFv1 GSM8K,5.23,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b,HFv1 HellaSwag,58.93,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b,HFv1 MMLU,25.33,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b,HFv1 TruthfulQA,42.79,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b,HFv1 Winogrande,56.67,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_chat_v0_1,HF OpenLLM v1,35.57,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_chat_v0_1,HFv1 ARC,36.6,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_chat_v0_1,HFv1 GSM8K,1.44,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_chat_v0_1,HFv1 HellaSwag,54.65,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_chat_v0_1,HFv1 MMLU,26.85,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_chat_v0_1,HFv1 TruthfulQA,38.15,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_chat_v0_1,HFv1 Winogrande,55.72,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_orpo_bf16,HF OpenLLM v1,36.09,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_orpo_bf16,HFv1 ARC,33.62,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_orpo_bf16,HFv1 GSM8K,1.59,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_orpo_bf16,HFv1 HellaSwag,58.29,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_orpo_bf16,HFv1 MMLU,25.74,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_orpo_bf16,HFv1 TruthfulQA,39.92,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_orpo_bf16,HFv1 Winogrande,57.38,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_sft_bf16,HF OpenLLM v1,35.91,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_sft_bf16,HFv1 ARC,33.36,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_sft_bf16,HFv1 GSM8K,2.43,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_sft_bf16,HFv1 HellaSwag,55.83,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_sft_bf16,HFv1 MMLU,24.81,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_sft_bf16,HFv1 TruthfulQA,40.09,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_sft_bf16,HFv1 Winogrande,58.96,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_sft_dpo_bf16,HF OpenLLM v1,35.9,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_sft_dpo_bf16,HFv1 ARC,33.7,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_sft_dpo_bf16,HFv1 GSM8K,2.12,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_sft_dpo_bf16,HFv1 HellaSwag,55.97,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_sft_dpo_bf16,HFv1 MMLU,24.7,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_sft_dpo_bf16,HFv1 TruthfulQA,40.08,,hf_open_llm_v1_240829_frozen.csv csg_wukong_1b_sft_dpo_bf16,HFv1 Winogrande,58.8,,hf_open_llm_v1_240829_frozen.csv cultrix_moe_bf16,HF OpenLLM v1,72.6,,hf_open_llm_v1_240829_frozen.csv cultrix_moe_bf16,HFv1 ARC,68.94,,hf_open_llm_v1_240829_frozen.csv cultrix_moe_bf16,HFv1 GSM8K,69.98,,hf_open_llm_v1_240829_frozen.csv cultrix_moe_bf16,HFv1 HellaSwag,86.96,,hf_open_llm_v1_240829_frozen.csv cultrix_moe_bf16,HFv1 MMLU,65.2,,hf_open_llm_v1_240829_frozen.csv cultrix_moe_bf16,HFv1 TruthfulQA,63.47,,hf_open_llm_v1_240829_frozen.csv cultrix_moe_bf16,HFv1 Winogrande,81.06,,hf_open_llm_v1_240829_frozen.csv cultrix_moe_model,HF OpenLLM v1,72.21,,hf_open_llm_v1_240829_frozen.csv cultrix_moe_model,HFv1 ARC,70.05,,hf_open_llm_v1_240829_frozen.csv cultrix_moe_model,HFv1 GSM8K,62.09,,hf_open_llm_v1_240829_frozen.csv cultrix_moe_model,HFv1 HellaSwag,87.22,,hf_open_llm_v1_240829_frozen.csv cultrix_moe_model,HFv1 MMLU,64.95,,hf_open_llm_v1_240829_frozen.csv cultrix_moe_model,HFv1 TruthfulQA,68.04,,hf_open_llm_v1_240829_frozen.csv cultrix_moe_model,HFv1 Winogrande,80.9,,hf_open_llm_v1_240829_frozen.csv cutie,HF OpenLLM v1,29.87,,hf_open_llm_v1_240829_frozen.csv cutie,HFv1 
ARC,26.96,,hf_open_llm_v1_240829_frozen.csv cutie,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv cutie,HFv1 HellaSwag,27.02,,hf_open_llm_v1_240829_frozen.csv cutie,HFv1 MMLU,24.17,,hf_open_llm_v1_240829_frozen.csv cutie,HFv1 TruthfulQA,48.42,,hf_open_llm_v1_240829_frozen.csv cutie,HFv1 Winogrande,52.64,,hf_open_llm_v1_240829_frozen.csv cypher_mini_1_8b,HF OpenLLM v1,43.05,,hf_open_llm_v1_240829_frozen.csv cypher_mini_1_8b,HFv1 ARC,39.59,,hf_open_llm_v1_240829_frozen.csv cypher_mini_1_8b,HFv1 GSM8K,14.48,,hf_open_llm_v1_240829_frozen.csv cypher_mini_1_8b,HFv1 HellaSwag,67.45,,hf_open_llm_v1_240829_frozen.csv cypher_mini_1_8b,HFv1 MMLU,31.14,,hf_open_llm_v1_240829_frozen.csv cypher_mini_1_8b,HFv1 TruthfulQA,40.44,,hf_open_llm_v1_240829_frozen.csv cypher_mini_1_8b,HFv1 Winogrande,65.19,,hf_open_llm_v1_240829_frozen.csv cyrax_7b,HF OpenLLM v1,75.98,,hf_open_llm_v1_240829_frozen.csv cyrax_7b,HFv1 ARC,72.95,,hf_open_llm_v1_240829_frozen.csv cyrax_7b,HFv1 GSM8K,69.22,,hf_open_llm_v1_240829_frozen.csv cyrax_7b,HFv1 HellaSwag,88.19,,hf_open_llm_v1_240829_frozen.csv cyrax_7b,HFv1 MMLU,64.6,,hf_open_llm_v1_240829_frozen.csv cyrax_7b,HFv1 TruthfulQA,77.01,,hf_open_llm_v1_240829_frozen.csv cyrax_7b,HFv1 Winogrande,83.9,,hf_open_llm_v1_240829_frozen.csv damysus_2_7b_chat,HF OpenLLM v1,60.49,,hf_open_llm_v1_240829_frozen.csv damysus_2_7b_chat,HFv1 ARC,59.81,,hf_open_llm_v1_240829_frozen.csv damysus_2_7b_chat,HFv1 GSM8K,50.64,,hf_open_llm_v1_240829_frozen.csv damysus_2_7b_chat,HFv1 HellaSwag,74.52,,hf_open_llm_v1_240829_frozen.csv damysus_2_7b_chat,HFv1 MMLU,56.34,,hf_open_llm_v1_240829_frozen.csv damysus_2_7b_chat,HFv1 TruthfulQA,46.74,,hf_open_llm_v1_240829_frozen.csv damysus_2_7b_chat,HFv1 Winogrande,75.06,,hf_open_llm_v1_240829_frozen.csv damysus_coder_v0_1,HF OpenLLM v1,64.34,,hf_open_llm_v1_240829_frozen.csv damysus_coder_v0_1,HFv1 ARC,60.92,,hf_open_llm_v1_240829_frozen.csv damysus_coder_v0_1,HFv1 GSM8K,39.27,,hf_open_llm_v1_240829_frozen.csv damysus_coder_v0_1,HFv1 HellaSwag,84.01,,hf_open_llm_v1_240829_frozen.csv damysus_coder_v0_1,HFv1 MMLU,60.54,,hf_open_llm_v1_240829_frozen.csv damysus_coder_v0_1,HFv1 TruthfulQA,64.2,,hf_open_llm_v1_240829_frozen.csv damysus_coder_v0_1,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv daredevil_8b_abliterated_dpomix,HF OpenLLM v1,72.32,,hf_open_llm_v1_240829_frozen.csv daredevil_8b_abliterated_dpomix,HFv1 ARC,69.28,,hf_open_llm_v1_240829_frozen.csv daredevil_8b_abliterated_dpomix,HFv1 GSM8K,71.8,,hf_open_llm_v1_240829_frozen.csv daredevil_8b_abliterated_dpomix,HFv1 HellaSwag,85.05,,hf_open_llm_v1_240829_frozen.csv daredevil_8b_abliterated_dpomix,HFv1 MMLU,69.1,,hf_open_llm_v1_240829_frozen.csv daredevil_8b_abliterated_dpomix,HFv1 TruthfulQA,60.0,,hf_open_llm_v1_240829_frozen.csv daredevil_8b_abliterated_dpomix,HFv1 Winogrande,78.69,,hf_open_llm_v1_240829_frozen.csv datura_7b,HF OpenLLM v1,74.28,,hf_open_llm_v1_240829_frozen.csv datura_7b,HFv1 ARC,72.1,,hf_open_llm_v1_240829_frozen.csv datura_7b,HFv1 GSM8K,65.58,,hf_open_llm_v1_240829_frozen.csv datura_7b,HFv1 HellaSwag,88.27,,hf_open_llm_v1_240829_frozen.csv datura_7b,HFv1 MMLU,64.15,,hf_open_llm_v1_240829_frozen.csv datura_7b,HFv1 TruthfulQA,71.03,,hf_open_llm_v1_240829_frozen.csv datura_7b,HFv1 Winogrande,84.53,,hf_open_llm_v1_240829_frozen.csv dbrx_base,HF OpenLLM v1,71.9,,hf_open_llm_v1_240829_frozen.csv dbrx_base,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv dbrx_base,HFv1 GSM8K,68.54,,hf_open_llm_v1_240829_frozen.csv dbrx_base,HFv1 HellaSwag,89.0,,hf_open_llm_v1_240829_frozen.csv 
dbrx_base,HFv1 MMLU,74.7,,hf_open_llm_v1_240829_frozen.csv dbrx_base,HFv1 TruthfulQA,55.07,,hf_open_llm_v1_240829_frozen.csv dbrx_base,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv dbrx_instructruct,HF OpenLLM v1,74.47,,hf_open_llm_v1_240829_frozen.csv dbrx_instructruct,HFv1 ARC,67.83,,hf_open_llm_v1_240829_frozen.csv dbrx_instructruct,HFv1 GSM8K,67.32,,hf_open_llm_v1_240829_frozen.csv dbrx_instructruct,HFv1 HellaSwag,88.85,,hf_open_llm_v1_240829_frozen.csv dbrx_instructruct,HFv1 MMLU,73.72,,hf_open_llm_v1_240829_frozen.csv dbrx_instructruct,HFv1 TruthfulQA,67.02,,hf_open_llm_v1_240829_frozen.csv dbrx_instructruct,HFv1 Winogrande,82.08,,hf_open_llm_v1_240829_frozen.csv deacon_13b,HF OpenLLM v1,53.63,,hf_open_llm_v1_240829_frozen.csv deacon_13b,HFv1 ARC,57.85,,hf_open_llm_v1_240829_frozen.csv deacon_13b,HFv1 GSM8K,10.39,,hf_open_llm_v1_240829_frozen.csv deacon_13b,HFv1 HellaSwag,82.63,,hf_open_llm_v1_240829_frozen.csv deacon_13b,HFv1 MMLU,55.25,,hf_open_llm_v1_240829_frozen.csv deacon_13b,HFv1 TruthfulQA,39.33,,hf_open_llm_v1_240829_frozen.csv deacon_13b,HFv1 Winogrande,76.32,,hf_open_llm_v1_240829_frozen.csv deacon_1_8b,HF OpenLLM v1,36.03,,hf_open_llm_v1_240829_frozen.csv deacon_1_8b,HFv1 ARC,33.7,,hf_open_llm_v1_240829_frozen.csv deacon_1_8b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv deacon_1_8b,HFv1 HellaSwag,52.33,,hf_open_llm_v1_240829_frozen.csv deacon_1_8b,HFv1 MMLU,33.97,,hf_open_llm_v1_240829_frozen.csv deacon_1_8b,HFv1 TruthfulQA,39.05,,hf_open_llm_v1_240829_frozen.csv deacon_1_8b,HFv1 Winogrande,57.14,,hf_open_llm_v1_240829_frozen.csv deacon_1b,HF OpenLLM v1,35.21,,hf_open_llm_v1_240829_frozen.csv deacon_1b,HFv1 ARC,32.42,,hf_open_llm_v1_240829_frozen.csv deacon_1b,HFv1 GSM8K,0.68,,hf_open_llm_v1_240829_frozen.csv deacon_1b,HFv1 HellaSwag,58.62,,hf_open_llm_v1_240829_frozen.csv deacon_1b,HFv1 MMLU,24.89,,hf_open_llm_v1_240829_frozen.csv deacon_1b,HFv1 TruthfulQA,35.05,,hf_open_llm_v1_240829_frozen.csv deacon_1b,HFv1 Winogrande,59.59,,hf_open_llm_v1_240829_frozen.csv deacon_20b,HF OpenLLM v1,61.28,,hf_open_llm_v1_240829_frozen.csv deacon_20b,HFv1 ARC,60.75,,hf_open_llm_v1_240829_frozen.csv deacon_20b,HFv1 GSM8K,29.19,,hf_open_llm_v1_240829_frozen.csv deacon_20b,HFv1 HellaSwag,81.74,,hf_open_llm_v1_240829_frozen.csv deacon_20b,HFv1 MMLU,60.7,,hf_open_llm_v1_240829_frozen.csv deacon_20b,HFv1 TruthfulQA,58.49,,hf_open_llm_v1_240829_frozen.csv deacon_20b,HFv1 Winogrande,76.8,,hf_open_llm_v1_240829_frozen.csv deacon_34b_adapter,HF OpenLLM v1,71.16,,hf_open_llm_v1_240829_frozen.csv deacon_34b_adapter,HFv1 ARC,64.76,,hf_open_llm_v1_240829_frozen.csv deacon_34b_adapter,HFv1 GSM8K,61.18,,hf_open_llm_v1_240829_frozen.csv deacon_34b_adapter,HFv1 HellaSwag,85.57,,hf_open_llm_v1_240829_frozen.csv deacon_34b_adapter,HFv1 MMLU,76.28,,hf_open_llm_v1_240829_frozen.csv deacon_34b_adapter,HFv1 TruthfulQA,56.24,,hf_open_llm_v1_240829_frozen.csv deacon_34b_adapter,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv deacon_34b_qlora_adapter,HF OpenLLM v1,71.39,,hf_open_llm_v1_240829_frozen.csv deacon_34b_qlora_adapter,HFv1 ARC,64.85,,hf_open_llm_v1_240829_frozen.csv deacon_34b_qlora_adapter,HFv1 GSM8K,62.24,,hf_open_llm_v1_240829_frozen.csv deacon_34b_qlora_adapter,HFv1 HellaSwag,85.56,,hf_open_llm_v1_240829_frozen.csv deacon_34b_qlora_adapter,HFv1 MMLU,76.38,,hf_open_llm_v1_240829_frozen.csv deacon_34b_qlora_adapter,HFv1 TruthfulQA,56.21,,hf_open_llm_v1_240829_frozen.csv deacon_34b_qlora_adapter,HFv1 Winogrande,83.11,,hf_open_llm_v1_240829_frozen.csv deacon_3b,HF OpenLLM 
v1,39.05,,hf_open_llm_v1_240829_frozen.csv deacon_3b,HFv1 ARC,39.68,,hf_open_llm_v1_240829_frozen.csv deacon_3b,HFv1 GSM8K,0.38,,hf_open_llm_v1_240829_frozen.csv deacon_3b,HFv1 HellaSwag,66.42,,hf_open_llm_v1_240829_frozen.csv deacon_3b,HFv1 MMLU,27.13,,hf_open_llm_v1_240829_frozen.csv deacon_3b,HFv1 TruthfulQA,36.07,,hf_open_llm_v1_240829_frozen.csv deacon_3b,HFv1 Winogrande,64.64,,hf_open_llm_v1_240829_frozen.csv decicoder_1b,HF OpenLLM v1,29.37,,hf_open_llm_v1_240829_frozen.csv decicoder_1b,HFv1 ARC,21.16,,hf_open_llm_v1_240829_frozen.csv decicoder_1b,HFv1 GSM8K,1.74,,hf_open_llm_v1_240829_frozen.csv decicoder_1b,HFv1 HellaSwag,31.09,,hf_open_llm_v1_240829_frozen.csv decicoder_1b,HFv1 MMLU,24.34,,hf_open_llm_v1_240829_frozen.csv decicoder_1b,HFv1 TruthfulQA,47.05,,hf_open_llm_v1_240829_frozen.csv decicoder_1b,HFv1 Winogrande,50.83,,hf_open_llm_v1_240829_frozen.csv decilm_7b,HF OpenLLM v1,61.55,,hf_open_llm_v1_240829_frozen.csv decilm_7b,HFv1 ARC,59.39,,hf_open_llm_v1_240829_frozen.csv decilm_7b,HFv1 GSM8K,47.38,,hf_open_llm_v1_240829_frozen.csv decilm_7b,HFv1 HellaSwag,82.51,,hf_open_llm_v1_240829_frozen.csv decilm_7b,HFv1 MMLU,59.76,,hf_open_llm_v1_240829_frozen.csv decilm_7b,HFv1 TruthfulQA,40.33,,hf_open_llm_v1_240829_frozen.csv decilm_7b,HFv1 Winogrande,79.95,,hf_open_llm_v1_240829_frozen.csv decilm_7b_instruct,HF OpenLLM v1,63.19,,hf_open_llm_v1_240829_frozen.csv decilm_7b_instruct,HFv1 ARC,61.01,,hf_open_llm_v1_240829_frozen.csv decilm_7b_instruct,HFv1 GSM8K,46.02,,hf_open_llm_v1_240829_frozen.csv decilm_7b_instruct,HFv1 HellaSwag,82.37,,hf_open_llm_v1_240829_frozen.csv decilm_7b_instruct,HFv1 MMLU,60.24,,hf_open_llm_v1_240829_frozen.csv decilm_7b_instruct,HFv1 TruthfulQA,49.75,,hf_open_llm_v1_240829_frozen.csv decilm_7b_instruct,HFv1 Winogrande,79.72,,hf_open_llm_v1_240829_frozen.csv deepseek_coder_1_3b_instruct,HF OpenLLM v1,32.4,,hf_open_llm_v1_240829_frozen.csv deepseek_coder_1_3b_instruct,HFv1 ARC,28.58,,hf_open_llm_v1_240829_frozen.csv deepseek_coder_1_3b_instruct,HFv1 GSM8K,1.06,,hf_open_llm_v1_240829_frozen.csv deepseek_coder_1_3b_instruct,HFv1 HellaSwag,39.87,,hf_open_llm_v1_240829_frozen.csv deepseek_coder_1_3b_instruct,HFv1 MMLU,28.47,,hf_open_llm_v1_240829_frozen.csv deepseek_coder_1_3b_instruct,HFv1 TruthfulQA,44.02,,hf_open_llm_v1_240829_frozen.csv deepseek_coder_1_3b_instruct,HFv1 Winogrande,52.41,,hf_open_llm_v1_240829_frozen.csv deepseek_coder_6_7b_base,HF OpenLLM v1,40.87,,hf_open_llm_v1_240829_frozen.csv deepseek_coder_6_7b_base,HFv1 ARC,37.03,,hf_open_llm_v1_240829_frozen.csv deepseek_coder_6_7b_base,HFv1 GSM8K,17.97,,hf_open_llm_v1_240829_frozen.csv deepseek_coder_6_7b_base,HFv1 HellaSwag,53.46,,hf_open_llm_v1_240829_frozen.csv deepseek_coder_6_7b_base,HFv1 MMLU,38.39,,hf_open_llm_v1_240829_frozen.csv deepseek_coder_6_7b_base,HFv1 TruthfulQA,40.28,,hf_open_llm_v1_240829_frozen.csv deepseek_coder_6_7b_base,HFv1 Winogrande,58.09,,hf_open_llm_v1_240829_frozen.csv deepseek_coder_6_7b_instruct,HF OpenLLM v1,43.57,,hf_open_llm_v1_240829_frozen.csv deepseek_coder_6_7b_instruct,HFv1 ARC,38.14,,hf_open_llm_v1_240829_frozen.csv deepseek_coder_6_7b_instruct,HFv1 GSM8K,26.76,,hf_open_llm_v1_240829_frozen.csv deepseek_coder_6_7b_instruct,HFv1 HellaSwag,55.09,,hf_open_llm_v1_240829_frozen.csv deepseek_coder_6_7b_instruct,HFv1 MMLU,39.02,,hf_open_llm_v1_240829_frozen.csv deepseek_coder_6_7b_instruct,HFv1 TruthfulQA,45.56,,hf_open_llm_v1_240829_frozen.csv deepseek_coder_6_7b_instruct,HFv1 Winogrande,56.83,,hf_open_llm_v1_240829_frozen.csv deepseek_llm_67b_base,HF OpenLLM 
v1,69.38,,hf_open_llm_v1_240829_frozen.csv deepseek_llm_67b_base,HFv1 ARC,65.44,,hf_open_llm_v1_240829_frozen.csv deepseek_llm_67b_base,HFv1 GSM8K,56.71,,hf_open_llm_v1_240829_frozen.csv deepseek_llm_67b_base,HFv1 HellaSwag,87.1,,hf_open_llm_v1_240829_frozen.csv deepseek_llm_67b_base,HFv1 MMLU,71.78,,hf_open_llm_v1_240829_frozen.csv deepseek_llm_67b_base,HFv1 TruthfulQA,51.08,,hf_open_llm_v1_240829_frozen.csv deepseek_llm_67b_base,HFv1 Winogrande,84.14,,hf_open_llm_v1_240829_frozen.csv deepseek_llm_67b_chat,HF OpenLLM v1,71.79,,hf_open_llm_v1_240829_frozen.csv deepseek_llm_67b_chat,HFv1 ARC,67.75,,hf_open_llm_v1_240829_frozen.csv deepseek_llm_67b_chat,HFv1 GSM8K,63.68,,hf_open_llm_v1_240829_frozen.csv deepseek_llm_67b_chat,HFv1 HellaSwag,86.82,,hf_open_llm_v1_240829_frozen.csv deepseek_llm_67b_chat,HFv1 MMLU,72.42,,hf_open_llm_v1_240829_frozen.csv deepseek_llm_67b_chat,HFv1 TruthfulQA,55.85,,hf_open_llm_v1_240829_frozen.csv deepseek_llm_67b_chat,HFv1 Winogrande,84.21,,hf_open_llm_v1_240829_frozen.csv deepseek_llm_7b_chat,HF OpenLLM v1,59.38,,hf_open_llm_v1_240829_frozen.csv deepseek_llm_7b_chat,HFv1 ARC,55.8,,hf_open_llm_v1_240829_frozen.csv deepseek_llm_7b_chat,HFv1 GSM8K,46.55,,hf_open_llm_v1_240829_frozen.csv deepseek_llm_7b_chat,HFv1 HellaSwag,79.38,,hf_open_llm_v1_240829_frozen.csv deepseek_llm_7b_chat,HFv1 MMLU,51.75,,hf_open_llm_v1_240829_frozen.csv deepseek_llm_7b_chat,HFv1 TruthfulQA,47.98,,hf_open_llm_v1_240829_frozen.csv deepseek_llm_7b_chat,HFv1 Winogrande,74.82,,hf_open_llm_v1_240829_frozen.csv deepseek_moe_16b_base,HF OpenLLM v1,51.07,,hf_open_llm_v1_240829_frozen.csv deepseek_moe_16b_base,HFv1 ARC,53.24,,hf_open_llm_v1_240829_frozen.csv deepseek_moe_16b_base,HFv1 GSM8K,17.29,,hf_open_llm_v1_240829_frozen.csv deepseek_moe_16b_base,HFv1 HellaSwag,79.77,,hf_open_llm_v1_240829_frozen.csv deepseek_moe_16b_base,HFv1 MMLU,46.31,,hf_open_llm_v1_240829_frozen.csv deepseek_moe_16b_base,HFv1 TruthfulQA,36.08,,hf_open_llm_v1_240829_frozen.csv deepseek_moe_16b_base,HFv1 Winogrande,73.72,,hf_open_llm_v1_240829_frozen.csv deita_1_8b,HF OpenLLM v1,42.96,,hf_open_llm_v1_240829_frozen.csv deita_1_8b,HFv1 ARC,36.52,,hf_open_llm_v1_240829_frozen.csv deita_1_8b,HFv1 GSM8K,15.62,,hf_open_llm_v1_240829_frozen.csv deita_1_8b,HFv1 HellaSwag,60.63,,hf_open_llm_v1_240829_frozen.csv deita_1_8b,HFv1 MMLU,45.62,,hf_open_llm_v1_240829_frozen.csv deita_1_8b,HFv1 TruthfulQA,40.02,,hf_open_llm_v1_240829_frozen.csv deita_1_8b,HFv1 Winogrande,59.35,,hf_open_llm_v1_240829_frozen.csv deita_2b,HF OpenLLM v1,52.35,,hf_open_llm_v1_240829_frozen.csv deita_2b,HFv1 ARC,44.71,,hf_open_llm_v1_240829_frozen.csv deita_2b,HFv1 GSM8K,41.32,,hf_open_llm_v1_240829_frozen.csv deita_2b,HFv1 HellaSwag,70.39,,hf_open_llm_v1_240829_frozen.csv deita_2b,HFv1 MMLU,52.79,,hf_open_llm_v1_240829_frozen.csv deita_2b,HFv1 TruthfulQA,39.61,,hf_open_llm_v1_240829_frozen.csv deita_2b,HFv1 Winogrande,65.27,,hf_open_llm_v1_240829_frozen.csv deita_32b,HF OpenLLM v1,72.16,,hf_open_llm_v1_240829_frozen.csv deita_32b,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv deita_32b,HFv1 GSM8K,72.33,,hf_open_llm_v1_240829_frozen.csv deita_32b,HFv1 HellaSwag,84.67,,hf_open_llm_v1_240829_frozen.csv deita_32b,HFv1 MMLU,73.95,,hf_open_llm_v1_240829_frozen.csv deita_32b,HFv1 TruthfulQA,58.11,,hf_open_llm_v1_240829_frozen.csv deita_32b,HFv1 Winogrande,80.82,,hf_open_llm_v1_240829_frozen.csv deita_34b,HF OpenLLM v1,71.56,,hf_open_llm_v1_240829_frozen.csv deita_34b,HFv1 ARC,64.08,,hf_open_llm_v1_240829_frozen.csv deita_34b,HFv1 
GSM8K,66.19,,hf_open_llm_v1_240829_frozen.csv deita_34b,HFv1 HellaSwag,85.29,,hf_open_llm_v1_240829_frozen.csv deita_34b,HFv1 MMLU,76.66,,hf_open_llm_v1_240829_frozen.csv deita_34b,HFv1 TruthfulQA,54.35,,hf_open_llm_v1_240829_frozen.csv deita_34b,HFv1 Winogrande,82.79,,hf_open_llm_v1_240829_frozen.csv deita_4b,HF OpenLLM v1,56.43,,hf_open_llm_v1_240829_frozen.csv deita_4b,HFv1 ARC,46.08,,hf_open_llm_v1_240829_frozen.csv deita_4b,HFv1 GSM8K,48.9,,hf_open_llm_v1_240829_frozen.csv deita_4b,HFv1 HellaSwag,71.81,,hf_open_llm_v1_240829_frozen.csv deita_4b,HFv1 MMLU,55.46,,hf_open_llm_v1_240829_frozen.csv deita_4b,HFv1 TruthfulQA,50.23,,hf_open_llm_v1_240829_frozen.csv deita_4b,HFv1 Winogrande,66.14,,hf_open_llm_v1_240829_frozen.csv deita_500m,HF OpenLLM v1,38.22,,hf_open_llm_v1_240829_frozen.csv deita_500m,HFv1 ARC,29.27,,hf_open_llm_v1_240829_frozen.csv deita_500m,HFv1 GSM8K,8.95,,hf_open_llm_v1_240829_frozen.csv deita_500m,HFv1 HellaSwag,50.0,,hf_open_llm_v1_240829_frozen.csv deita_500m,HFv1 MMLU,39.41,,hf_open_llm_v1_240829_frozen.csv deita_500m,HFv1 TruthfulQA,43.94,,hf_open_llm_v1_240829_frozen.csv deita_500m,HFv1 Winogrande,57.77,,hf_open_llm_v1_240829_frozen.csv deita_qwen_1_8b,HF OpenLLM v1,42.96,,hf_open_llm_v1_240829_frozen.csv deita_qwen_1_8b,HFv1 ARC,36.52,,hf_open_llm_v1_240829_frozen.csv deita_qwen_1_8b,HFv1 GSM8K,15.62,,hf_open_llm_v1_240829_frozen.csv deita_qwen_1_8b,HFv1 HellaSwag,60.63,,hf_open_llm_v1_240829_frozen.csv deita_qwen_1_8b,HFv1 MMLU,45.62,,hf_open_llm_v1_240829_frozen.csv deita_qwen_1_8b,HFv1 TruthfulQA,40.02,,hf_open_llm_v1_240829_frozen.csv deita_qwen_1_8b,HFv1 Winogrande,59.35,,hf_open_llm_v1_240829_frozen.csv delta_4b_base,HF OpenLLM v1,61.04,,hf_open_llm_v1_240829_frozen.csv delta_4b_base,HFv1 ARC,58.62,,hf_open_llm_v1_240829_frozen.csv delta_4b_base,HFv1 GSM8K,46.93,,hf_open_llm_v1_240829_frozen.csv delta_4b_base,HFv1 HellaSwag,76.29,,hf_open_llm_v1_240829_frozen.csv delta_4b_base,HFv1 MMLU,59.06,,hf_open_llm_v1_240829_frozen.csv delta_4b_base,HFv1 TruthfulQA,51.74,,hf_open_llm_v1_240829_frozen.csv delta_4b_base,HFv1 Winogrande,73.64,,hf_open_llm_v1_240829_frozen.csv delta_4b_notso_base,HF OpenLLM v1,54.23,,hf_open_llm_v1_240829_frozen.csv delta_4b_notso_base,HFv1 ARC,57.59,,hf_open_llm_v1_240829_frozen.csv delta_4b_notso_base,HFv1 GSM8K,4.02,,hf_open_llm_v1_240829_frozen.csv delta_4b_notso_base,HFv1 HellaSwag,76.1,,hf_open_llm_v1_240829_frozen.csv delta_4b_notso_base,HFv1 MMLU,57.26,,hf_open_llm_v1_240829_frozen.csv delta_4b_notso_base,HFv1 TruthfulQA,54.31,,hf_open_llm_v1_240829_frozen.csv delta_4b_notso_base,HFv1 Winogrande,76.09,,hf_open_llm_v1_240829_frozen.csv delta_4b_orange,HF OpenLLM v1,62.23,,hf_open_llm_v1_240829_frozen.csv delta_4b_orange,HFv1 ARC,58.87,,hf_open_llm_v1_240829_frozen.csv delta_4b_orange,HFv1 GSM8K,48.14,,hf_open_llm_v1_240829_frozen.csv delta_4b_orange,HFv1 HellaSwag,76.59,,hf_open_llm_v1_240829_frozen.csv delta_4b_orange,HFv1 MMLU,56.5,,hf_open_llm_v1_240829_frozen.csv delta_4b_orange,HFv1 TruthfulQA,56.82,,hf_open_llm_v1_240829_frozen.csv delta_4b_orange,HFv1 Winogrande,76.48,,hf_open_llm_v1_240829_frozen.csv distilabeled_hermes_2_5_mistral_7b,HF OpenLLM v1,68.42,,hf_open_llm_v1_240829_frozen.csv distilabeled_hermes_2_5_mistral_7b,HFv1 ARC,66.3,,hf_open_llm_v1_240829_frozen.csv distilabeled_hermes_2_5_mistral_7b,HFv1 GSM8K,60.88,,hf_open_llm_v1_240829_frozen.csv distilabeled_hermes_2_5_mistral_7b,HFv1 HellaSwag,85.15,,hf_open_llm_v1_240829_frozen.csv distilabeled_hermes_2_5_mistral_7b,HFv1 
MMLU,63.5,,hf_open_llm_v1_240829_frozen.csv distilabeled_hermes_2_5_mistral_7b,HFv1 TruthfulQA,55.75,,hf_open_llm_v1_240829_frozen.csv distilabeled_hermes_2_5_mistral_7b,HFv1 Winogrande,78.93,,hf_open_llm_v1_240829_frozen.csv distilabeled_marcoro14_7b_slerp,HF OpenLLM v1,73.63,,hf_open_llm_v1_240829_frozen.csv distilabeled_marcoro14_7b_slerp,HFv1 ARC,70.73,,hf_open_llm_v1_240829_frozen.csv distilabeled_marcoro14_7b_slerp,HFv1 GSM8K,71.19,,hf_open_llm_v1_240829_frozen.csv distilabeled_marcoro14_7b_slerp,HFv1 HellaSwag,87.47,,hf_open_llm_v1_240829_frozen.csv distilabeled_marcoro14_7b_slerp,HFv1 MMLU,65.22,,hf_open_llm_v1_240829_frozen.csv distilabeled_marcoro14_7b_slerp,HFv1 TruthfulQA,65.1,,hf_open_llm_v1_240829_frozen.csv distilabeled_marcoro14_7b_slerp,HFv1 Winogrande,82.08,,hf_open_llm_v1_240829_frozen.csv distilabeled_marcoro14_7b_slerp_full,HF OpenLLM v1,73.4,,hf_open_llm_v1_240829_frozen.csv distilabeled_marcoro14_7b_slerp_full,HFv1 ARC,70.65,,hf_open_llm_v1_240829_frozen.csv distilabeled_marcoro14_7b_slerp_full,HFv1 GSM8K,70.66,,hf_open_llm_v1_240829_frozen.csv distilabeled_marcoro14_7b_slerp_full,HFv1 HellaSwag,87.55,,hf_open_llm_v1_240829_frozen.csv distilabeled_marcoro14_7b_slerp_full,HFv1 MMLU,65.33,,hf_open_llm_v1_240829_frozen.csv distilabeled_marcoro14_7b_slerp_full,HFv1 TruthfulQA,64.21,,hf_open_llm_v1_240829_frozen.csv distilabeled_marcoro14_7b_slerp_full,HFv1 Winogrande,82.0,,hf_open_llm_v1_240829_frozen.csv dociprollm_7b,HF OpenLLM v1,44.2,,hf_open_llm_v1_240829_frozen.csv dociprollm_7b,HFv1 ARC,47.87,,hf_open_llm_v1_240829_frozen.csv dociprollm_7b,HFv1 GSM8K,4.62,,hf_open_llm_v1_240829_frozen.csv dociprollm_7b,HFv1 HellaSwag,78.11,,hf_open_llm_v1_240829_frozen.csv dociprollm_7b,HFv1 MMLU,27.78,,hf_open_llm_v1_240829_frozen.csv dociprollm_7b,HFv1 TruthfulQA,34.26,,hf_open_llm_v1_240829_frozen.csv dociprollm_7b,HFv1 Winogrande,72.53,,hf_open_llm_v1_240829_frozen.csv dolphin_2_1_mistral_7b,HF OpenLLM v1,61.12,,hf_open_llm_v1_240829_frozen.csv dolphin_2_1_mistral_7b,HFv1 ARC,64.42,,hf_open_llm_v1_240829_frozen.csv dolphin_2_1_mistral_7b,HFv1 GSM8K,20.77,,hf_open_llm_v1_240829_frozen.csv dolphin_2_1_mistral_7b,HFv1 HellaSwag,84.92,,hf_open_llm_v1_240829_frozen.csv dolphin_2_1_mistral_7b,HFv1 MMLU,63.32,,hf_open_llm_v1_240829_frozen.csv dolphin_2_1_mistral_7b,HFv1 TruthfulQA,55.56,,hf_open_llm_v1_240829_frozen.csv dolphin_2_1_mistral_7b,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv dolphin_2_1_mistral_7b_snr_laser,HF OpenLLM v1,65.5,,hf_open_llm_v1_240829_frozen.csv dolphin_2_1_mistral_7b_snr_laser,HFv1 ARC,63.82,,hf_open_llm_v1_240829_frozen.csv dolphin_2_1_mistral_7b_snr_laser,HFv1 GSM8K,47.23,,hf_open_llm_v1_240829_frozen.csv dolphin_2_1_mistral_7b_snr_laser,HFv1 HellaSwag,84.78,,hf_open_llm_v1_240829_frozen.csv dolphin_2_1_mistral_7b_snr_laser,HFv1 MMLU,63.63,,hf_open_llm_v1_240829_frozen.csv dolphin_2_1_mistral_7b_snr_laser,HFv1 TruthfulQA,55.24,,hf_open_llm_v1_240829_frozen.csv dolphin_2_1_mistral_7b_snr_laser,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv dolphin_2_1_mistral_7b_snr_math_laser,HF OpenLLM v1,65.03,,hf_open_llm_v1_240829_frozen.csv dolphin_2_1_mistral_7b_snr_math_laser,HFv1 ARC,63.31,,hf_open_llm_v1_240829_frozen.csv dolphin_2_1_mistral_7b_snr_math_laser,HFv1 GSM8K,47.23,,hf_open_llm_v1_240829_frozen.csv dolphin_2_1_mistral_7b_snr_math_laser,HFv1 HellaSwag,84.29,,hf_open_llm_v1_240829_frozen.csv dolphin_2_1_mistral_7b_snr_math_laser,HFv1 MMLU,63.02,,hf_open_llm_v1_240829_frozen.csv dolphin_2_1_mistral_7b_snr_math_laser,HFv1 
TruthfulQA,54.75,,hf_open_llm_v1_240829_frozen.csv dolphin_2_1_mistral_7b_snr_math_laser,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv dolphin_2_2_70b,HF OpenLLM v1,70.6,,hf_open_llm_v1_240829_frozen.csv dolphin_2_2_70b,HFv1 ARC,70.05,,hf_open_llm_v1_240829_frozen.csv dolphin_2_2_70b,HFv1 GSM8K,56.79,,hf_open_llm_v1_240829_frozen.csv dolphin_2_2_70b,HFv1 HellaSwag,85.97,,hf_open_llm_v1_240829_frozen.csv dolphin_2_2_70b,HFv1 MMLU,69.18,,hf_open_llm_v1_240829_frozen.csv dolphin_2_2_70b,HFv1 TruthfulQA,60.14,,hf_open_llm_v1_240829_frozen.csv dolphin_2_2_70b,HFv1 Winogrande,81.45,,hf_open_llm_v1_240829_frozen.csv dolphin_2_2_yi_34b_200k,HF OpenLLM v1,46.67,,hf_open_llm_v1_240829_frozen.csv dolphin_2_2_yi_34b_200k,HFv1 ARC,42.15,,hf_open_llm_v1_240829_frozen.csv dolphin_2_2_yi_34b_200k,HFv1 GSM8K,3.71,,hf_open_llm_v1_240829_frozen.csv dolphin_2_2_yi_34b_200k,HFv1 HellaSwag,68.18,,hf_open_llm_v1_240829_frozen.csv dolphin_2_2_yi_34b_200k,HFv1 MMLU,55.47,,hf_open_llm_v1_240829_frozen.csv dolphin_2_2_yi_34b_200k,HFv1 TruthfulQA,45.93,,hf_open_llm_v1_240829_frozen.csv dolphin_2_2_yi_34b_200k,HFv1 Winogrande,64.56,,hf_open_llm_v1_240829_frozen.csv dolphin_2_6_mistral_7b_dpo_5_93b,HF OpenLLM v1,40.62,,hf_open_llm_v1_240829_frozen.csv dolphin_2_6_mistral_7b_dpo_5_93b,HFv1 ARC,38.99,,hf_open_llm_v1_240829_frozen.csv dolphin_2_6_mistral_7b_dpo_5_93b,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv dolphin_2_6_mistral_7b_dpo_5_93b,HFv1 HellaSwag,61.01,,hf_open_llm_v1_240829_frozen.csv dolphin_2_6_mistral_7b_dpo_5_93b,HFv1 MMLU,27.32,,hf_open_llm_v1_240829_frozen.csv dolphin_2_6_mistral_7b_dpo_5_93b,HFv1 TruthfulQA,53.51,,hf_open_llm_v1_240829_frozen.csv dolphin_2_6_mistral_7b_dpo_5_93b,HFv1 Winogrande,62.67,,hf_open_llm_v1_240829_frozen.csv dolphin_2_8_experiment26_7b,HF OpenLLM v1,68.6,,hf_open_llm_v1_240829_frozen.csv dolphin_2_8_experiment26_7b,HFv1 ARC,64.51,,hf_open_llm_v1_240829_frozen.csv dolphin_2_8_experiment26_7b,HFv1 GSM8K,63.61,,hf_open_llm_v1_240829_frozen.csv dolphin_2_8_experiment26_7b,HFv1 HellaSwag,83.79,,hf_open_llm_v1_240829_frozen.csv dolphin_2_8_experiment26_7b,HFv1 MMLU,63.24,,hf_open_llm_v1_240829_frozen.csv dolphin_2_8_experiment26_7b,HFv1 TruthfulQA,55.1,,hf_open_llm_v1_240829_frozen.csv dolphin_2_8_experiment26_7b,HFv1 Winogrande,81.61,,hf_open_llm_v1_240829_frozen.csv dolphin_2_9_1_phi_3_kensho_4_5b,HF OpenLLM v1,63.7,,hf_open_llm_v1_240829_frozen.csv dolphin_2_9_1_phi_3_kensho_4_5b,HFv1 ARC,58.53,,hf_open_llm_v1_240829_frozen.csv dolphin_2_9_1_phi_3_kensho_4_5b,HFv1 GSM8K,57.01,,hf_open_llm_v1_240829_frozen.csv dolphin_2_9_1_phi_3_kensho_4_5b,HFv1 HellaSwag,74.69,,hf_open_llm_v1_240829_frozen.csv dolphin_2_9_1_phi_3_kensho_4_5b,HFv1 MMLU,65.98,,hf_open_llm_v1_240829_frozen.csv dolphin_2_9_1_phi_3_kensho_4_5b,HFv1 TruthfulQA,52.25,,hf_open_llm_v1_240829_frozen.csv dolphin_2_9_1_phi_3_kensho_4_5b,HFv1 Winogrande,73.72,,hf_open_llm_v1_240829_frozen.csv dolphin_2_9_1_yi_1_5_34b,HF OpenLLM v1,75.05,,hf_open_llm_v1_240829_frozen.csv dolphin_2_9_1_yi_1_5_34b,HFv1 ARC,69.37,,hf_open_llm_v1_240829_frozen.csv dolphin_2_9_1_yi_1_5_34b,HFv1 GSM8K,73.01,,hf_open_llm_v1_240829_frozen.csv dolphin_2_9_1_yi_1_5_34b,HFv1 HellaSwag,85.53,,hf_open_llm_v1_240829_frozen.csv dolphin_2_9_1_yi_1_5_34b,HFv1 MMLU,77.52,,hf_open_llm_v1_240829_frozen.csv dolphin_2_9_1_yi_1_5_34b,HFv1 TruthfulQA,62.34,,hf_open_llm_v1_240829_frozen.csv dolphin_2_9_1_yi_1_5_34b,HFv1 Winogrande,82.56,,hf_open_llm_v1_240829_frozen.csv dolphin_2_9_1_yi_1_5_9b,HF OpenLLM v1,68.93,,hf_open_llm_v1_240829_frozen.csv 
dolphin_2_9_1_yi_1_5_9b,HFv1 ARC,65.7,,hf_open_llm_v1_240829_frozen.csv dolphin_2_9_1_yi_1_5_9b,HFv1 GSM8K,65.35,,hf_open_llm_v1_240829_frozen.csv dolphin_2_9_1_yi_1_5_9b,HFv1 HellaSwag,81.02,,hf_open_llm_v1_240829_frozen.csv dolphin_2_9_1_yi_1_5_9b,HFv1 MMLU,70.82,,hf_open_llm_v1_240829_frozen.csv dolphin_2_9_1_yi_1_5_9b,HFv1 TruthfulQA,53.76,,hf_open_llm_v1_240829_frozen.csv dolphin_2_9_1_yi_1_5_9b,HFv1 Winogrande,77.51,,hf_open_llm_v1_240829_frozen.csv dolphin_nebula_7b,HF OpenLLM v1,58.69,,hf_open_llm_v1_240829_frozen.csv dolphin_nebula_7b,HFv1 ARC,55.2,,hf_open_llm_v1_240829_frozen.csv dolphin_nebula_7b,HFv1 GSM8K,33.06,,hf_open_llm_v1_240829_frozen.csv dolphin_nebula_7b,HFv1 HellaSwag,78.57,,hf_open_llm_v1_240829_frozen.csv dolphin_nebula_7b,HFv1 MMLU,53.44,,hf_open_llm_v1_240829_frozen.csv dolphin_nebula_7b,HFv1 TruthfulQA,57.97,,hf_open_llm_v1_240829_frozen.csv dolphin_nebula_7b,HFv1 Winogrande,73.88,,hf_open_llm_v1_240829_frozen.csv dopeyplats_1_1b_2t_v1,HF OpenLLM v1,35.28,,hf_open_llm_v1_240829_frozen.csv dopeyplats_1_1b_2t_v1,HFv1 ARC,33.11,,hf_open_llm_v1_240829_frozen.csv dopeyplats_1_1b_2t_v1,HFv1 GSM8K,1.67,,hf_open_llm_v1_240829_frozen.csv dopeyplats_1_1b_2t_v1,HFv1 HellaSwag,54.31,,hf_open_llm_v1_240829_frozen.csv dopeyplats_1_1b_2t_v1,HFv1 MMLU,24.55,,hf_open_llm_v1_240829_frozen.csv dopeyplats_1_1b_2t_v1,HFv1 TruthfulQA,39.26,,hf_open_llm_v1_240829_frozen.csv dopeyplats_1_1b_2t_v1,HFv1 Winogrande,58.8,,hf_open_llm_v1_240829_frozen.csv dopeyshearedplats_1_3b_v1,HF OpenLLM v1,36.74,,hf_open_llm_v1_240829_frozen.csv dopeyshearedplats_1_3b_v1,HFv1 ARC,34.39,,hf_open_llm_v1_240829_frozen.csv dopeyshearedplats_1_3b_v1,HFv1 GSM8K,0.76,,hf_open_llm_v1_240829_frozen.csv dopeyshearedplats_1_3b_v1,HFv1 HellaSwag,64.31,,hf_open_llm_v1_240829_frozen.csv dopeyshearedplats_1_3b_v1,HFv1 MMLU,25.4,,hf_open_llm_v1_240829_frozen.csv dopeyshearedplats_1_3b_v1,HFv1 TruthfulQA,38.21,,hf_open_llm_v1_240829_frozen.csv dopeyshearedplats_1_3b_v1,HFv1 Winogrande,57.38,,hf_open_llm_v1_240829_frozen.csv dough_instruct_base_001,HF OpenLLM v1,29.37,,hf_open_llm_v1_240829_frozen.csv dough_instruct_base_001,HFv1 ARC,23.89,,hf_open_llm_v1_240829_frozen.csv dough_instruct_base_001,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv dough_instruct_base_001,HFv1 HellaSwag,24.76,,hf_open_llm_v1_240829_frozen.csv dough_instruct_base_001,HFv1 MMLU,23.13,,hf_open_llm_v1_240829_frozen.csv dough_instruct_base_001,HFv1 TruthfulQA,53.4,,hf_open_llm_v1_240829_frozen.csv dough_instruct_base_001,HFv1 Winogrande,51.07,,hf_open_llm_v1_240829_frozen.csv dpo_binarized_neuraltrix_7b,HF OpenLLM v1,76.17,,hf_open_llm_v1_240829_frozen.csv dpo_binarized_neuraltrix_7b,HFv1 ARC,72.35,,hf_open_llm_v1_240829_frozen.csv dpo_binarized_neuraltrix_7b,HFv1 GSM8K,68.01,,hf_open_llm_v1_240829_frozen.csv dpo_binarized_neuraltrix_7b,HFv1 HellaSwag,88.89,,hf_open_llm_v1_240829_frozen.csv dpo_binarized_neuraltrix_7b,HFv1 MMLU,64.09,,hf_open_llm_v1_240829_frozen.csv dpo_binarized_neuraltrix_7b,HFv1 TruthfulQA,79.07,,hf_open_llm_v1_240829_frozen.csv dpo_binarized_neuraltrix_7b,HFv1 Winogrande,84.61,,hf_open_llm_v1_240829_frozen.csv dpo_binarized_neutrixomnibe_7b,HF OpenLLM v1,76.31,,hf_open_llm_v1_240829_frozen.csv dpo_binarized_neutrixomnibe_7b,HFv1 ARC,72.78,,hf_open_llm_v1_240829_frozen.csv dpo_binarized_neutrixomnibe_7b,HFv1 GSM8K,69.45,,hf_open_llm_v1_240829_frozen.csv dpo_binarized_neutrixomnibe_7b,HFv1 HellaSwag,89.05,,hf_open_llm_v1_240829_frozen.csv dpo_binarized_neutrixomnibe_7b,HFv1 MMLU,64.6,,hf_open_llm_v1_240829_frozen.csv 
dpo_binarized_neutrixomnibe_7b,HFv1 TruthfulQA,76.9,,hf_open_llm_v1_240829_frozen.csv dpo_binarized_neutrixomnibe_7b,HFv1 Winogrande,85.08,,hf_open_llm_v1_240829_frozen.csv dpo_miniguanaco_1_5t,HF OpenLLM v1,35.13,,hf_open_llm_v1_240829_frozen.csv dpo_miniguanaco_1_5t,HFv1 ARC,30.63,,hf_open_llm_v1_240829_frozen.csv dpo_miniguanaco_1_5t,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv dpo_miniguanaco_1_5t,HFv1 HellaSwag,54.05,,hf_open_llm_v1_240829_frozen.csv dpo_miniguanaco_1_5t,HFv1 MMLU,24.79,,hf_open_llm_v1_240829_frozen.csv dpo_miniguanaco_1_5t,HFv1 TruthfulQA,42.69,,hf_open_llm_v1_240829_frozen.csv dpo_miniguanaco_1_5t,HFv1 Winogrande,58.64,,hf_open_llm_v1_240829_frozen.csv dpo_phi2,HF OpenLLM v1,61.26,,hf_open_llm_v1_240829_frozen.csv dpo_phi2,HFv1 ARC,61.69,,hf_open_llm_v1_240829_frozen.csv dpo_phi2,HFv1 GSM8K,54.44,,hf_open_llm_v1_240829_frozen.csv dpo_phi2,HFv1 HellaSwag,75.13,,hf_open_llm_v1_240829_frozen.csv dpo_phi2,HFv1 MMLU,58.1,,hf_open_llm_v1_240829_frozen.csv dpo_phi2,HFv1 TruthfulQA,43.99,,hf_open_llm_v1_240829_frozen.csv dpo_phi2,HFv1 Winogrande,74.19,,hf_open_llm_v1_240829_frozen.csv dpo_qlora_qwen1_5_0_5b_chat_xtuner,HF OpenLLM v1,32.84,,hf_open_llm_v1_240829_frozen.csv dpo_qlora_qwen1_5_0_5b_chat_xtuner,HFv1 ARC,29.1,,hf_open_llm_v1_240829_frozen.csv dpo_qlora_qwen1_5_0_5b_chat_xtuner,HFv1 GSM8K,2.12,,hf_open_llm_v1_240829_frozen.csv dpo_qlora_qwen1_5_0_5b_chat_xtuner,HFv1 HellaSwag,41.45,,hf_open_llm_v1_240829_frozen.csv dpo_qlora_qwen1_5_0_5b_chat_xtuner,HFv1 MMLU,31.04,,hf_open_llm_v1_240829_frozen.csv dpo_qlora_qwen1_5_0_5b_chat_xtuner,HFv1 TruthfulQA,40.04,,hf_open_llm_v1_240829_frozen.csv dpo_qlora_qwen1_5_0_5b_chat_xtuner,HFv1 Winogrande,53.28,,hf_open_llm_v1_240829_frozen.csv dpo_qwen1_5_0_5b_chat,HF OpenLLM v1,33.47,,hf_open_llm_v1_240829_frozen.csv dpo_qwen1_5_0_5b_chat,HFv1 ARC,29.61,,hf_open_llm_v1_240829_frozen.csv dpo_qwen1_5_0_5b_chat,HFv1 GSM8K,2.81,,hf_open_llm_v1_240829_frozen.csv dpo_qwen1_5_0_5b_chat,HFv1 HellaSwag,42.71,,hf_open_llm_v1_240829_frozen.csv dpo_qwen1_5_0_5b_chat,HFv1 MMLU,30.64,,hf_open_llm_v1_240829_frozen.csv dpo_qwen1_5_0_5b_chat,HFv1 TruthfulQA,41.23,,hf_open_llm_v1_240829_frozen.csv dpo_qwen1_5_0_5b_chat,HFv1 Winogrande,53.83,,hf_open_llm_v1_240829_frozen.csv dpo_qwen1_5_0_5b_chat_alignment_handbook,HF OpenLLM v1,35.68,,hf_open_llm_v1_240829_frozen.csv dpo_qwen1_5_0_5b_chat_alignment_handbook,HFv1 ARC,31.83,,hf_open_llm_v1_240829_frozen.csv dpo_qwen1_5_0_5b_chat_alignment_handbook,HFv1 GSM8K,6.97,,hf_open_llm_v1_240829_frozen.csv dpo_qwen1_5_0_5b_chat_alignment_handbook,HFv1 HellaSwag,44.49,,hf_open_llm_v1_240829_frozen.csv dpo_qwen1_5_0_5b_chat_alignment_handbook,HFv1 MMLU,33.46,,hf_open_llm_v1_240829_frozen.csv dpo_qwen1_5_0_5b_chat_alignment_handbook,HFv1 TruthfulQA,42.05,,hf_open_llm_v1_240829_frozen.csv dpo_qwen1_5_0_5b_chat_alignment_handbook,HFv1 Winogrande,55.25,,hf_open_llm_v1_240829_frozen.csv dpo_test_hermes_open_llama3b,HF OpenLLM v1,39.42,,hf_open_llm_v1_240829_frozen.csv dpo_test_hermes_open_llama3b,HFv1 ARC,39.25,,hf_open_llm_v1_240829_frozen.csv dpo_test_hermes_open_llama3b,HFv1 GSM8K,1.36,,hf_open_llm_v1_240829_frozen.csv dpo_test_hermes_open_llama3b,HFv1 HellaSwag,67.46,,hf_open_llm_v1_240829_frozen.csv dpo_test_hermes_open_llama3b,HFv1 MMLU,24.21,,hf_open_llm_v1_240829_frozen.csv dpo_test_hermes_open_llama3b,HFv1 TruthfulQA,39.81,,hf_open_llm_v1_240829_frozen.csv dpo_test_hermes_open_llama3b,HFv1 Winogrande,64.4,,hf_open_llm_v1_240829_frozen.csv dpopenhermes_7b,HF OpenLLM 
v1,67.58,,hf_open_llm_v1_240829_frozen.csv dpopenhermes_7b,HFv1 ARC,65.7,,hf_open_llm_v1_240829_frozen.csv dpopenhermes_7b,HFv1 GSM8K,54.36,,hf_open_llm_v1_240829_frozen.csv dpopenhermes_7b,HFv1 HellaSwag,85.96,,hf_open_llm_v1_240829_frozen.csv dpopenhermes_7b,HFv1 MMLU,63.89,,hf_open_llm_v1_240829_frozen.csv dpopenhermes_7b,HFv1 TruthfulQA,56.95,,hf_open_llm_v1_240829_frozen.csv dpopenhermes_7b,HFv1 Winogrande,78.61,,hf_open_llm_v1_240829_frozen.csv dpopenhermes_7b_v2,HF OpenLLM v1,69.58,,hf_open_llm_v1_240829_frozen.csv dpopenhermes_7b_v2,HFv1 ARC,66.64,,hf_open_llm_v1_240829_frozen.csv dpopenhermes_7b_v2,HFv1 GSM8K,63.61,,hf_open_llm_v1_240829_frozen.csv dpopenhermes_7b_v2,HFv1 HellaSwag,85.22,,hf_open_llm_v1_240829_frozen.csv dpopenhermes_7b_v2,HFv1 MMLU,63.64,,hf_open_llm_v1_240829_frozen.csv dpopenhermes_7b_v2,HFv1 TruthfulQA,59.22,,hf_open_llm_v1_240829_frozen.csv dpopenhermes_7b_v2,HFv1 Winogrande,79.16,,hf_open_llm_v1_240829_frozen.csv ds_diasum_md_mixtral,HF OpenLLM v1,68.42,,hf_open_llm_v1_240829_frozen.csv ds_diasum_md_mixtral,HFv1 ARC,66.3,,hf_open_llm_v1_240829_frozen.csv ds_diasum_md_mixtral,HFv1 GSM8K,53.22,,hf_open_llm_v1_240829_frozen.csv ds_diasum_md_mixtral,HFv1 HellaSwag,85.45,,hf_open_llm_v1_240829_frozen.csv ds_diasum_md_mixtral,HFv1 MMLU,69.51,,hf_open_llm_v1_240829_frozen.csv ds_diasum_md_mixtral,HFv1 TruthfulQA,55.72,,hf_open_llm_v1_240829_frozen.csv ds_diasum_md_mixtral,HFv1 Winogrande,80.35,,hf_open_llm_v1_240829_frozen.csv duplicitous_mammal_13b,HF OpenLLM v1,56.57,,hf_open_llm_v1_240829_frozen.csv duplicitous_mammal_13b,HFv1 ARC,61.69,,hf_open_llm_v1_240829_frozen.csv duplicitous_mammal_13b,HFv1 GSM8K,9.1,,hf_open_llm_v1_240829_frozen.csv duplicitous_mammal_13b,HFv1 HellaSwag,83.79,,hf_open_llm_v1_240829_frozen.csv duplicitous_mammal_13b,HFv1 MMLU,57.5,,hf_open_llm_v1_240829_frozen.csv duplicitous_mammal_13b,HFv1 TruthfulQA,52.27,,hf_open_llm_v1_240829_frozen.csv duplicitous_mammal_13b,HFv1 Winogrande,75.06,,hf_open_llm_v1_240829_frozen.csv duplicitous_slurpbeast_13b,HF OpenLLM v1,56.62,,hf_open_llm_v1_240829_frozen.csv duplicitous_slurpbeast_13b,HFv1 ARC,62.12,,hf_open_llm_v1_240829_frozen.csv duplicitous_slurpbeast_13b,HFv1 GSM8K,8.79,,hf_open_llm_v1_240829_frozen.csv duplicitous_slurpbeast_13b,HFv1 HellaSwag,83.92,,hf_open_llm_v1_240829_frozen.csv duplicitous_slurpbeast_13b,HFv1 MMLU,57.53,,hf_open_llm_v1_240829_frozen.csv duplicitous_slurpbeast_13b,HFv1 TruthfulQA,52.33,,hf_open_llm_v1_240829_frozen.csv duplicitous_slurpbeast_13b,HFv1 Winogrande,75.06,,hf_open_llm_v1_240829_frozen.csv eastasia_4x7b_moe_experiment,HF OpenLLM v1,42.12,,hf_open_llm_v1_240829_frozen.csv eastasia_4x7b_moe_experiment,HFv1 ARC,39.51,,hf_open_llm_v1_240829_frozen.csv eastasia_4x7b_moe_experiment,HFv1 GSM8K,0.15,,hf_open_llm_v1_240829_frozen.csv eastasia_4x7b_moe_experiment,HFv1 HellaSwag,48.92,,hf_open_llm_v1_240829_frozen.csv eastasia_4x7b_moe_experiment,HFv1 MMLU,56.2,,hf_open_llm_v1_240829_frozen.csv eastasia_4x7b_moe_experiment,HFv1 TruthfulQA,49.83,,hf_open_llm_v1_240829_frozen.csv eastasia_4x7b_moe_experiment,HFv1 Winogrande,58.09,,hf_open_llm_v1_240829_frozen.csv eeve_korean_2_8b_v1_0,HF OpenLLM v1,55.9,,hf_open_llm_v1_240829_frozen.csv eeve_korean_2_8b_v1_0,HFv1 ARC,57.25,,hf_open_llm_v1_240829_frozen.csv eeve_korean_2_8b_v1_0,HFv1 GSM8K,36.39,,hf_open_llm_v1_240829_frozen.csv eeve_korean_2_8b_v1_0,HFv1 HellaSwag,72.15,,hf_open_llm_v1_240829_frozen.csv eeve_korean_2_8b_v1_0,HFv1 MMLU,51.62,,hf_open_llm_v1_240829_frozen.csv eeve_korean_2_8b_v1_0,HFv1 
TruthfulQA,44.27,,hf_open_llm_v1_240829_frozen.csv eeve_korean_2_8b_v1_0,HFv1 Winogrande,73.72,,hf_open_llm_v1_240829_frozen.csv eeve_korean_instruct_10_8b_v1_0,HF OpenLLM v1,66.48,,hf_open_llm_v1_240829_frozen.csv eeve_korean_instruct_10_8b_v1_0,HFv1 ARC,64.85,,hf_open_llm_v1_240829_frozen.csv eeve_korean_instruct_10_8b_v1_0,HFv1 GSM8K,50.72,,hf_open_llm_v1_240829_frozen.csv eeve_korean_instruct_10_8b_v1_0,HFv1 HellaSwag,83.04,,hf_open_llm_v1_240829_frozen.csv eeve_korean_instruct_10_8b_v1_0,HFv1 MMLU,64.23,,hf_open_llm_v1_240829_frozen.csv eeve_korean_instruct_10_8b_v1_0,HFv1 TruthfulQA,54.09,,hf_open_llm_v1_240829_frozen.csv eeve_korean_instruct_10_8b_v1_0,HFv1 Winogrande,81.93,,hf_open_llm_v1_240829_frozen.csv eeve_korean_instruct_2_8b_v1_0,HF OpenLLM v1,58.71,,hf_open_llm_v1_240829_frozen.csv eeve_korean_instruct_2_8b_v1_0,HFv1 ARC,58.28,,hf_open_llm_v1_240829_frozen.csv eeve_korean_instruct_2_8b_v1_0,HFv1 GSM8K,45.11,,hf_open_llm_v1_240829_frozen.csv eeve_korean_instruct_2_8b_v1_0,HFv1 HellaSwag,72.42,,hf_open_llm_v1_240829_frozen.csv eeve_korean_instruct_2_8b_v1_0,HFv1 MMLU,53.35,,hf_open_llm_v1_240829_frozen.csv eeve_korean_instruct_2_8b_v1_0,HFv1 TruthfulQA,48.32,,hf_open_llm_v1_240829_frozen.csv eeve_korean_instruct_2_8b_v1_0,HFv1 Winogrande,74.82,,hf_open_llm_v1_240829_frozen.csv einstein_4d_marcoro14_7b_full_slerp,HF OpenLLM v1,71.73,,hf_open_llm_v1_240829_frozen.csv einstein_4d_marcoro14_7b_full_slerp,HFv1 ARC,68.86,,hf_open_llm_v1_240829_frozen.csv einstein_4d_marcoro14_7b_full_slerp,HFv1 GSM8K,68.46,,hf_open_llm_v1_240829_frozen.csv einstein_4d_marcoro14_7b_full_slerp,HFv1 HellaSwag,85.98,,hf_open_llm_v1_240829_frozen.csv einstein_4d_marcoro14_7b_full_slerp,HFv1 MMLU,64.57,,hf_open_llm_v1_240829_frozen.csv einstein_4d_marcoro14_7b_full_slerp,HFv1 TruthfulQA,62.07,,hf_open_llm_v1_240829_frozen.csv einstein_4d_marcoro14_7b_full_slerp,HFv1 Winogrande,80.43,,hf_open_llm_v1_240829_frozen.csv einstein_4d_marcoro14_nddmpk_krishnahercules_7b_slerp,HF OpenLLM v1,73.08,,hf_open_llm_v1_240829_frozen.csv einstein_4d_marcoro14_nddmpk_krishnahercules_7b_slerp,HFv1 ARC,69.71,,hf_open_llm_v1_240829_frozen.csv einstein_4d_marcoro14_nddmpk_krishnahercules_7b_slerp,HFv1 GSM8K,70.66,,hf_open_llm_v1_240829_frozen.csv einstein_4d_marcoro14_nddmpk_krishnahercules_7b_slerp,HFv1 HellaSwag,87.04,,hf_open_llm_v1_240829_frozen.csv einstein_4d_marcoro14_nddmpk_krishnahercules_7b_slerp,HFv1 MMLU,65.32,,hf_open_llm_v1_240829_frozen.csv einstein_4d_marcoro14_nddmpk_krishnahercules_7b_slerp,HFv1 TruthfulQA,64.37,,hf_open_llm_v1_240829_frozen.csv einstein_4d_marcoro14_nddmpk_krishnahercules_7b_slerp,HFv1 Winogrande,81.37,,hf_open_llm_v1_240829_frozen.csv einstein_4d_moe_2x7b_test,HF OpenLLM v1,72.5,,hf_open_llm_v1_240829_frozen.csv einstein_4d_moe_2x7b_test,HFv1 ARC,69.71,,hf_open_llm_v1_240829_frozen.csv einstein_4d_moe_2x7b_test,HFv1 GSM8K,69.6,,hf_open_llm_v1_240829_frozen.csv einstein_4d_moe_2x7b_test,HFv1 HellaSwag,86.52,,hf_open_llm_v1_240829_frozen.csv einstein_4d_moe_2x7b_test,HFv1 MMLU,65.41,,hf_open_llm_v1_240829_frozen.csv einstein_4d_moe_2x7b_test,HFv1 TruthfulQA,62.29,,hf_open_llm_v1_240829_frozen.csv einstein_4d_moe_2x7b_test,HFv1 Winogrande,81.45,,hf_open_llm_v1_240829_frozen.csv einstein_v4_7b,HF OpenLLM v1,66.62,,hf_open_llm_v1_240829_frozen.csv einstein_v4_7b,HFv1 ARC,64.68,,hf_open_llm_v1_240829_frozen.csv einstein_v4_7b,HFv1 GSM8K,57.62,,hf_open_llm_v1_240829_frozen.csv einstein_v4_7b,HFv1 HellaSwag,83.75,,hf_open_llm_v1_240829_frozen.csv einstein_v4_7b,HFv1 
MMLU,62.31,,hf_open_llm_v1_240829_frozen.csv einstein_v4_7b,HFv1 TruthfulQA,55.15,,hf_open_llm_v1_240829_frozen.csv einstein_v4_7b,HFv1 Winogrande,76.24,,hf_open_llm_v1_240829_frozen.csv einstein_v4_phi2,HF OpenLLM v1,60.77,,hf_open_llm_v1_240829_frozen.csv einstein_v4_phi2,HFv1 ARC,59.98,,hf_open_llm_v1_240829_frozen.csv einstein_v4_phi2,HFv1 GSM8K,53.98,,hf_open_llm_v1_240829_frozen.csv einstein_v4_phi2,HFv1 HellaSwag,74.07,,hf_open_llm_v1_240829_frozen.csv einstein_v4_phi2,HFv1 MMLU,56.89,,hf_open_llm_v1_240829_frozen.csv einstein_v4_phi2,HFv1 TruthfulQA,45.8,,hf_open_llm_v1_240829_frozen.csv einstein_v4_phi2,HFv1 Winogrande,73.88,,hf_open_llm_v1_240829_frozen.csv einstein_v4_qwen_1_5_32b,HF OpenLLM v1,68.54,,hf_open_llm_v1_240829_frozen.csv einstein_v4_qwen_1_5_32b,HFv1 ARC,62.37,,hf_open_llm_v1_240829_frozen.csv einstein_v4_qwen_1_5_32b,HFv1 GSM8K,51.71,,hf_open_llm_v1_240829_frozen.csv einstein_v4_qwen_1_5_32b,HFv1 HellaSwag,83.85,,hf_open_llm_v1_240829_frozen.csv einstein_v4_qwen_1_5_32b,HFv1 MMLU,74.04,,hf_open_llm_v1_240829_frozen.csv einstein_v4_qwen_1_5_32b,HFv1 TruthfulQA,58.86,,hf_open_llm_v1_240829_frozen.csv einstein_v4_qwen_1_5_32b,HFv1 Winogrande,80.43,,hf_open_llm_v1_240829_frozen.csv einstein_v5_v0_2_7b,HF OpenLLM v1,65.65,,hf_open_llm_v1_240829_frozen.csv einstein_v5_v0_2_7b,HFv1 ARC,60.92,,hf_open_llm_v1_240829_frozen.csv einstein_v5_v0_2_7b,HFv1 GSM8K,59.67,,hf_open_llm_v1_240829_frozen.csv einstein_v5_v0_2_7b,HFv1 HellaSwag,80.99,,hf_open_llm_v1_240829_frozen.csv einstein_v5_v0_2_7b,HFv1 MMLU,61.02,,hf_open_llm_v1_240829_frozen.csv einstein_v5_v0_2_7b,HFv1 TruthfulQA,52.59,,hf_open_llm_v1_240829_frozen.csv einstein_v5_v0_2_7b,HFv1 Winogrande,78.69,,hf_open_llm_v1_240829_frozen.csv einstein_v6_1_llama3_8b_instruct_ties,HF OpenLLM v1,69.01,,hf_open_llm_v1_240829_frozen.csv einstein_v6_1_llama3_8b_instruct_ties,HFv1 ARC,63.23,,hf_open_llm_v1_240829_frozen.csv einstein_v6_1_llama3_8b_instruct_ties,HFv1 GSM8K,70.05,,hf_open_llm_v1_240829_frozen.csv einstein_v6_1_llama3_8b_instruct_ties,HFv1 HellaSwag,81.56,,hf_open_llm_v1_240829_frozen.csv einstein_v6_1_llama3_8b_instruct_ties,HFv1 MMLU,68.23,,hf_open_llm_v1_240829_frozen.csv einstein_v6_1_llama3_8b_instruct_ties,HFv1 TruthfulQA,52.44,,hf_open_llm_v1_240829_frozen.csv einstein_v6_1_llama3_8b_instruct_ties,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv einstein_v6_1_phi2,HF OpenLLM v1,61.25,,hf_open_llm_v1_240829_frozen.csv einstein_v6_1_phi2,HFv1 ARC,60.15,,hf_open_llm_v1_240829_frozen.csv einstein_v6_1_phi2,HFv1 GSM8K,56.48,,hf_open_llm_v1_240829_frozen.csv einstein_v6_1_phi2,HFv1 HellaSwag,74.31,,hf_open_llm_v1_240829_frozen.csv einstein_v6_1_phi2,HFv1 MMLU,56.82,,hf_open_llm_v1_240829_frozen.csv einstein_v6_1_phi2,HFv1 TruthfulQA,46.24,,hf_open_llm_v1_240829_frozen.csv einstein_v6_1_phi2,HFv1 Winogrande,73.48,,hf_open_llm_v1_240829_frozen.csv einstein_v6_7b,HF OpenLLM v1,67.12,,hf_open_llm_v1_240829_frozen.csv einstein_v6_7b,HFv1 ARC,63.57,,hf_open_llm_v1_240829_frozen.csv einstein_v6_7b,HFv1 GSM8K,63.53,,hf_open_llm_v1_240829_frozen.csv einstein_v6_7b,HFv1 HellaSwag,82.76,,hf_open_llm_v1_240829_frozen.csv einstein_v6_7b,HFv1 MMLU,62.23,,hf_open_llm_v1_240829_frozen.csv einstein_v6_7b,HFv1 TruthfulQA,52.02,,hf_open_llm_v1_240829_frozen.csv einstein_v6_7b,HFv1 Winogrande,78.61,,hf_open_llm_v1_240829_frozen.csv elyza_japanese_llama_2_7b_instruct,HF OpenLLM v1,49.78,,hf_open_llm_v1_240829_frozen.csv elyza_japanese_llama_2_7b_instruct,HFv1 ARC,53.16,,hf_open_llm_v1_240829_frozen.csv 
elyza_japanese_llama_2_7b_instruct,HFv1 GSM8K,7.88,,hf_open_llm_v1_240829_frozen.csv elyza_japanese_llama_2_7b_instruct,HFv1 HellaSwag,78.25,,hf_open_llm_v1_240829_frozen.csv elyza_japanese_llama_2_7b_instruct,HFv1 MMLU,47.07,,hf_open_llm_v1_240829_frozen.csv elyza_japanese_llama_2_7b_instruct,HFv1 TruthfulQA,39.08,,hf_open_llm_v1_240829_frozen.csv elyza_japanese_llama_2_7b_instruct,HFv1 Winogrande,73.24,,hf_open_llm_v1_240829_frozen.csv emertonbeagle_7b_dpo,HF OpenLLM v1,75.39,,hf_open_llm_v1_240829_frozen.csv emertonbeagle_7b_dpo,HFv1 ARC,72.78,,hf_open_llm_v1_240829_frozen.csv emertonbeagle_7b_dpo,HFv1 GSM8K,66.41,,hf_open_llm_v1_240829_frozen.csv emertonbeagle_7b_dpo,HFv1 HellaSwag,89.12,,hf_open_llm_v1_240829_frozen.csv emertonbeagle_7b_dpo,HFv1 MMLU,64.47,,hf_open_llm_v1_240829_frozen.csv emertonbeagle_7b_dpo,HFv1 TruthfulQA,75.96,,hf_open_llm_v1_240829_frozen.csv emertonbeagle_7b_dpo,HFv1 Winogrande,83.58,,hf_open_llm_v1_240829_frozen.csv emertonmonarch_7b,HF OpenLLM v1,75.74,,hf_open_llm_v1_240829_frozen.csv emertonmonarch_7b,HFv1 ARC,72.7,,hf_open_llm_v1_240829_frozen.csv emertonmonarch_7b,HFv1 GSM8K,65.28,,hf_open_llm_v1_240829_frozen.csv emertonmonarch_7b,HFv1 HellaSwag,89.16,,hf_open_llm_v1_240829_frozen.csv emertonmonarch_7b,HFv1 MMLU,64.05,,hf_open_llm_v1_240829_frozen.csv emertonmonarch_7b,HFv1 TruthfulQA,78.09,,hf_open_llm_v1_240829_frozen.csv emertonmonarch_7b,HFv1 Winogrande,85.16,,hf_open_llm_v1_240829_frozen.csv emertonomnibeagle_7b_dpo,HF OpenLLM v1,75.67,,hf_open_llm_v1_240829_frozen.csv emertonomnibeagle_7b_dpo,HFv1 ARC,72.7,,hf_open_llm_v1_240829_frozen.csv emertonomnibeagle_7b_dpo,HFv1 GSM8K,68.54,,hf_open_llm_v1_240829_frozen.csv emertonomnibeagle_7b_dpo,HFv1 HellaSwag,88.44,,hf_open_llm_v1_240829_frozen.csv emertonomnibeagle_7b_dpo,HFv1 MMLU,64.44,,hf_open_llm_v1_240829_frozen.csv emertonomnibeagle_7b_dpo,HFv1 TruthfulQA,75.62,,hf_open_llm_v1_240829_frozen.csv emertonomnibeagle_7b_dpo,HFv1 Winogrande,84.29,,hf_open_llm_v1_240829_frozen.csv ensemble5_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,54.76,,hf_open_llm_v1_240829_frozen.csv ensemble5_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,59.73,,hf_open_llm_v1_240829_frozen.csv ensemble5_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,1.9,,hf_open_llm_v1_240829_frozen.csv ensemble5_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,82.66,,hf_open_llm_v1_240829_frozen.csv ensemble5_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,56.94,,hf_open_llm_v1_240829_frozen.csv ensemble5_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,52.92,,hf_open_llm_v1_240829_frozen.csv ensemble5_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,74.43,,hf_open_llm_v1_240829_frozen.csv ensemblev5_nova_13b,HF OpenLLM v1,56.49,,hf_open_llm_v1_240829_frozen.csv ensemblev5_nova_13b,HFv1 ARC,62.71,,hf_open_llm_v1_240829_frozen.csv ensemblev5_nova_13b,HFv1 GSM8K,10.77,,hf_open_llm_v1_240829_frozen.csv ensemblev5_nova_13b,HFv1 HellaSwag,82.55,,hf_open_llm_v1_240829_frozen.csv ensemblev5_nova_13b,HFv1 MMLU,56.79,,hf_open_llm_v1_240829_frozen.csv ensemblev5_nova_13b,HFv1 TruthfulQA,49.86,,hf_open_llm_v1_240829_frozen.csv ensemblev5_nova_13b,HFv1 Winogrande,76.24,,hf_open_llm_v1_240829_frozen.csv eris_floramix_dpo_7b,HF OpenLLM v1,74.87,,hf_open_llm_v1_240829_frozen.csv eris_floramix_dpo_7b,HFv1 ARC,73.04,,hf_open_llm_v1_240829_frozen.csv eris_floramix_dpo_7b,HFv1 GSM8K,67.55,,hf_open_llm_v1_240829_frozen.csv eris_floramix_dpo_7b,HFv1 HellaSwag,88.28,,hf_open_llm_v1_240829_frozen.csv eris_floramix_dpo_7b,HFv1 MMLU,64.71,,hf_open_llm_v1_240829_frozen.csv 
eris_floramix_dpo_7b,HFv1 TruthfulQA,70.94,,hf_open_llm_v1_240829_frozen.csv eris_floramix_dpo_7b,HFv1 Winogrande,84.69,,hf_open_llm_v1_240829_frozen.csv eris_remix_dpo_7b,HF OpenLLM v1,74.71,,hf_open_llm_v1_240829_frozen.csv eris_remix_dpo_7b,HFv1 ARC,72.44,,hf_open_llm_v1_240829_frozen.csv eris_remix_dpo_7b,HFv1 GSM8K,68.84,,hf_open_llm_v1_240829_frozen.csv eris_remix_dpo_7b,HFv1 HellaSwag,88.03,,hf_open_llm_v1_240829_frozen.csv eris_remix_dpo_7b,HFv1 MMLU,65.29,,hf_open_llm_v1_240829_frozen.csv eris_remix_dpo_7b,HFv1 TruthfulQA,68.92,,hf_open_llm_v1_240829_frozen.csv eris_remix_dpo_7b,HFv1 Winogrande,84.77,,hf_open_llm_v1_240829_frozen.csv eros_n_psyche_7b_model_stock,HF OpenLLM v1,71.76,,hf_open_llm_v1_240829_frozen.csv eros_n_psyche_7b_model_stock,HFv1 ARC,69.2,,hf_open_llm_v1_240829_frozen.csv eros_n_psyche_7b_model_stock,HFv1 GSM8K,66.49,,hf_open_llm_v1_240829_frozen.csv eros_n_psyche_7b_model_stock,HFv1 HellaSwag,86.25,,hf_open_llm_v1_240829_frozen.csv eros_n_psyche_7b_model_stock,HFv1 MMLU,65.15,,hf_open_llm_v1_240829_frozen.csv eros_n_psyche_7b_model_stock,HFv1 TruthfulQA,62.9,,hf_open_llm_v1_240829_frozen.csv eros_n_psyche_7b_model_stock,HFv1 Winogrande,80.58,,hf_open_llm_v1_240829_frozen.csv eurus_70b_nca_fixed,HF OpenLLM v1,59.84,,hf_open_llm_v1_240829_frozen.csv eurus_70b_nca_fixed,HFv1 ARC,55.63,,hf_open_llm_v1_240829_frozen.csv eurus_70b_nca_fixed,HFv1 GSM8K,48.37,,hf_open_llm_v1_240829_frozen.csv eurus_70b_nca_fixed,HFv1 HellaSwag,72.38,,hf_open_llm_v1_240829_frozen.csv eurus_70b_nca_fixed,HFv1 MMLU,55.68,,hf_open_llm_v1_240829_frozen.csv eurus_70b_nca_fixed,HFv1 TruthfulQA,54.42,,hf_open_llm_v1_240829_frozen.csv eurus_70b_nca_fixed,HFv1 Winogrande,72.53,,hf_open_llm_v1_240829_frozen.csv eurus_70b_sft_fixed,HF OpenLLM v1,58.32,,hf_open_llm_v1_240829_frozen.csv eurus_70b_sft_fixed,HFv1 ARC,55.2,,hf_open_llm_v1_240829_frozen.csv eurus_70b_sft_fixed,HFv1 GSM8K,44.35,,hf_open_llm_v1_240829_frozen.csv eurus_70b_sft_fixed,HFv1 HellaSwag,73.33,,hf_open_llm_v1_240829_frozen.csv eurus_70b_sft_fixed,HFv1 MMLU,55.37,,hf_open_llm_v1_240829_frozen.csv eurus_70b_sft_fixed,HFv1 TruthfulQA,49.55,,hf_open_llm_v1_240829_frozen.csv eurus_70b_sft_fixed,HFv1 Winogrande,72.14,,hf_open_llm_v1_240829_frozen.csv evaloric_1_1b,HF OpenLLM v1,37.54,,hf_open_llm_v1_240829_frozen.csv evaloric_1_1b,HFv1 ARC,35.07,,hf_open_llm_v1_240829_frozen.csv evaloric_1_1b,HFv1 GSM8K,1.14,,hf_open_llm_v1_240829_frozen.csv evaloric_1_1b,HFv1 HellaSwag,60.93,,hf_open_llm_v1_240829_frozen.csv evaloric_1_1b,HFv1 MMLU,25.36,,hf_open_llm_v1_240829_frozen.csv evaloric_1_1b,HFv1 TruthfulQA,37.78,,hf_open_llm_v1_240829_frozen.csv evaloric_1_1b,HFv1 Winogrande,64.96,,hf_open_llm_v1_240829_frozen.csv evaloric_1_1b_test,HF OpenLLM v1,39.43,,hf_open_llm_v1_240829_frozen.csv evaloric_1_1b_test,HFv1 ARC,36.6,,hf_open_llm_v1_240829_frozen.csv evaloric_1_1b_test,HFv1 GSM8K,5.0,,hf_open_llm_v1_240829_frozen.csv evaloric_1_1b_test,HFv1 HellaSwag,60.97,,hf_open_llm_v1_240829_frozen.csv evaloric_1_1b_test,HFv1 MMLU,26.12,,hf_open_llm_v1_240829_frozen.csv evaloric_1_1b_test,HFv1 TruthfulQA,38.28,,hf_open_llm_v1_240829_frozen.csv evaloric_1_1b_test,HFv1 Winogrande,69.61,,hf_open_llm_v1_240829_frozen.csv evangelion_7b,HF OpenLLM v1,71.71,,hf_open_llm_v1_240829_frozen.csv evangelion_7b,HFv1 ARC,68.94,,hf_open_llm_v1_240829_frozen.csv evangelion_7b,HFv1 GSM8K,66.94,,hf_open_llm_v1_240829_frozen.csv evangelion_7b,HFv1 HellaSwag,86.45,,hf_open_llm_v1_240829_frozen.csv evangelion_7b,HFv1 MMLU,63.97,,hf_open_llm_v1_240829_frozen.csv 
evangelion_7b,HFv1 TruthfulQA,64.01,,hf_open_llm_v1_240829_frozen.csv evangelion_7b,HFv1 Winogrande,79.95,,hf_open_llm_v1_240829_frozen.csv everynight_7b_slerp,HF OpenLLM v1,72.54,,hf_open_llm_v1_240829_frozen.csv everynight_7b_slerp,HFv1 ARC,70.05,,hf_open_llm_v1_240829_frozen.csv everynight_7b_slerp,HFv1 GSM8K,63.68,,hf_open_llm_v1_240829_frozen.csv everynight_7b_slerp,HFv1 HellaSwag,87.7,,hf_open_llm_v1_240829_frozen.csv everynight_7b_slerp,HFv1 MMLU,64.88,,hf_open_llm_v1_240829_frozen.csv everynight_7b_slerp,HFv1 TruthfulQA,66.07,,hf_open_llm_v1_240829_frozen.csv everynight_7b_slerp,HFv1 Winogrande,82.87,,hf_open_llm_v1_240829_frozen.csv everyone_coder_33b_base,HF OpenLLM v1,49.48,,hf_open_llm_v1_240829_frozen.csv everyone_coder_33b_base,HFv1 ARC,45.99,,hf_open_llm_v1_240829_frozen.csv everyone_coder_33b_base,HFv1 GSM8K,39.8,,hf_open_llm_v1_240829_frozen.csv everyone_coder_33b_base,HFv1 HellaSwag,61.71,,hf_open_llm_v1_240829_frozen.csv everyone_coder_33b_base,HFv1 MMLU,44.05,,hf_open_llm_v1_240829_frozen.csv everyone_coder_33b_base,HFv1 TruthfulQA,42.26,,hf_open_llm_v1_240829_frozen.csv everyone_coder_33b_base,HFv1 Winogrande,63.06,,hf_open_llm_v1_240829_frozen.csv everythinglm_13b_v3_peft,HF OpenLLM v1,54.24,,hf_open_llm_v1_240829_frozen.csv everythinglm_13b_v3_peft,HFv1 ARC,58.36,,hf_open_llm_v1_240829_frozen.csv everythinglm_13b_v3_peft,HFv1 GSM8K,5.53,,hf_open_llm_v1_240829_frozen.csv everythinglm_13b_v3_peft,HFv1 HellaSwag,81.03,,hf_open_llm_v1_240829_frozen.csv everythinglm_13b_v3_peft,HFv1 MMLU,54.7,,hf_open_llm_v1_240829_frozen.csv everythinglm_13b_v3_peft,HFv1 TruthfulQA,52.98,,hf_open_llm_v1_240829_frozen.csv everythinglm_13b_v3_peft,HFv1 Winogrande,72.85,,hf_open_llm_v1_240829_frozen.csv ex_llm_e1,HF OpenLLM v1,43.11,,hf_open_llm_v1_240829_frozen.csv ex_llm_e1,HFv1 ARC,39.93,,hf_open_llm_v1_240829_frozen.csv ex_llm_e1,HFv1 GSM8K,4.32,,hf_open_llm_v1_240829_frozen.csv ex_llm_e1,HFv1 HellaSwag,68.11,,hf_open_llm_v1_240829_frozen.csv ex_llm_e1,HFv1 MMLU,39.44,,hf_open_llm_v1_240829_frozen.csv ex_llm_e1,HFv1 TruthfulQA,42.01,,hf_open_llm_v1_240829_frozen.csv ex_llm_e1,HFv1 Winogrande,64.88,,hf_open_llm_v1_240829_frozen.csv excalibur_7b_dpo,HF OpenLLM v1,73.84,,hf_open_llm_v1_240829_frozen.csv excalibur_7b_dpo,HFv1 ARC,70.9,,hf_open_llm_v1_240829_frozen.csv excalibur_7b_dpo,HFv1 GSM8K,65.43,,hf_open_llm_v1_240829_frozen.csv excalibur_7b_dpo,HFv1 HellaSwag,87.93,,hf_open_llm_v1_240829_frozen.csv excalibur_7b_dpo,HFv1 MMLU,65.46,,hf_open_llm_v1_240829_frozen.csv excalibur_7b_dpo,HFv1 TruthfulQA,70.82,,hf_open_llm_v1_240829_frozen.csv excalibur_7b_dpo,HFv1 Winogrande,82.48,,hf_open_llm_v1_240829_frozen.csv experiment26_spin_iter_0,HF OpenLLM v1,76.04,,hf_open_llm_v1_240829_frozen.csv experiment26_spin_iter_0,HFv1 ARC,72.44,,hf_open_llm_v1_240829_frozen.csv experiment26_spin_iter_0,HFv1 GSM8K,70.28,,hf_open_llm_v1_240829_frozen.csv experiment26_spin_iter_0,HFv1 HellaSwag,88.74,,hf_open_llm_v1_240829_frozen.csv experiment26_spin_iter_0,HFv1 MMLU,64.64,,hf_open_llm_v1_240829_frozen.csv experiment26_spin_iter_0,HFv1 TruthfulQA,74.9,,hf_open_llm_v1_240829_frozen.csv experiment26_spin_iter_0,HFv1 Winogrande,85.24,,hf_open_llm_v1_240829_frozen.csv experiment_dpo_m7b2_1_merged,HF OpenLLM v1,59.52,,hf_open_llm_v1_240829_frozen.csv experiment_dpo_m7b2_1_merged,HFv1 ARC,59.47,,hf_open_llm_v1_240829_frozen.csv experiment_dpo_m7b2_1_merged,HFv1 GSM8K,34.72,,hf_open_llm_v1_240829_frozen.csv experiment_dpo_m7b2_1_merged,HFv1 HellaSwag,82.42,,hf_open_llm_v1_240829_frozen.csv 
experiment_dpo_m7b2_1_merged,HFv1 MMLU,62.21,,hf_open_llm_v1_240829_frozen.csv experiment_dpo_m7b2_1_merged,HFv1 TruthfulQA,40.01,,hf_open_llm_v1_240829_frozen.csv experiment_dpo_m7b2_1_merged,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv experiment_dpo_m7b2_3_merged,HF OpenLLM v1,29.55,,hf_open_llm_v1_240829_frozen.csv experiment_dpo_m7b2_3_merged,HFv1 ARC,29.52,,hf_open_llm_v1_240829_frozen.csv experiment_dpo_m7b2_3_merged,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv experiment_dpo_m7b2_3_merged,HFv1 HellaSwag,25.9,,hf_open_llm_v1_240829_frozen.csv experiment_dpo_m7b2_3_merged,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv experiment_dpo_m7b2_3_merged,HFv1 TruthfulQA,48.27,,hf_open_llm_v1_240829_frozen.csv experiment_dpo_m7b2_3_merged,HFv1 Winogrande,50.51,,hf_open_llm_v1_240829_frozen.csv experiment_orpo_m7b2_1_merged,HF OpenLLM v1,59.62,,hf_open_llm_v1_240829_frozen.csv experiment_orpo_m7b2_1_merged,HFv1 ARC,59.39,,hf_open_llm_v1_240829_frozen.csv experiment_orpo_m7b2_1_merged,HFv1 GSM8K,34.5,,hf_open_llm_v1_240829_frozen.csv experiment_orpo_m7b2_1_merged,HFv1 HellaSwag,82.48,,hf_open_llm_v1_240829_frozen.csv experiment_orpo_m7b2_1_merged,HFv1 MMLU,62.61,,hf_open_llm_v1_240829_frozen.csv experiment_orpo_m7b2_1_merged,HFv1 TruthfulQA,40.38,,hf_open_llm_v1_240829_frozen.csv experiment_orpo_m7b2_1_merged,HFv1 Winogrande,78.37,,hf_open_llm_v1_240829_frozen.csv experiment_orpo_m7b2_2_merged,HF OpenLLM v1,59.54,,hf_open_llm_v1_240829_frozen.csv experiment_orpo_m7b2_2_merged,HFv1 ARC,59.64,,hf_open_llm_v1_240829_frozen.csv experiment_orpo_m7b2_2_merged,HFv1 GSM8K,34.42,,hf_open_llm_v1_240829_frozen.csv experiment_orpo_m7b2_2_merged,HFv1 HellaSwag,82.44,,hf_open_llm_v1_240829_frozen.csv experiment_orpo_m7b2_2_merged,HFv1 MMLU,62.25,,hf_open_llm_v1_240829_frozen.csv experiment_orpo_m7b2_2_merged,HFv1 TruthfulQA,40.09,,hf_open_llm_v1_240829_frozen.csv experiment_orpo_m7b2_2_merged,HFv1 Winogrande,78.37,,hf_open_llm_v1_240829_frozen.csv experiment_sft_m7b2_1_merged,HF OpenLLM v1,56.93,,hf_open_llm_v1_240829_frozen.csv experiment_sft_m7b2_1_merged,HFv1 ARC,56.83,,hf_open_llm_v1_240829_frozen.csv experiment_sft_m7b2_1_merged,HFv1 GSM8K,25.32,,hf_open_llm_v1_240829_frozen.csv experiment_sft_m7b2_1_merged,HFv1 HellaSwag,79.75,,hf_open_llm_v1_240829_frozen.csv experiment_sft_m7b2_1_merged,HFv1 MMLU,56.76,,hf_open_llm_v1_240829_frozen.csv experiment_sft_m7b2_1_merged,HFv1 TruthfulQA,46.29,,hf_open_llm_v1_240829_frozen.csv experiment_sft_m7b2_1_merged,HFv1 Winogrande,76.64,,hf_open_llm_v1_240829_frozen.csv experiment_sft_m7b2_2_merged,HF OpenLLM v1,59.59,,hf_open_llm_v1_240829_frozen.csv experiment_sft_m7b2_2_merged,HFv1 ARC,59.3,,hf_open_llm_v1_240829_frozen.csv experiment_sft_m7b2_2_merged,HFv1 GSM8K,34.57,,hf_open_llm_v1_240829_frozen.csv experiment_sft_m7b2_2_merged,HFv1 HellaSwag,82.47,,hf_open_llm_v1_240829_frozen.csv experiment_sft_m7b2_2_merged,HFv1 MMLU,62.42,,hf_open_llm_v1_240829_frozen.csv experiment_sft_m7b2_2_merged,HFv1 TruthfulQA,40.25,,hf_open_llm_v1_240829_frozen.csv experiment_sft_m7b2_2_merged,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv experiment_sft_m7b2_3_merged,HF OpenLLM v1,59.55,,hf_open_llm_v1_240829_frozen.csv experiment_sft_m7b2_3_merged,HFv1 ARC,59.56,,hf_open_llm_v1_240829_frozen.csv experiment_sft_m7b2_3_merged,HFv1 GSM8K,34.57,,hf_open_llm_v1_240829_frozen.csv experiment_sft_m7b2_3_merged,HFv1 HellaSwag,82.39,,hf_open_llm_v1_240829_frozen.csv experiment_sft_m7b2_3_merged,HFv1 MMLU,62.3,,hf_open_llm_v1_240829_frozen.csv 
experiment_sft_m7b2_3_merged,HFv1 TruthfulQA,40.04,,hf_open_llm_v1_240829_frozen.csv experiment_sft_m7b2_3_merged,HFv1 Winogrande,78.45,,hf_open_llm_v1_240829_frozen.csv facebook_opt_125m_qcqa_ub_6_best_for_kv_cache,HF OpenLLM v1,28.66,,hf_open_llm_v1_240829_frozen.csv facebook_opt_125m_qcqa_ub_6_best_for_kv_cache,HFv1 ARC,24.23,,hf_open_llm_v1_240829_frozen.csv facebook_opt_125m_qcqa_ub_6_best_for_kv_cache,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv facebook_opt_125m_qcqa_ub_6_best_for_kv_cache,HFv1 HellaSwag,25.0,,hf_open_llm_v1_240829_frozen.csv facebook_opt_125m_qcqa_ub_6_best_for_kv_cache,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv facebook_opt_125m_qcqa_ub_6_best_for_kv_cache,HFv1 TruthfulQA,48.41,,hf_open_llm_v1_240829_frozen.csv facebook_opt_125m_qcqa_ub_6_best_for_kv_cache,HFv1 Winogrande,51.22,,hf_open_llm_v1_240829_frozen.csv facebook_opt_125m_qcqa_ub_6_best_for_q_loss,HF OpenLLM v1,28.37,,hf_open_llm_v1_240829_frozen.csv facebook_opt_125m_qcqa_ub_6_best_for_q_loss,HFv1 ARC,23.29,,hf_open_llm_v1_240829_frozen.csv facebook_opt_125m_qcqa_ub_6_best_for_q_loss,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv facebook_opt_125m_qcqa_ub_6_best_for_q_loss,HFv1 HellaSwag,25.57,,hf_open_llm_v1_240829_frozen.csv facebook_opt_125m_qcqa_ub_6_best_for_q_loss,HFv1 MMLU,23.15,,hf_open_llm_v1_240829_frozen.csv facebook_opt_125m_qcqa_ub_6_best_for_q_loss,HFv1 TruthfulQA,49.03,,hf_open_llm_v1_240829_frozen.csv facebook_opt_125m_qcqa_ub_6_best_for_q_loss,HFv1 Winogrande,49.17,,hf_open_llm_v1_240829_frozen.csv facebook_opt_6_7b_gqa_ub_16_best_for_kv_cache,HF OpenLLM v1,28.84,,hf_open_llm_v1_240829_frozen.csv facebook_opt_6_7b_gqa_ub_16_best_for_kv_cache,HFv1 ARC,23.04,,hf_open_llm_v1_240829_frozen.csv facebook_opt_6_7b_gqa_ub_16_best_for_kv_cache,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv facebook_opt_6_7b_gqa_ub_16_best_for_kv_cache,HFv1 HellaSwag,25.94,,hf_open_llm_v1_240829_frozen.csv facebook_opt_6_7b_gqa_ub_16_best_for_kv_cache,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv facebook_opt_6_7b_gqa_ub_16_best_for_kv_cache,HFv1 TruthfulQA,48.99,,hf_open_llm_v1_240829_frozen.csv facebook_opt_6_7b_gqa_ub_16_best_for_kv_cache,HFv1 Winogrande,51.93,,hf_open_llm_v1_240829_frozen.csv facebook_opt_6_7b_qcqa_ub_16_best_for_kv_cache,HF OpenLLM v1,28.58,,hf_open_llm_v1_240829_frozen.csv facebook_opt_6_7b_qcqa_ub_16_best_for_kv_cache,HFv1 ARC,23.81,,hf_open_llm_v1_240829_frozen.csv facebook_opt_6_7b_qcqa_ub_16_best_for_kv_cache,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv facebook_opt_6_7b_qcqa_ub_16_best_for_kv_cache,HFv1 HellaSwag,27.05,,hf_open_llm_v1_240829_frozen.csv facebook_opt_6_7b_qcqa_ub_16_best_for_kv_cache,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv facebook_opt_6_7b_qcqa_ub_16_best_for_kv_cache,HFv1 TruthfulQA,46.69,,hf_open_llm_v1_240829_frozen.csv facebook_opt_6_7b_qcqa_ub_16_best_for_kv_cache,HFv1 Winogrande,50.83,,hf_open_llm_v1_240829_frozen.csv facebook_opt_6_7b_qcqa_ub_16_best_for_q_loss,HF OpenLLM v1,28.25,,hf_open_llm_v1_240829_frozen.csv facebook_opt_6_7b_qcqa_ub_16_best_for_q_loss,HFv1 ARC,21.67,,hf_open_llm_v1_240829_frozen.csv facebook_opt_6_7b_qcqa_ub_16_best_for_q_loss,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv facebook_opt_6_7b_qcqa_ub_16_best_for_q_loss,HFv1 HellaSwag,26.65,,hf_open_llm_v1_240829_frozen.csv facebook_opt_6_7b_qcqa_ub_16_best_for_q_loss,HFv1 MMLU,23.15,,hf_open_llm_v1_240829_frozen.csv facebook_opt_6_7b_qcqa_ub_16_best_for_q_loss,HFv1 TruthfulQA,46.81,,hf_open_llm_v1_240829_frozen.csv 
facebook_opt_6_7b_qcqa_ub_16_best_for_q_loss,HFv1 Winogrande,51.22,,hf_open_llm_v1_240829_frozen.csv falcon_11b,HF OpenLLM v1,64.28,,hf_open_llm_v1_240829_frozen.csv falcon_11b,HFv1 ARC,59.73,,hf_open_llm_v1_240829_frozen.csv falcon_11b,HFv1 GSM8K,53.83,,hf_open_llm_v1_240829_frozen.csv falcon_11b,HFv1 HellaSwag,82.91,,hf_open_llm_v1_240829_frozen.csv falcon_11b,HFv1 MMLU,58.37,,hf_open_llm_v1_240829_frozen.csv falcon_11b,HFv1 TruthfulQA,52.56,,hf_open_llm_v1_240829_frozen.csv falcon_11b,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv falcon_180b,HF OpenLLM v1,67.85,,hf_open_llm_v1_240829_frozen.csv falcon_180b,HFv1 ARC,69.45,,hf_open_llm_v1_240829_frozen.csv falcon_180b,HFv1 GSM8K,45.94,,hf_open_llm_v1_240829_frozen.csv falcon_180b,HFv1 HellaSwag,88.89,,hf_open_llm_v1_240829_frozen.csv falcon_180b,HFv1 MMLU,70.5,,hf_open_llm_v1_240829_frozen.csv falcon_180b,HFv1 TruthfulQA,45.47,,hf_open_llm_v1_240829_frozen.csv falcon_180b,HFv1 Winogrande,86.9,,hf_open_llm_v1_240829_frozen.csv falcon_1b_t_sft,HF OpenLLM v1,35.02,,hf_open_llm_v1_240829_frozen.csv falcon_1b_t_sft,HFv1 ARC,32.94,,hf_open_llm_v1_240829_frozen.csv falcon_1b_t_sft,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv falcon_1b_t_sft,HFv1 HellaSwag,57.24,,hf_open_llm_v1_240829_frozen.csv falcon_1b_t_sft,HFv1 MMLU,25.26,,hf_open_llm_v1_240829_frozen.csv falcon_1b_t_sft,HFv1 TruthfulQA,38.49,,hf_open_llm_v1_240829_frozen.csv falcon_1b_t_sft,HFv1 Winogrande,55.88,,hf_open_llm_v1_240829_frozen.csv falcon_40b,HF OpenLLM v1,58.07,,hf_open_llm_v1_240829_frozen.csv falcon_40b,HFv1 ARC,61.86,,hf_open_llm_v1_240829_frozen.csv falcon_40b,HFv1 GSM8K,21.46,,hf_open_llm_v1_240829_frozen.csv falcon_40b,HFv1 HellaSwag,85.28,,hf_open_llm_v1_240829_frozen.csv falcon_40b,HFv1 MMLU,56.89,,hf_open_llm_v1_240829_frozen.csv falcon_40b,HFv1 TruthfulQA,41.65,,hf_open_llm_v1_240829_frozen.csv falcon_40b,HFv1 Winogrande,81.29,,hf_open_llm_v1_240829_frozen.csv falcon_7b,HF OpenLLM v1,44.17,,hf_open_llm_v1_240829_frozen.csv falcon_7b,HFv1 ARC,47.87,,hf_open_llm_v1_240829_frozen.csv falcon_7b,HFv1 GSM8K,4.62,,hf_open_llm_v1_240829_frozen.csv falcon_7b,HFv1 HellaSwag,78.13,,hf_open_llm_v1_240829_frozen.csv falcon_7b,HFv1 MMLU,27.79,,hf_open_llm_v1_240829_frozen.csv falcon_7b,HFv1 TruthfulQA,34.26,,hf_open_llm_v1_240829_frozen.csv falcon_7b,HFv1 Winogrande,72.38,,hf_open_llm_v1_240829_frozen.csv falcon_7b_3epoch_norobots,HF OpenLLM v1,43.65,,hf_open_llm_v1_240829_frozen.csv falcon_7b_3epoch_norobots,HFv1 ARC,47.61,,hf_open_llm_v1_240829_frozen.csv falcon_7b_3epoch_norobots,HFv1 GSM8K,1.52,,hf_open_llm_v1_240829_frozen.csv falcon_7b_3epoch_norobots,HFv1 HellaSwag,77.24,,hf_open_llm_v1_240829_frozen.csv falcon_7b_3epoch_norobots,HFv1 MMLU,29.73,,hf_open_llm_v1_240829_frozen.csv falcon_7b_3epoch_norobots,HFv1 TruthfulQA,36.27,,hf_open_llm_v1_240829_frozen.csv falcon_7b_3epoch_norobots,HFv1 Winogrande,69.53,,hf_open_llm_v1_240829_frozen.csv falcon_7b_instruct,HF OpenLLM v1,43.26,,hf_open_llm_v1_240829_frozen.csv falcon_7b_instruct,HFv1 ARC,46.16,,hf_open_llm_v1_240829_frozen.csv falcon_7b_instruct,HFv1 GSM8K,4.7,,hf_open_llm_v1_240829_frozen.csv falcon_7b_instruct,HFv1 HellaSwag,70.85,,hf_open_llm_v1_240829_frozen.csv falcon_7b_instruct,HFv1 MMLU,25.84,,hf_open_llm_v1_240829_frozen.csv falcon_7b_instruct,HFv1 TruthfulQA,44.08,,hf_open_llm_v1_240829_frozen.csv falcon_7b_instruct,HFv1 Winogrande,67.96,,hf_open_llm_v1_240829_frozen.csv falcon_7b_norobots,HF OpenLLM v1,44.46,,hf_open_llm_v1_240829_frozen.csv falcon_7b_norobots,HFv1 
ARC,47.87,,hf_open_llm_v1_240829_frozen.csv falcon_7b_norobots,HFv1 GSM8K,4.47,,hf_open_llm_v1_240829_frozen.csv falcon_7b_norobots,HFv1 HellaSwag,77.92,,hf_open_llm_v1_240829_frozen.csv falcon_7b_norobots,HFv1 MMLU,27.94,,hf_open_llm_v1_240829_frozen.csv falcon_7b_norobots,HFv1 TruthfulQA,36.81,,hf_open_llm_v1_240829_frozen.csv falcon_7b_norobots,HFv1 Winogrande,71.74,,hf_open_llm_v1_240829_frozen.csv falcon_rw_1b,HF OpenLLM v1,37.07,,hf_open_llm_v1_240829_frozen.csv falcon_rw_1b,HFv1 ARC,35.07,,hf_open_llm_v1_240829_frozen.csv falcon_rw_1b,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv falcon_rw_1b,HFv1 HellaSwag,63.56,,hf_open_llm_v1_240829_frozen.csv falcon_rw_1b,HFv1 MMLU,25.28,,hf_open_llm_v1_240829_frozen.csv falcon_rw_1b,HFv1 TruthfulQA,35.96,,hf_open_llm_v1_240829_frozen.csv falcon_rw_1b,HFv1 Winogrande,62.04,,hf_open_llm_v1_240829_frozen.csv falcon_rw_1b_chat,HF OpenLLM v1,37.37,,hf_open_llm_v1_240829_frozen.csv falcon_rw_1b_chat,HFv1 ARC,35.58,,hf_open_llm_v1_240829_frozen.csv falcon_rw_1b_chat,HFv1 GSM8K,1.67,,hf_open_llm_v1_240829_frozen.csv falcon_rw_1b_chat,HFv1 HellaSwag,61.12,,hf_open_llm_v1_240829_frozen.csv falcon_rw_1b_chat,HFv1 MMLU,24.51,,hf_open_llm_v1_240829_frozen.csv falcon_rw_1b_chat,HFv1 TruthfulQA,39.62,,hf_open_llm_v1_240829_frozen.csv falcon_rw_1b_chat,HFv1 Winogrande,61.72,,hf_open_llm_v1_240829_frozen.csv falcon_rw_1b_instruct_openorca,HF OpenLLM v1,37.63,,hf_open_llm_v1_240829_frozen.csv falcon_rw_1b_instruct_openorca,HFv1 ARC,34.56,,hf_open_llm_v1_240829_frozen.csv falcon_rw_1b_instruct_openorca,HFv1 GSM8K,3.41,,hf_open_llm_v1_240829_frozen.csv falcon_rw_1b_instruct_openorca,HFv1 HellaSwag,60.93,,hf_open_llm_v1_240829_frozen.csv falcon_rw_1b_instruct_openorca,HFv1 MMLU,28.77,,hf_open_llm_v1_240829_frozen.csv falcon_rw_1b_instruct_openorca,HFv1 TruthfulQA,37.42,,hf_open_llm_v1_240829_frozen.csv falcon_rw_1b_instruct_openorca,HFv1 Winogrande,60.69,,hf_open_llm_v1_240829_frozen.csv faro_yi_34b,HF OpenLLM v1,71.88,,hf_open_llm_v1_240829_frozen.csv faro_yi_34b,HFv1 ARC,66.55,,hf_open_llm_v1_240829_frozen.csv faro_yi_34b,HFv1 GSM8K,65.28,,hf_open_llm_v1_240829_frozen.csv faro_yi_34b,HFv1 HellaSwag,83.53,,hf_open_llm_v1_240829_frozen.csv faro_yi_34b,HFv1 MMLU,76.6,,hf_open_llm_v1_240829_frozen.csv faro_yi_34b,HFv1 TruthfulQA,55.64,,hf_open_llm_v1_240829_frozen.csv faro_yi_34b,HFv1 Winogrande,83.66,,hf_open_llm_v1_240829_frozen.csv faro_yi_34b_200k,HF OpenLLM v1,71.88,,hf_open_llm_v1_240829_frozen.csv faro_yi_34b_200k,HFv1 ARC,66.55,,hf_open_llm_v1_240829_frozen.csv faro_yi_34b_200k,HFv1 GSM8K,65.28,,hf_open_llm_v1_240829_frozen.csv faro_yi_34b_200k,HFv1 HellaSwag,83.53,,hf_open_llm_v1_240829_frozen.csv faro_yi_34b_200k,HFv1 MMLU,76.6,,hf_open_llm_v1_240829_frozen.csv faro_yi_34b_200k,HFv1 TruthfulQA,55.64,,hf_open_llm_v1_240829_frozen.csv faro_yi_34b_200k,HFv1 Winogrande,83.66,,hf_open_llm_v1_240829_frozen.csv faro_yi_9b,HF OpenLLM v1,66.37,,hf_open_llm_v1_240829_frozen.csv faro_yi_9b,HFv1 ARC,61.26,,hf_open_llm_v1_240829_frozen.csv faro_yi_9b,HFv1 GSM8K,63.0,,hf_open_llm_v1_240829_frozen.csv faro_yi_9b,HFv1 HellaSwag,76.95,,hf_open_llm_v1_240829_frozen.csv faro_yi_9b,HFv1 MMLU,70.77,,hf_open_llm_v1_240829_frozen.csv faro_yi_9b,HFv1 TruthfulQA,50.17,,hf_open_llm_v1_240829_frozen.csv faro_yi_9b,HFv1 Winogrande,76.09,,hf_open_llm_v1_240829_frozen.csv faro_yi_9b_200k,HF OpenLLM v1,66.37,,hf_open_llm_v1_240829_frozen.csv faro_yi_9b_200k,HFv1 ARC,61.26,,hf_open_llm_v1_240829_frozen.csv faro_yi_9b_200k,HFv1 GSM8K,63.0,,hf_open_llm_v1_240829_frozen.csv 
faro_yi_9b_200k,HFv1 HellaSwag,76.95,,hf_open_llm_v1_240829_frozen.csv faro_yi_9b_200k,HFv1 MMLU,70.77,,hf_open_llm_v1_240829_frozen.csv faro_yi_9b_200k,HFv1 TruthfulQA,50.17,,hf_open_llm_v1_240829_frozen.csv faro_yi_9b_200k,HFv1 Winogrande,76.09,,hf_open_llm_v1_240829_frozen.csv faro_yi_9b_dpo,HF OpenLLM v1,68.77,,hf_open_llm_v1_240829_frozen.csv faro_yi_9b_dpo,HFv1 ARC,64.16,,hf_open_llm_v1_240829_frozen.csv faro_yi_9b_dpo,HFv1 GSM8K,64.75,,hf_open_llm_v1_240829_frozen.csv faro_yi_9b_dpo,HFv1 HellaSwag,78.92,,hf_open_llm_v1_240829_frozen.csv faro_yi_9b_dpo,HFv1 MMLU,70.74,,hf_open_llm_v1_240829_frozen.csv faro_yi_9b_dpo,HFv1 TruthfulQA,56.25,,hf_open_llm_v1_240829_frozen.csv faro_yi_9b_dpo,HFv1 Winogrande,77.82,,hf_open_llm_v1_240829_frozen.csv fasciculus_arcuatus_7b_slerp,HF OpenLLM v1,76.07,,hf_open_llm_v1_240829_frozen.csv fasciculus_arcuatus_7b_slerp,HFv1 ARC,73.55,,hf_open_llm_v1_240829_frozen.csv fasciculus_arcuatus_7b_slerp,HFv1 GSM8K,71.04,,hf_open_llm_v1_240829_frozen.csv fasciculus_arcuatus_7b_slerp,HFv1 HellaSwag,88.95,,hf_open_llm_v1_240829_frozen.csv fasciculus_arcuatus_7b_slerp,HFv1 MMLU,64.65,,hf_open_llm_v1_240829_frozen.csv fasciculus_arcuatus_7b_slerp,HFv1 TruthfulQA,72.53,,hf_open_llm_v1_240829_frozen.csv fasciculus_arcuatus_7b_slerp,HFv1 Winogrande,85.71,,hf_open_llm_v1_240829_frozen.csv fbopt_350m_8bit,HF OpenLLM v1,30.21,,hf_open_llm_v1_240829_frozen.csv fbopt_350m_8bit,HFv1 ARC,23.55,,hf_open_llm_v1_240829_frozen.csv fbopt_350m_8bit,HFv1 GSM8K,1.29,,hf_open_llm_v1_240829_frozen.csv fbopt_350m_8bit,HFv1 HellaSwag,36.6,,hf_open_llm_v1_240829_frozen.csv fbopt_350m_8bit,HFv1 MMLU,26.22,,hf_open_llm_v1_240829_frozen.csv fbopt_350m_8bit,HFv1 TruthfulQA,40.97,,hf_open_llm_v1_240829_frozen.csv fbopt_350m_8bit,HFv1 Winogrande,52.64,,hf_open_llm_v1_240829_frozen.csv felix_8b,HF OpenLLM v1,67.1,,hf_open_llm_v1_240829_frozen.csv felix_8b,HFv1 ARC,65.02,,hf_open_llm_v1_240829_frozen.csv felix_8b,HFv1 GSM8K,51.78,,hf_open_llm_v1_240829_frozen.csv felix_8b,HFv1 HellaSwag,84.61,,hf_open_llm_v1_240829_frozen.csv felix_8b,HFv1 MMLU,61.05,,hf_open_llm_v1_240829_frozen.csv felix_8b,HFv1 TruthfulQA,64.23,,hf_open_llm_v1_240829_frozen.csv felix_8b,HFv1 Winogrande,75.93,,hf_open_llm_v1_240829_frozen.csv ferret_7b,HF OpenLLM v1,53.93,,hf_open_llm_v1_240829_frozen.csv ferret_7b,HFv1 ARC,62.29,,hf_open_llm_v1_240829_frozen.csv ferret_7b,HFv1 GSM8K,2.05,,hf_open_llm_v1_240829_frozen.csv ferret_7b,HFv1 HellaSwag,81.33,,hf_open_llm_v1_240829_frozen.csv ferret_7b,HFv1 MMLU,60.27,,hf_open_llm_v1_240829_frozen.csv ferret_7b,HFv1 TruthfulQA,40.01,,hf_open_llm_v1_240829_frozen.csv ferret_7b,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv fettuccinelake_dpo_7b_slerp,HF OpenLLM v1,69.09,,hf_open_llm_v1_240829_frozen.csv fettuccinelake_dpo_7b_slerp,HFv1 ARC,67.92,,hf_open_llm_v1_240829_frozen.csv fettuccinelake_dpo_7b_slerp,HFv1 GSM8K,47.76,,hf_open_llm_v1_240829_frozen.csv fettuccinelake_dpo_7b_slerp,HFv1 HellaSwag,86.37,,hf_open_llm_v1_240829_frozen.csv fettuccinelake_dpo_7b_slerp,HFv1 MMLU,63.24,,hf_open_llm_v1_240829_frozen.csv fettuccinelake_dpo_7b_slerp,HFv1 TruthfulQA,68.64,,hf_open_llm_v1_240829_frozen.csv fettuccinelake_dpo_7b_slerp,HFv1 Winogrande,80.58,,hf_open_llm_v1_240829_frozen.csv fialka_13b_v3,HF OpenLLM v1,34.58,,hf_open_llm_v1_240829_frozen.csv fialka_13b_v3,HFv1 ARC,30.97,,hf_open_llm_v1_240829_frozen.csv fialka_13b_v3,HFv1 GSM8K,1.29,,hf_open_llm_v1_240829_frozen.csv fialka_13b_v3,HFv1 HellaSwag,48.83,,hf_open_llm_v1_240829_frozen.csv fialka_13b_v3,HFv1 
MMLU,26.36,,hf_open_llm_v1_240829_frozen.csv fialka_13b_v3,HFv1 TruthfulQA,40.58,,hf_open_llm_v1_240829_frozen.csv fialka_13b_v3,HFv1 Winogrande,59.43,,hf_open_llm_v1_240829_frozen.csv fialka_13b_v3_1,HF OpenLLM v1,34.11,,hf_open_llm_v1_240829_frozen.csv fialka_13b_v3_1,HFv1 ARC,29.95,,hf_open_llm_v1_240829_frozen.csv fialka_13b_v3_1,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv fialka_13b_v3_1,HFv1 HellaSwag,47.28,,hf_open_llm_v1_240829_frozen.csv fialka_13b_v3_1,HFv1 MMLU,25.41,,hf_open_llm_v1_240829_frozen.csv fialka_13b_v3_1,HFv1 TruthfulQA,43.03,,hf_open_llm_v1_240829_frozen.csv fialka_13b_v3_1,HFv1 Winogrande,58.48,,hf_open_llm_v1_240829_frozen.csv fialka_13b_v4,HF OpenLLM v1,34.18,,hf_open_llm_v1_240829_frozen.csv fialka_13b_v4,HFv1 ARC,29.69,,hf_open_llm_v1_240829_frozen.csv fialka_13b_v4,HFv1 GSM8K,0.38,,hf_open_llm_v1_240829_frozen.csv fialka_13b_v4,HFv1 HellaSwag,47.37,,hf_open_llm_v1_240829_frozen.csv fialka_13b_v4,HFv1 MMLU,25.09,,hf_open_llm_v1_240829_frozen.csv fialka_13b_v4,HFv1 TruthfulQA,43.65,,hf_open_llm_v1_240829_frozen.csv fialka_13b_v4,HFv1 Winogrande,58.88,,hf_open_llm_v1_240829_frozen.csv fialka_7b_v3,HF OpenLLM v1,46.4,,hf_open_llm_v1_240829_frozen.csv fialka_7b_v3,HFv1 ARC,48.55,,hf_open_llm_v1_240829_frozen.csv fialka_7b_v3,HFv1 GSM8K,1.52,,hf_open_llm_v1_240829_frozen.csv fialka_7b_v3,HFv1 HellaSwag,71.05,,hf_open_llm_v1_240829_frozen.csv fialka_7b_v3,HFv1 MMLU,43.06,,hf_open_llm_v1_240829_frozen.csv fialka_7b_v3,HFv1 TruthfulQA,44.79,,hf_open_llm_v1_240829_frozen.csv fialka_7b_v3,HFv1 Winogrande,69.46,,hf_open_llm_v1_240829_frozen.csv fietje_2b,HF OpenLLM v1,51.59,,hf_open_llm_v1_240829_frozen.csv fietje_2b,HFv1 ARC,53.5,,hf_open_llm_v1_240829_frozen.csv fietje_2b,HFv1 GSM8K,27.9,,hf_open_llm_v1_240829_frozen.csv fietje_2b,HFv1 HellaSwag,67.11,,hf_open_llm_v1_240829_frozen.csv fietje_2b,HFv1 MMLU,49.3,,hf_open_llm_v1_240829_frozen.csv fietje_2b,HFv1 TruthfulQA,39.93,,hf_open_llm_v1_240829_frozen.csv fietje_2b,HFv1 Winogrande,71.82,,hf_open_llm_v1_240829_frozen.csv fietje_2b_chat,HF OpenLLM v1,48.75,,hf_open_llm_v1_240829_frozen.csv fietje_2b_chat,HFv1 ARC,54.01,,hf_open_llm_v1_240829_frozen.csv fietje_2b_chat,HFv1 GSM8K,6.14,,hf_open_llm_v1_240829_frozen.csv fietje_2b_chat,HFv1 HellaSwag,68.92,,hf_open_llm_v1_240829_frozen.csv fietje_2b_chat,HFv1 MMLU,49.92,,hf_open_llm_v1_240829_frozen.csv fietje_2b_chat,HFv1 TruthfulQA,41.94,,hf_open_llm_v1_240829_frozen.csv fietje_2b_chat,HFv1 Winogrande,71.59,,hf_open_llm_v1_240829_frozen.csv fietje_2b_instruct,HF OpenLLM v1,50.3,,hf_open_llm_v1_240829_frozen.csv fietje_2b_instruct,HFv1 ARC,53.41,,hf_open_llm_v1_240829_frozen.csv fietje_2b_instruct,HFv1 GSM8K,14.71,,hf_open_llm_v1_240829_frozen.csv fietje_2b_instruct,HFv1 HellaSwag,68.08,,hf_open_llm_v1_240829_frozen.csv fietje_2b_instruct,HFv1 MMLU,49.74,,hf_open_llm_v1_240829_frozen.csv fietje_2b_instruct,HFv1 TruthfulQA,43.47,,hf_open_llm_v1_240829_frozen.csv fietje_2b_instruct,HFv1 Winogrande,72.38,,hf_open_llm_v1_240829_frozen.csv flan_llama_7b_2_llama2_7b_flash_868_full_model,HF OpenLLM v1,49.64,,hf_open_llm_v1_240829_frozen.csv flan_llama_7b_2_llama2_7b_flash_868_full_model,HFv1 ARC,52.47,,hf_open_llm_v1_240829_frozen.csv flan_llama_7b_2_llama2_7b_flash_868_full_model,HFv1 GSM8K,6.82,,hf_open_llm_v1_240829_frozen.csv flan_llama_7b_2_llama2_7b_flash_868_full_model,HFv1 HellaSwag,79.08,,hf_open_llm_v1_240829_frozen.csv flan_llama_7b_2_llama2_7b_flash_868_full_model,HFv1 MMLU,47.58,,hf_open_llm_v1_240829_frozen.csv 
flan_llama_7b_2_llama2_7b_flash_868_full_model,HFv1 TruthfulQA,37.14,,hf_open_llm_v1_240829_frozen.csv flan_llama_7b_2_llama2_7b_flash_868_full_model,HFv1 Winogrande,74.74,,hf_open_llm_v1_240829_frozen.csv flor_1_3b_xat,HF OpenLLM v1,32.27,,hf_open_llm_v1_240829_frozen.csv flor_1_3b_xat,HFv1 ARC,26.79,,hf_open_llm_v1_240829_frozen.csv flor_1_3b_xat,HFv1 GSM8K,0.76,,hf_open_llm_v1_240829_frozen.csv flor_1_3b_xat,HFv1 HellaSwag,41.63,,hf_open_llm_v1_240829_frozen.csv flor_1_3b_xat,HFv1 MMLU,26.65,,hf_open_llm_v1_240829_frozen.csv flor_1_3b_xat,HFv1 TruthfulQA,44.38,,hf_open_llm_v1_240829_frozen.csv flor_1_3b_xat,HFv1 Winogrande,53.43,,hf_open_llm_v1_240829_frozen.csv flora_dpo_7b,HF OpenLLM v1,74.26,,hf_open_llm_v1_240829_frozen.csv flora_dpo_7b,HFv1 ARC,71.76,,hf_open_llm_v1_240829_frozen.csv flora_dpo_7b,HFv1 GSM8K,65.81,,hf_open_llm_v1_240829_frozen.csv flora_dpo_7b,HFv1 HellaSwag,88.28,,hf_open_llm_v1_240829_frozen.csv flora_dpo_7b,HFv1 MMLU,64.13,,hf_open_llm_v1_240829_frozen.csv flora_dpo_7b,HFv1 TruthfulQA,71.08,,hf_open_llm_v1_240829_frozen.csv flora_dpo_7b,HFv1 Winogrande,84.53,,hf_open_llm_v1_240829_frozen.csv flyingllama_v2,HF OpenLLM v1,30.19,,hf_open_llm_v1_240829_frozen.csv flyingllama_v2,HFv1 ARC,24.74,,hf_open_llm_v1_240829_frozen.csv flyingllama_v2,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv flyingllama_v2,HFv1 HellaSwag,38.44,,hf_open_llm_v1_240829_frozen.csv flyingllama_v2,HFv1 MMLU,26.37,,hf_open_llm_v1_240829_frozen.csv flyingllama_v2,HFv1 TruthfulQA,41.3,,hf_open_llm_v1_240829_frozen.csv flyingllama_v2,HFv1 Winogrande,50.28,,hf_open_llm_v1_240829_frozen.csv franken_solar_18b_v1_0,HF OpenLLM v1,67.03,,hf_open_llm_v1_240829_frozen.csv franken_solar_18b_v1_0,HFv1 ARC,65.53,,hf_open_llm_v1_240829_frozen.csv franken_solar_18b_v1_0,HFv1 GSM8K,45.79,,hf_open_llm_v1_240829_frozen.csv franken_solar_18b_v1_0,HFv1 HellaSwag,86.45,,hf_open_llm_v1_240829_frozen.csv franken_solar_18b_v1_0,HFv1 MMLU,63.72,,hf_open_llm_v1_240829_frozen.csv franken_solar_18b_v1_0,HFv1 TruthfulQA,62.14,,hf_open_llm_v1_240829_frozen.csv franken_solar_18b_v1_0,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv frankenmonarch_7b,HF OpenLLM v1,71.67,,hf_open_llm_v1_240829_frozen.csv frankenmonarch_7b,HFv1 ARC,71.59,,hf_open_llm_v1_240829_frozen.csv frankenmonarch_7b,HFv1 GSM8K,48.67,,hf_open_llm_v1_240829_frozen.csv frankenmonarch_7b,HFv1 HellaSwag,88.59,,hf_open_llm_v1_240829_frozen.csv frankenmonarch_7b,HFv1 MMLU,63.93,,hf_open_llm_v1_240829_frozen.csv frankenmonarch_7b,HFv1 TruthfulQA,73.69,,hf_open_llm_v1_240829_frozen.csv frankenmonarch_7b,HFv1 Winogrande,83.58,,hf_open_llm_v1_240829_frozen.csv free_llama3_dpo_v0_2,HF OpenLLM v1,62.69,,hf_open_llm_v1_240829_frozen.csv free_llama3_dpo_v0_2,HFv1 ARC,59.9,,hf_open_llm_v1_240829_frozen.csv free_llama3_dpo_v0_2,HFv1 GSM8K,44.66,,hf_open_llm_v1_240829_frozen.csv free_llama3_dpo_v0_2,HFv1 HellaSwag,81.88,,hf_open_llm_v1_240829_frozen.csv free_llama3_dpo_v0_2,HFv1 MMLU,66.59,,hf_open_llm_v1_240829_frozen.csv free_llama3_dpo_v0_2,HFv1 TruthfulQA,45.83,,hf_open_llm_v1_240829_frozen.csv free_llama3_dpo_v0_2,HFv1 Winogrande,77.27,,hf_open_llm_v1_240829_frozen.csv freeze_kosolar_10_7b_v0_2_1_4_dedup,HF OpenLLM v1,60.06,,hf_open_llm_v1_240829_frozen.csv freeze_kosolar_10_7b_v0_2_1_4_dedup,HFv1 ARC,58.45,,hf_open_llm_v1_240829_frozen.csv freeze_kosolar_10_7b_v0_2_1_4_dedup,HFv1 GSM8K,32.22,,hf_open_llm_v1_240829_frozen.csv freeze_kosolar_10_7b_v0_2_1_4_dedup,HFv1 HellaSwag,81.26,,hf_open_llm_v1_240829_frozen.csv freeze_kosolar_10_7b_v0_2_1_4_dedup,HFv1 
MMLU,64.83,,hf_open_llm_v1_240829_frozen.csv freeze_kosolar_10_7b_v0_2_1_4_dedup,HFv1 TruthfulQA,44.5,,hf_open_llm_v1_240829_frozen.csv freeze_kosolar_10_7b_v0_2_1_4_dedup,HFv1 Winogrande,79.08,,hf_open_llm_v1_240829_frozen.csv fsfairx_zephyr_chat_v0_1,HF OpenLLM v1,61.2,,hf_open_llm_v1_240829_frozen.csv fsfairx_zephyr_chat_v0_1,HFv1 ARC,63.31,,hf_open_llm_v1_240829_frozen.csv fsfairx_zephyr_chat_v0_1,HFv1 GSM8K,27.22,,hf_open_llm_v1_240829_frozen.csv fsfairx_zephyr_chat_v0_1,HFv1 HellaSwag,84.42,,hf_open_llm_v1_240829_frozen.csv fsfairx_zephyr_chat_v0_1,HFv1 MMLU,61.21,,hf_open_llm_v1_240829_frozen.csv fsfairx_zephyr_chat_v0_1,HFv1 TruthfulQA,53.56,,hf_open_llm_v1_240829_frozen.csv fsfairx_zephyr_chat_v0_1,HFv1 Winogrande,77.51,,hf_open_llm_v1_240829_frozen.csv fusellm_7b,HF OpenLLM v1,51.07,,hf_open_llm_v1_240829_frozen.csv fusellm_7b,HFv1 ARC,53.24,,hf_open_llm_v1_240829_frozen.csv fusellm_7b,HFv1 GSM8K,14.33,,hf_open_llm_v1_240829_frozen.csv fusellm_7b,HFv1 HellaSwag,78.72,,hf_open_llm_v1_240829_frozen.csv fusellm_7b,HFv1 MMLU,47.93,,hf_open_llm_v1_240829_frozen.csv fusellm_7b,HFv1 TruthfulQA,38.17,,hf_open_llm_v1_240829_frozen.csv fusellm_7b,HFv1 Winogrande,74.03,,hf_open_llm_v1_240829_frozen.csv fusionnet_7bx2_moe_ko_dpo_adapter_attach,HF OpenLLM v1,76.09,,hf_open_llm_v1_240829_frozen.csv fusionnet_7bx2_moe_ko_dpo_adapter_attach,HFv1 ARC,73.89,,hf_open_llm_v1_240829_frozen.csv fusionnet_7bx2_moe_ko_dpo_adapter_attach,HFv1 GSM8K,69.83,,hf_open_llm_v1_240829_frozen.csv fusionnet_7bx2_moe_ko_dpo_adapter_attach,HFv1 HellaSwag,88.94,,hf_open_llm_v1_240829_frozen.csv fusionnet_7bx2_moe_ko_dpo_adapter_attach,HFv1 MMLU,65.03,,hf_open_llm_v1_240829_frozen.csv fusionnet_7bx2_moe_ko_dpo_adapter_attach,HFv1 TruthfulQA,71.24,,hf_open_llm_v1_240829_frozen.csv fusionnet_7bx2_moe_ko_dpo_adapter_attach,HFv1 Winogrande,87.61,,hf_open_llm_v1_240829_frozen.csv gaja_v1_00,HF OpenLLM v1,47.69,,hf_open_llm_v1_240829_frozen.csv gaja_v1_00,HFv1 ARC,52.82,,hf_open_llm_v1_240829_frozen.csv gaja_v1_00,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv gaja_v1_00,HFv1 HellaSwag,76.31,,hf_open_llm_v1_240829_frozen.csv gaja_v1_00,HFv1 MMLU,40.83,,hf_open_llm_v1_240829_frozen.csv gaja_v1_00,HFv1 TruthfulQA,44.64,,hf_open_llm_v1_240829_frozen.csv gaja_v1_00,HFv1 Winogrande,70.64,,hf_open_llm_v1_240829_frozen.csv gaja_v2_00_dpo,HF OpenLLM v1,46.91,,hf_open_llm_v1_240829_frozen.csv gaja_v2_00_dpo,HFv1 ARC,51.71,,hf_open_llm_v1_240829_frozen.csv gaja_v2_00_dpo,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv gaja_v2_00_dpo,HFv1 HellaSwag,75.87,,hf_open_llm_v1_240829_frozen.csv gaja_v2_00_dpo,HFv1 MMLU,40.79,,hf_open_llm_v1_240829_frozen.csv gaja_v2_00_dpo,HFv1 TruthfulQA,41.29,,hf_open_llm_v1_240829_frozen.csv gaja_v2_00_dpo,HFv1 Winogrande,71.59,,hf_open_llm_v1_240829_frozen.csv galpaca_30b_miniorca,HF OpenLLM v1,42.23,,hf_open_llm_v1_240829_frozen.csv galpaca_30b_miniorca,HFv1 ARC,48.89,,hf_open_llm_v1_240829_frozen.csv galpaca_30b_miniorca,HFv1 GSM8K,1.82,,hf_open_llm_v1_240829_frozen.csv galpaca_30b_miniorca,HFv1 HellaSwag,57.8,,hf_open_llm_v1_240829_frozen.csv galpaca_30b_miniorca,HFv1 MMLU,43.72,,hf_open_llm_v1_240829_frozen.csv galpaca_30b_miniorca,HFv1 TruthfulQA,41.1,,hf_open_llm_v1_240829_frozen.csv galpaca_30b_miniorca,HFv1 Winogrande,60.06,,hf_open_llm_v1_240829_frozen.csv garrulus,HF OpenLLM v1,75.16,,hf_open_llm_v1_240829_frozen.csv garrulus,HFv1 ARC,73.29,,hf_open_llm_v1_240829_frozen.csv garrulus,HFv1 GSM8K,64.52,,hf_open_llm_v1_240829_frozen.csv garrulus,HFv1 
HellaSwag,88.87,,hf_open_llm_v1_240829_frozen.csv garrulus,HFv1 MMLU,64.57,,hf_open_llm_v1_240829_frozen.csv garrulus,HFv1 TruthfulQA,68.23,,hf_open_llm_v1_240829_frozen.csv garrulus,HFv1 Winogrande,91.48,,hf_open_llm_v1_240829_frozen.csv geitje_7b_chat_v2,HF OpenLLM v1,50.79,,hf_open_llm_v1_240829_frozen.csv geitje_7b_chat_v2,HFv1 ARC,50.34,,hf_open_llm_v1_240829_frozen.csv geitje_7b_chat_v2,HFv1 GSM8K,16.22,,hf_open_llm_v1_240829_frozen.csv geitje_7b_chat_v2,HFv1 HellaSwag,74.13,,hf_open_llm_v1_240829_frozen.csv geitje_7b_chat_v2,HFv1 MMLU,49.0,,hf_open_llm_v1_240829_frozen.csv geitje_7b_chat_v2,HFv1 TruthfulQA,43.55,,hf_open_llm_v1_240829_frozen.csv geitje_7b_chat_v2,HFv1 Winogrande,71.51,,hf_open_llm_v1_240829_frozen.csv geitje_7b_ultra,HF OpenLLM v1,52.61,,hf_open_llm_v1_240829_frozen.csv geitje_7b_ultra,HFv1 ARC,45.48,,hf_open_llm_v1_240829_frozen.csv geitje_7b_ultra,HFv1 GSM8K,19.41,,hf_open_llm_v1_240829_frozen.csv geitje_7b_ultra,HFv1 HellaSwag,75.5,,hf_open_llm_v1_240829_frozen.csv geitje_7b_ultra,HFv1 MMLU,50.16,,hf_open_llm_v1_240829_frozen.csv geitje_7b_ultra,HFv1 TruthfulQA,53.36,,hf_open_llm_v1_240829_frozen.csv geitje_7b_ultra,HFv1 Winogrande,71.74,,hf_open_llm_v1_240829_frozen.csv gem_14b_instruct,HF OpenLLM v1,60.9,,hf_open_llm_v1_240829_frozen.csv gem_14b_instruct,HFv1 ARC,54.61,,hf_open_llm_v1_240829_frozen.csv gem_14b_instruct,HFv1 GSM8K,46.93,,hf_open_llm_v1_240829_frozen.csv gem_14b_instruct,HFv1 HellaSwag,77.69,,hf_open_llm_v1_240829_frozen.csv gem_14b_instruct,HFv1 MMLU,66.54,,hf_open_llm_v1_240829_frozen.csv gem_14b_instruct,HFv1 TruthfulQA,45.38,,hf_open_llm_v1_240829_frozen.csv gem_14b_instruct,HFv1 Winogrande,74.27,,hf_open_llm_v1_240829_frozen.csv gemma_1_1_7b_it,HF OpenLLM v1,60.09,,hf_open_llm_v1_240829_frozen.csv gemma_1_1_7b_it,HFv1 ARC,60.07,,hf_open_llm_v1_240829_frozen.csv gemma_1_1_7b_it,HFv1 GSM8K,42.99,,hf_open_llm_v1_240829_frozen.csv gemma_1_1_7b_it,HFv1 HellaSwag,76.21,,hf_open_llm_v1_240829_frozen.csv gemma_1_1_7b_it,HFv1 MMLU,60.92,,hf_open_llm_v1_240829_frozen.csv gemma_1_1_7b_it,HFv1 TruthfulQA,50.74,,hf_open_llm_v1_240829_frozen.csv gemma_1_1_7b_it,HFv1 Winogrande,69.93,,hf_open_llm_v1_240829_frozen.csv gemma_2b,HF OpenLLM v1,46.51,,hf_open_llm_v1_240829_frozen.csv gemma_2b,HFv1 ARC,48.46,,hf_open_llm_v1_240829_frozen.csv gemma_2b,HFv1 GSM8K,17.36,,hf_open_llm_v1_240829_frozen.csv gemma_2b,HFv1 HellaSwag,71.77,,hf_open_llm_v1_240829_frozen.csv gemma_2b,HFv1 MMLU,41.77,,hf_open_llm_v1_240829_frozen.csv gemma_2b,HFv1 TruthfulQA,33.13,,hf_open_llm_v1_240829_frozen.csv gemma_2b,HFv1 Winogrande,66.77,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_nlai_p1,HF OpenLLM v1,42.78,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_nlai_p1,HFv1 ARC,43.94,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_nlai_p1,HFv1 GSM8K,5.46,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_nlai_p1,HFv1 HellaSwag,62.71,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_nlai_p1,HFv1 MMLU,37.68,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_nlai_p1,HFv1 TruthfulQA,45.85,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_nlai_p1,HFv1 Winogrande,61.01,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_nlai_v0,HF OpenLLM v1,42.8,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_nlai_v0,HFv1 ARC,44.03,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_nlai_v0,HFv1 GSM8K,5.38,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_nlai_v0,HFv1 HellaSwag,62.67,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_nlai_v0,HFv1 MMLU,37.58,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_nlai_v0,HFv1 
TruthfulQA,45.8,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_nlai_v0,HFv1 Winogrande,61.33,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_sp_test,HF OpenLLM v1,42.79,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_sp_test,HFv1 ARC,44.03,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_sp_test,HFv1 GSM8K,5.31,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_sp_test,HFv1 HellaSwag,62.82,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_sp_test,HFv1 MMLU,37.67,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_sp_test,HFv1 TruthfulQA,45.77,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_sp_test,HFv1 Winogrande,61.17,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_sp_test1,HF OpenLLM v1,42.79,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_sp_test1,HFv1 ARC,44.03,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_sp_test1,HFv1 GSM8K,5.31,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_sp_test1,HFv1 HellaSwag,62.82,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_sp_test1,HFv1 MMLU,37.67,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_sp_test1,HFv1 TruthfulQA,45.77,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_sp_test1,HFv1 Winogrande,61.17,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_sp_test_openherms_step500,HF OpenLLM v1,42.79,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_sp_test_openherms_step500,HFv1 ARC,44.03,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_sp_test_openherms_step500,HFv1 GSM8K,5.31,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_sp_test_openherms_step500,HFv1 HellaSwag,62.82,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_sp_test_openherms_step500,HFv1 MMLU,37.67,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_sp_test_openherms_step500,HFv1 TruthfulQA,45.77,,hf_open_llm_v1_240829_frozen.csv gemma_2b_it_sp_test_openherms_step500,HFv1 Winogrande,61.17,,hf_open_llm_v1_240829_frozen.csv gemma_2b_nlaf_v0,HF OpenLLM v1,42.83,,hf_open_llm_v1_240829_frozen.csv gemma_2b_nlaf_v0,HFv1 ARC,43.77,,hf_open_llm_v1_240829_frozen.csv gemma_2b_nlaf_v0,HFv1 GSM8K,5.91,,hf_open_llm_v1_240829_frozen.csv gemma_2b_nlaf_v0,HFv1 HellaSwag,62.73,,hf_open_llm_v1_240829_frozen.csv gemma_2b_nlaf_v0,HFv1 MMLU,37.72,,hf_open_llm_v1_240829_frozen.csv gemma_2b_nlaf_v0,HFv1 TruthfulQA,45.85,,hf_open_llm_v1_240829_frozen.csv gemma_2b_nlaf_v0,HFv1 Winogrande,61.01,,hf_open_llm_v1_240829_frozen.csv gemma_2b_openhermes,HF OpenLLM v1,42.78,,hf_open_llm_v1_240829_frozen.csv gemma_2b_openhermes,HFv1 ARC,43.94,,hf_open_llm_v1_240829_frozen.csv gemma_2b_openhermes,HFv1 GSM8K,5.61,,hf_open_llm_v1_240829_frozen.csv gemma_2b_openhermes,HFv1 HellaSwag,62.74,,hf_open_llm_v1_240829_frozen.csv gemma_2b_openhermes,HFv1 MMLU,37.62,,hf_open_llm_v1_240829_frozen.csv gemma_2b_openhermes,HFv1 TruthfulQA,45.83,,hf_open_llm_v1_240829_frozen.csv gemma_2b_openhermes,HFv1 Winogrande,60.93,,hf_open_llm_v1_240829_frozen.csv gemma_2b_orpo,HF OpenLLM v1,47.35,,hf_open_llm_v1_240829_frozen.csv gemma_2b_orpo,HFv1 ARC,49.15,,hf_open_llm_v1_240829_frozen.csv gemma_2b_orpo,HFv1 GSM8K,13.87,,hf_open_llm_v1_240829_frozen.csv gemma_2b_orpo,HFv1 HellaSwag,73.72,,hf_open_llm_v1_240829_frozen.csv gemma_2b_orpo,HFv1 MMLU,38.52,,hf_open_llm_v1_240829_frozen.csv gemma_2b_orpo,HFv1 TruthfulQA,44.53,,hf_open_llm_v1_240829_frozen.csv gemma_2b_orpo,HFv1 Winogrande,64.33,,hf_open_llm_v1_240829_frozen.csv gemma_2b_sft_telugu,HF OpenLLM v1,41.25,,hf_open_llm_v1_240829_frozen.csv gemma_2b_sft_telugu,HFv1 ARC,41.38,,hf_open_llm_v1_240829_frozen.csv gemma_2b_sft_telugu,HFv1 GSM8K,4.02,,hf_open_llm_v1_240829_frozen.csv gemma_2b_sft_telugu,HFv1 HellaSwag,63.2,,hf_open_llm_v1_240829_frozen.csv 
gemma_2b_sft_telugu,HFv1 MMLU,31.94,,hf_open_llm_v1_240829_frozen.csv gemma_2b_sft_telugu,HFv1 TruthfulQA,46.95,,hf_open_llm_v1_240829_frozen.csv gemma_2b_sft_telugu,HFv1 Winogrande,59.98,,hf_open_llm_v1_240829_frozen.csv gemma_2b_tamil,HF OpenLLM v1,45.13,,hf_open_llm_v1_240829_frozen.csv gemma_2b_tamil,HFv1 ARC,47.44,,hf_open_llm_v1_240829_frozen.csv gemma_2b_tamil,HFv1 GSM8K,12.89,,hf_open_llm_v1_240829_frozen.csv gemma_2b_tamil,HFv1 HellaSwag,71.3,,hf_open_llm_v1_240829_frozen.csv gemma_2b_tamil,HFv1 MMLU,38.21,,hf_open_llm_v1_240829_frozen.csv gemma_2b_tamil,HFv1 TruthfulQA,34.93,,hf_open_llm_v1_240829_frozen.csv gemma_2b_tamil,HFv1 Winogrande,65.98,,hf_open_llm_v1_240829_frozen.csv gemma_2b_zephyr_dpo,HF OpenLLM v1,49.38,,hf_open_llm_v1_240829_frozen.csv gemma_2b_zephyr_dpo,HFv1 ARC,51.96,,hf_open_llm_v1_240829_frozen.csv gemma_2b_zephyr_dpo,HFv1 GSM8K,18.57,,hf_open_llm_v1_240829_frozen.csv gemma_2b_zephyr_dpo,HFv1 HellaSwag,73.33,,hf_open_llm_v1_240829_frozen.csv gemma_2b_zephyr_dpo,HFv1 MMLU,43.31,,hf_open_llm_v1_240829_frozen.csv gemma_2b_zephyr_dpo,HFv1 TruthfulQA,42.62,,hf_open_llm_v1_240829_frozen.csv gemma_2b_zephyr_dpo,HFv1 Winogrande,66.46,,hf_open_llm_v1_240829_frozen.csv gemma_7b,HF OpenLLM v1,63.75,,hf_open_llm_v1_240829_frozen.csv gemma_7b,HFv1 ARC,61.09,,hf_open_llm_v1_240829_frozen.csv gemma_7b,HFv1 GSM8K,50.87,,hf_open_llm_v1_240829_frozen.csv gemma_7b,HFv1 HellaSwag,82.2,,hf_open_llm_v1_240829_frozen.csv gemma_7b,HFv1 MMLU,64.56,,hf_open_llm_v1_240829_frozen.csv gemma_7b,HFv1 TruthfulQA,44.79,,hf_open_llm_v1_240829_frozen.csv gemma_7b,HFv1 Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv gemma_7b_open_platypus_commercial,HF OpenLLM v1,62.71,,hf_open_llm_v1_240829_frozen.csv gemma_7b_open_platypus_commercial,HFv1 ARC,62.8,,hf_open_llm_v1_240829_frozen.csv gemma_7b_open_platypus_commercial,HFv1 GSM8K,40.33,,hf_open_llm_v1_240829_frozen.csv gemma_7b_open_platypus_commercial,HFv1 HellaSwag,81.65,,hf_open_llm_v1_240829_frozen.csv gemma_7b_open_platypus_commercial,HFv1 MMLU,58.94,,hf_open_llm_v1_240829_frozen.csv gemma_7b_open_platypus_commercial,HFv1 TruthfulQA,53.54,,hf_open_llm_v1_240829_frozen.csv gemma_7b_open_platypus_commercial,HFv1 Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv gemma_7b_openhermes,HF OpenLLM v1,53.67,,hf_open_llm_v1_240829_frozen.csv gemma_7b_openhermes,HFv1 ARC,51.28,,hf_open_llm_v1_240829_frozen.csv gemma_7b_openhermes,HFv1 GSM8K,29.87,,hf_open_llm_v1_240829_frozen.csv gemma_7b_openhermes,HFv1 HellaSwag,71.93,,hf_open_llm_v1_240829_frozen.csv gemma_7b_openhermes,HFv1 MMLU,53.56,,hf_open_llm_v1_240829_frozen.csv gemma_7b_openhermes,HFv1 TruthfulQA,47.18,,hf_open_llm_v1_240829_frozen.csv gemma_7b_openhermes,HFv1 Winogrande,68.19,,hf_open_llm_v1_240829_frozen.csv gemma_ko_1_1_2b_it,HF OpenLLM v1,30.92,,hf_open_llm_v1_240829_frozen.csv gemma_ko_1_1_2b_it,HFv1 ARC,26.71,,hf_open_llm_v1_240829_frozen.csv gemma_ko_1_1_2b_it,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv gemma_ko_1_1_2b_it,HFv1 HellaSwag,36.2,,hf_open_llm_v1_240829_frozen.csv gemma_ko_1_1_2b_it,HFv1 MMLU,26.79,,hf_open_llm_v1_240829_frozen.csv gemma_ko_1_1_2b_it,HFv1 TruthfulQA,40.41,,hf_open_llm_v1_240829_frozen.csv gemma_ko_1_1_2b_it,HFv1 Winogrande,55.49,,hf_open_llm_v1_240829_frozen.csv gemma_nlaf_v1,HF OpenLLM v1,42.76,,hf_open_llm_v1_240829_frozen.csv gemma_nlaf_v1,HFv1 ARC,43.86,,hf_open_llm_v1_240829_frozen.csv gemma_nlaf_v1,HFv1 GSM8K,5.38,,hf_open_llm_v1_240829_frozen.csv gemma_nlaf_v1,HFv1 HellaSwag,62.7,,hf_open_llm_v1_240829_frozen.csv gemma_nlaf_v1,HFv1 
MMLU,37.66,,hf_open_llm_v1_240829_frozen.csv gemma_nlaf_v1,HFv1 TruthfulQA,45.86,,hf_open_llm_v1_240829_frozen.csv gemma_nlaf_v1,HFv1 Winogrande,61.09,,hf_open_llm_v1_240829_frozen.csv genai_nova_13b,HF OpenLLM v1,56.98,,hf_open_llm_v1_240829_frozen.csv genai_nova_13b,HFv1 ARC,62.29,,hf_open_llm_v1_240829_frozen.csv genai_nova_13b,HFv1 GSM8K,7.73,,hf_open_llm_v1_240829_frozen.csv genai_nova_13b,HFv1 HellaSwag,83.27,,hf_open_llm_v1_240829_frozen.csv genai_nova_13b,HFv1 MMLU,59.47,,hf_open_llm_v1_240829_frozen.csv genai_nova_13b,HFv1 TruthfulQA,51.79,,hf_open_llm_v1_240829_frozen.csv genai_nova_13b,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv genz_70b,HF OpenLLM v1,68.35,,hf_open_llm_v1_240829_frozen.csv genz_70b,HFv1 ARC,71.42,,hf_open_llm_v1_240829_frozen.csv genz_70b,HFv1 GSM8K,33.74,,hf_open_llm_v1_240829_frozen.csv genz_70b,HFv1 HellaSwag,87.99,,hf_open_llm_v1_240829_frozen.csv genz_70b,HFv1 MMLU,70.78,,hf_open_llm_v1_240829_frozen.csv genz_70b,HFv1 TruthfulQA,62.66,,hf_open_llm_v1_240829_frozen.csv genz_70b,HFv1 Winogrande,83.5,,hf_open_llm_v1_240829_frozen.csv ghost_7b_alpha,HF OpenLLM v1,57.65,,hf_open_llm_v1_240829_frozen.csv ghost_7b_alpha,HFv1 ARC,54.86,,hf_open_llm_v1_240829_frozen.csv ghost_7b_alpha,HFv1 GSM8K,47.69,,hf_open_llm_v1_240829_frozen.csv ghost_7b_alpha,HFv1 HellaSwag,76.1,,hf_open_llm_v1_240829_frozen.csv ghost_7b_alpha,HFv1 MMLU,50.71,,hf_open_llm_v1_240829_frozen.csv ghost_7b_alpha,HFv1 TruthfulQA,44.63,,hf_open_llm_v1_240829_frozen.csv ghost_7b_alpha,HFv1 Winogrande,71.9,,hf_open_llm_v1_240829_frozen.csv ghost_7b_v0_9_0,HF OpenLLM v1,56.89,,hf_open_llm_v1_240829_frozen.csv ghost_7b_v0_9_0,HFv1 ARC,53.07,,hf_open_llm_v1_240829_frozen.csv ghost_7b_v0_9_0,HFv1 GSM8K,33.74,,hf_open_llm_v1_240829_frozen.csv ghost_7b_v0_9_0,HFv1 HellaSwag,77.93,,hf_open_llm_v1_240829_frozen.csv ghost_7b_v0_9_0,HFv1 MMLU,55.09,,hf_open_llm_v1_240829_frozen.csv ghost_7b_v0_9_0,HFv1 TruthfulQA,47.79,,hf_open_llm_v1_240829_frozen.csv ghost_7b_v0_9_0,HFv1 Winogrande,73.72,,hf_open_llm_v1_240829_frozen.csv giftedconvo13bloranoecons,HF OpenLLM v1,53.35,,hf_open_llm_v1_240829_frozen.csv giftedconvo13bloranoecons,HFv1 ARC,59.39,,hf_open_llm_v1_240829_frozen.csv giftedconvo13bloranoecons,HFv1 GSM8K,7.81,,hf_open_llm_v1_240829_frozen.csv giftedconvo13bloranoecons,HFv1 HellaSwag,83.19,,hf_open_llm_v1_240829_frozen.csv giftedconvo13bloranoecons,HFv1 MMLU,55.15,,hf_open_llm_v1_240829_frozen.csv giftedconvo13bloranoecons,HFv1 TruthfulQA,40.56,,hf_open_llm_v1_240829_frozen.csv giftedconvo13bloranoecons,HFv1 Winogrande,74.03,,hf_open_llm_v1_240829_frozen.csv giftedconvo13bloranoeconse4,HF OpenLLM v1,53.74,,hf_open_llm_v1_240829_frozen.csv giftedconvo13bloranoeconse4,HFv1 ARC,59.9,,hf_open_llm_v1_240829_frozen.csv giftedconvo13bloranoeconse4,HFv1 GSM8K,7.81,,hf_open_llm_v1_240829_frozen.csv giftedconvo13bloranoeconse4,HFv1 HellaSwag,84.11,,hf_open_llm_v1_240829_frozen.csv giftedconvo13bloranoeconse4,HFv1 MMLU,54.67,,hf_open_llm_v1_240829_frozen.csv giftedconvo13bloranoeconse4,HFv1 TruthfulQA,41.94,,hf_open_llm_v1_240829_frozen.csv giftedconvo13bloranoeconse4,HFv1 Winogrande,74.03,,hf_open_llm_v1_240829_frozen.csv giraffe_13b_32k_v3,HF OpenLLM v1,57.24,,hf_open_llm_v1_240829_frozen.csv giraffe_13b_32k_v3,HFv1 ARC,59.04,,hf_open_llm_v1_240829_frozen.csv giraffe_13b_32k_v3,HFv1 GSM8K,26.16,,hf_open_llm_v1_240829_frozen.csv giraffe_13b_32k_v3,HFv1 HellaSwag,79.59,,hf_open_llm_v1_240829_frozen.csv giraffe_13b_32k_v3,HFv1 MMLU,55.01,,hf_open_llm_v1_240829_frozen.csv giraffe_13b_32k_v3,HFv1 
TruthfulQA,46.68,,hf_open_llm_v1_240829_frozen.csv giraffe_13b_32k_v3,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv giraffe_beta_13b_32k,HF OpenLLM v1,54.69,,hf_open_llm_v1_240829_frozen.csv giraffe_beta_13b_32k,HFv1 ARC,55.63,,hf_open_llm_v1_240829_frozen.csv giraffe_beta_13b_32k,HFv1 GSM8K,21.3,,hf_open_llm_v1_240829_frozen.csv giraffe_beta_13b_32k,HFv1 HellaSwag,80.42,,hf_open_llm_v1_240829_frozen.csv giraffe_beta_13b_32k,HFv1 MMLU,53.61,,hf_open_llm_v1_240829_frozen.csv giraffe_beta_13b_32k,HFv1 TruthfulQA,42.58,,hf_open_llm_v1_240829_frozen.csv giraffe_beta_13b_32k,HFv1 Winogrande,74.59,,hf_open_llm_v1_240829_frozen.csv go_bruins_v2_1,HF OpenLLM v1,74.5,,hf_open_llm_v1_240829_frozen.csv go_bruins_v2_1,HFv1 ARC,71.93,,hf_open_llm_v1_240829_frozen.csv go_bruins_v2_1,HFv1 GSM8K,70.43,,hf_open_llm_v1_240829_frozen.csv go_bruins_v2_1,HFv1 HellaSwag,88.33,,hf_open_llm_v1_240829_frozen.csv go_bruins_v2_1,HFv1 MMLU,65.0,,hf_open_llm_v1_240829_frozen.csv go_bruins_v2_1,HFv1 TruthfulQA,69.16,,hf_open_llm_v1_240829_frozen.csv go_bruins_v2_1,HFv1 Winogrande,82.16,,hf_open_llm_v1_240829_frozen.csv godzilla2_70b,HF OpenLLM v1,69.46,,hf_open_llm_v1_240829_frozen.csv godzilla2_70b,HFv1 ARC,71.42,,hf_open_llm_v1_240829_frozen.csv godzilla2_70b,HFv1 GSM8K,43.21,,hf_open_llm_v1_240829_frozen.csv godzilla2_70b,HFv1 HellaSwag,87.53,,hf_open_llm_v1_240829_frozen.csv godzilla2_70b,HFv1 MMLU,69.88,,hf_open_llm_v1_240829_frozen.csv godzilla2_70b,HFv1 TruthfulQA,61.54,,hf_open_llm_v1_240829_frozen.csv godzilla2_70b,HFv1 Winogrande,83.19,,hf_open_llm_v1_240829_frozen.csv goldenmaiden_7b_model_stock,HF OpenLLM v1,75.57,,hf_open_llm_v1_240829_frozen.csv goldenmaiden_7b_model_stock,HFv1 ARC,73.21,,hf_open_llm_v1_240829_frozen.csv goldenmaiden_7b_model_stock,HFv1 GSM8K,68.84,,hf_open_llm_v1_240829_frozen.csv goldenmaiden_7b_model_stock,HFv1 HellaSwag,88.71,,hf_open_llm_v1_240829_frozen.csv goldenmaiden_7b_model_stock,HFv1 MMLU,64.96,,hf_open_llm_v1_240829_frozen.csv goldenmaiden_7b_model_stock,HFv1 TruthfulQA,72.56,,hf_open_llm_v1_240829_frozen.csv goldenmaiden_7b_model_stock,HFv1 Winogrande,85.16,,hf_open_llm_v1_240829_frozen.csv gollie_7b,HF OpenLLM v1,37.48,,hf_open_llm_v1_240829_frozen.csv gollie_7b,HFv1 ARC,36.09,,hf_open_llm_v1_240829_frozen.csv gollie_7b,HFv1 GSM8K,3.26,,hf_open_llm_v1_240829_frozen.csv gollie_7b,HFv1 HellaSwag,57.93,,hf_open_llm_v1_240829_frozen.csv gollie_7b,HFv1 MMLU,29.38,,hf_open_llm_v1_240829_frozen.csv gollie_7b,HFv1 TruthfulQA,39.27,,hf_open_llm_v1_240829_frozen.csv gollie_7b,HFv1 Winogrande,58.96,,hf_open_llm_v1_240829_frozen.csv gonzo_chat_7b,HF OpenLLM v1,66.63,,hf_open_llm_v1_240829_frozen.csv gonzo_chat_7b,HFv1 ARC,65.02,,hf_open_llm_v1_240829_frozen.csv gonzo_chat_7b,HFv1 GSM8K,47.61,,hf_open_llm_v1_240829_frozen.csv gonzo_chat_7b,HFv1 HellaSwag,85.4,,hf_open_llm_v1_240829_frozen.csv gonzo_chat_7b,HFv1 MMLU,63.75,,hf_open_llm_v1_240829_frozen.csv gonzo_chat_7b,HFv1 TruthfulQA,60.23,,hf_open_llm_v1_240829_frozen.csv gonzo_chat_7b,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv gowizardlm,HF OpenLLM v1,47.64,,hf_open_llm_v1_240829_frozen.csv gowizardlm,HFv1 ARC,49.74,,hf_open_llm_v1_240829_frozen.csv gowizardlm,HFv1 GSM8K,3.94,,hf_open_llm_v1_240829_frozen.csv gowizardlm,HFv1 HellaSwag,71.9,,hf_open_llm_v1_240829_frozen.csv gowizardlm,HFv1 MMLU,42.96,,hf_open_llm_v1_240829_frozen.csv gowizardlm,HFv1 TruthfulQA,47.66,,hf_open_llm_v1_240829_frozen.csv gowizardlm,HFv1 Winogrande,69.61,,hf_open_llm_v1_240829_frozen.csv gpt2,HF OpenLLM 
v1,28.53,,hf_open_llm_v1_240829_frozen.csv gpt2,HFv1 ARC,22.01,,hf_open_llm_v1_240829_frozen.csv gpt2,HFv1 GSM8K,0.68,,hf_open_llm_v1_240829_frozen.csv gpt2,HFv1 HellaSwag,31.58,,hf_open_llm_v1_240829_frozen.csv gpt2,HFv1 MMLU,25.83,,hf_open_llm_v1_240829_frozen.csv gpt2,HFv1 TruthfulQA,41.15,,hf_open_llm_v1_240829_frozen.csv gpt2,HFv1 Winogrande,50.43,,hf_open_llm_v1_240829_frozen.csv gpt2_camel_physics_platypus,HF OpenLLM v1,28.41,,hf_open_llm_v1_240829_frozen.csv gpt2_camel_physics_platypus,HFv1 ARC,23.04,,hf_open_llm_v1_240829_frozen.csv gpt2_camel_physics_platypus,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv gpt2_camel_physics_platypus,HFv1 HellaSwag,31.32,,hf_open_llm_v1_240829_frozen.csv gpt2_camel_physics_platypus,HFv1 MMLU,26.91,,hf_open_llm_v1_240829_frozen.csv gpt2_camel_physics_platypus,HFv1 TruthfulQA,39.56,,hf_open_llm_v1_240829_frozen.csv gpt2_camel_physics_platypus,HFv1 Winogrande,49.64,,hf_open_llm_v1_240829_frozen.csv gpt2_chatbot,HF OpenLLM v1,33.91,,hf_open_llm_v1_240829_frozen.csv gpt2_chatbot,HFv1 ARC,29.69,,hf_open_llm_v1_240829_frozen.csv gpt2_chatbot,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv gpt2_chatbot,HFv1 HellaSwag,50.27,,hf_open_llm_v1_240829_frozen.csv gpt2_chatbot,HFv1 MMLU,26.42,,hf_open_llm_v1_240829_frozen.csv gpt2_chatbot,HFv1 TruthfulQA,40.38,,hf_open_llm_v1_240829_frozen.csv gpt2_chatbot,HFv1 Winogrande,56.67,,hf_open_llm_v1_240829_frozen.csv gpt2_dolly,HF OpenLLM v1,29.21,,hf_open_llm_v1_240829_frozen.csv gpt2_dolly,HFv1 ARC,22.7,,hf_open_llm_v1_240829_frozen.csv gpt2_dolly,HFv1 GSM8K,0.15,,hf_open_llm_v1_240829_frozen.csv gpt2_dolly,HFv1 HellaSwag,30.77,,hf_open_llm_v1_240829_frozen.csv gpt2_dolly,HFv1 MMLU,25.81,,hf_open_llm_v1_240829_frozen.csv gpt2_dolly,HFv1 TruthfulQA,44.97,,hf_open_llm_v1_240829_frozen.csv gpt2_dolly,HFv1 Winogrande,51.46,,hf_open_llm_v1_240829_frozen.csv gpt2_final,HF OpenLLM v1,28.7,,hf_open_llm_v1_240829_frozen.csv gpt2_final,HFv1 ARC,21.59,,hf_open_llm_v1_240829_frozen.csv gpt2_final,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv gpt2_final,HFv1 HellaSwag,26.02,,hf_open_llm_v1_240829_frozen.csv gpt2_final,HFv1 MMLU,24.79,,hf_open_llm_v1_240829_frozen.csv gpt2_final,HFv1 TruthfulQA,49.87,,hf_open_llm_v1_240829_frozen.csv gpt2_final,HFv1 Winogrande,49.96,,hf_open_llm_v1_240829_frozen.csv gpt2_guanaco_dolly_platypus,HF OpenLLM v1,28.52,,hf_open_llm_v1_240829_frozen.csv gpt2_guanaco_dolly_platypus,HFv1 ARC,23.55,,hf_open_llm_v1_240829_frozen.csv gpt2_guanaco_dolly_platypus,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv gpt2_guanaco_dolly_platypus,HFv1 HellaSwag,31.03,,hf_open_llm_v1_240829_frozen.csv gpt2_guanaco_dolly_platypus,HFv1 MMLU,26.4,,hf_open_llm_v1_240829_frozen.csv gpt2_guanaco_dolly_platypus,HFv1 TruthfulQA,40.02,,hf_open_llm_v1_240829_frozen.csv gpt2_guanaco_dolly_platypus,HFv1 Winogrande,50.12,,hf_open_llm_v1_240829_frozen.csv gpt2_large,HF OpenLLM v1,32.07,,hf_open_llm_v1_240829_frozen.csv gpt2_large,HFv1 ARC,25.77,,hf_open_llm_v1_240829_frozen.csv gpt2_large,HFv1 GSM8K,0.83,,hf_open_llm_v1_240829_frozen.csv gpt2_large,HFv1 HellaSwag,45.62,,hf_open_llm_v1_240829_frozen.csv gpt2_large,HFv1 MMLU,26.07,,hf_open_llm_v1_240829_frozen.csv gpt2_large,HFv1 TruthfulQA,38.72,,hf_open_llm_v1_240829_frozen.csv gpt2_large,HFv1 Winogrande,55.41,,hf_open_llm_v1_240829_frozen.csv gpt2_large_conversational,HF OpenLLM v1,32.33,,hf_open_llm_v1_240829_frozen.csv gpt2_large_conversational,HFv1 ARC,26.96,,hf_open_llm_v1_240829_frozen.csv gpt2_large_conversational,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv 
gpt2_large_conversational,HFv1 HellaSwag,44.98,,hf_open_llm_v1_240829_frozen.csv gpt2_large_conversational,HFv1 MMLU,26.33,,hf_open_llm_v1_240829_frozen.csv gpt2_large_conversational,HFv1 TruthfulQA,39.6,,hf_open_llm_v1_240829_frozen.csv gpt2_large_conversational,HFv1 Winogrande,56.04,,hf_open_llm_v1_240829_frozen.csv gpt2_open_platypus,HF OpenLLM v1,28.58,,hf_open_llm_v1_240829_frozen.csv gpt2_open_platypus,HFv1 ARC,22.18,,hf_open_llm_v1_240829_frozen.csv gpt2_open_platypus,HFv1 GSM8K,0.15,,hf_open_llm_v1_240829_frozen.csv gpt2_open_platypus,HFv1 HellaSwag,31.29,,hf_open_llm_v1_240829_frozen.csv gpt2_open_platypus,HFv1 MMLU,26.19,,hf_open_llm_v1_240829_frozen.csv gpt2_open_platypus,HFv1 TruthfulQA,40.35,,hf_open_llm_v1_240829_frozen.csv gpt2_open_platypus,HFv1 Winogrande,51.3,,hf_open_llm_v1_240829_frozen.csv gpt2_platypus_camel_physics,HF OpenLLM v1,28.41,,hf_open_llm_v1_240829_frozen.csv gpt2_platypus_camel_physics,HFv1 ARC,23.04,,hf_open_llm_v1_240829_frozen.csv gpt2_platypus_camel_physics,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv gpt2_platypus_camel_physics,HFv1 HellaSwag,31.32,,hf_open_llm_v1_240829_frozen.csv gpt2_platypus_camel_physics,HFv1 MMLU,26.91,,hf_open_llm_v1_240829_frozen.csv gpt2_platypus_camel_physics,HFv1 TruthfulQA,39.56,,hf_open_llm_v1_240829_frozen.csv gpt2_platypus_camel_physics,HFv1 Winogrande,51.54,,hf_open_llm_v1_240829_frozen.csv gpt2_platypus_dolly_guanaco,HF OpenLLM v1,28.51,,hf_open_llm_v1_240829_frozen.csv gpt2_platypus_dolly_guanaco,HFv1 ARC,23.21,,hf_open_llm_v1_240829_frozen.csv gpt2_platypus_dolly_guanaco,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv gpt2_platypus_dolly_guanaco,HFv1 HellaSwag,31.04,,hf_open_llm_v1_240829_frozen.csv gpt2_platypus_dolly_guanaco,HFv1 MMLU,26.16,,hf_open_llm_v1_240829_frozen.csv gpt2_platypus_dolly_guanaco,HFv1 TruthfulQA,40.31,,hf_open_llm_v1_240829_frozen.csv gpt2_platypus_dolly_guanaco,HFv1 Winogrande,50.36,,hf_open_llm_v1_240829_frozen.csv gpt2_test,HF OpenLLM v1,28.4,,hf_open_llm_v1_240829_frozen.csv gpt2_test,HFv1 ARC,21.84,,hf_open_llm_v1_240829_frozen.csv gpt2_test,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv gpt2_test,HFv1 HellaSwag,31.6,,hf_open_llm_v1_240829_frozen.csv gpt2_test,HFv1 MMLU,25.86,,hf_open_llm_v1_240829_frozen.csv gpt2_test,HFv1 TruthfulQA,40.67,,hf_open_llm_v1_240829_frozen.csv gpt2_test,HFv1 Winogrande,50.12,,hf_open_llm_v1_240829_frozen.csv gpt2_xl,HF OpenLLM v1,34.38,,hf_open_llm_v1_240829_frozen.csv gpt2_xl,HFv1 ARC,30.29,,hf_open_llm_v1_240829_frozen.csv gpt2_xl,HFv1 GSM8K,1.29,,hf_open_llm_v1_240829_frozen.csv gpt2_xl,HFv1 HellaSwag,51.36,,hf_open_llm_v1_240829_frozen.csv gpt2_xl,HFv1 MMLU,26.54,,hf_open_llm_v1_240829_frozen.csv gpt2_xl,HFv1 TruthfulQA,38.54,,hf_open_llm_v1_240829_frozen.csv gpt2_xl,HFv1 Winogrande,58.25,,hf_open_llm_v1_240829_frozen.csv gpt2_xl_lima,HF OpenLLM v1,34.12,,hf_open_llm_v1_240829_frozen.csv gpt2_xl_lima,HFv1 ARC,31.14,,hf_open_llm_v1_240829_frozen.csv gpt2_xl_lima,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv gpt2_xl_lima,HFv1 HellaSwag,51.28,,hf_open_llm_v1_240829_frozen.csv gpt2_xl_lima,HFv1 MMLU,25.43,,hf_open_llm_v1_240829_frozen.csv gpt2_xl_lima,HFv1 TruthfulQA,38.74,,hf_open_llm_v1_240829_frozen.csv gpt2_xl_lima,HFv1 Winogrande,57.22,,hf_open_llm_v1_240829_frozen.csv gpt3_finnish_13b,HF OpenLLM v1,32.95,,hf_open_llm_v1_240829_frozen.csv gpt3_finnish_13b,HFv1 ARC,24.66,,hf_open_llm_v1_240829_frozen.csv gpt3_finnish_13b,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv gpt3_finnish_13b,HFv1 HellaSwag,46.76,,hf_open_llm_v1_240829_frozen.csv 
gpt3_finnish_13b,HFv1 MMLU,23.49,,hf_open_llm_v1_240829_frozen.csv gpt3_finnish_13b,HFv1 TruthfulQA,44.47,,hf_open_llm_v1_240829_frozen.csv gpt3_finnish_13b,HFv1 Winogrande,58.01,,hf_open_llm_v1_240829_frozen.csv gpt3_finnish_large,HF OpenLLM v1,29.11,,hf_open_llm_v1_240829_frozen.csv gpt3_finnish_large,HFv1 ARC,21.76,,hf_open_llm_v1_240829_frozen.csv gpt3_finnish_large,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv gpt3_finnish_large,HFv1 HellaSwag,32.88,,hf_open_llm_v1_240829_frozen.csv gpt3_finnish_large,HFv1 MMLU,24.11,,hf_open_llm_v1_240829_frozen.csv gpt3_finnish_large,HFv1 TruthfulQA,44.35,,hf_open_llm_v1_240829_frozen.csv gpt3_finnish_large,HFv1 Winogrande,51.54,,hf_open_llm_v1_240829_frozen.csv gpt3_finnish_small,HF OpenLLM v1,27.95,,hf_open_llm_v1_240829_frozen.csv gpt3_finnish_small,HFv1 ARC,20.48,,hf_open_llm_v1_240829_frozen.csv gpt3_finnish_small,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv gpt3_finnish_small,HFv1 HellaSwag,28.09,,hf_open_llm_v1_240829_frozen.csv gpt3_finnish_small,HFv1 MMLU,24.47,,hf_open_llm_v1_240829_frozen.csv gpt3_finnish_small,HFv1 TruthfulQA,46.47,,hf_open_llm_v1_240829_frozen.csv gpt3_finnish_small,HFv1 Winogrande,48.22,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_115k_steps,HF OpenLLM v1,29.21,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_115k_steps,HFv1 ARC,22.7,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_115k_steps,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_115k_steps,HFv1 HellaSwag,27.79,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_115k_steps,HFv1 MMLU,25.52,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_115k_steps,HFv1 TruthfulQA,45.67,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_115k_steps,HFv1 Winogrande,53.51,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_20k_steps,HF OpenLLM v1,28.65,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_20k_steps,HFv1 ARC,22.78,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_20k_steps,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_20k_steps,HFv1 HellaSwag,25.81,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_20k_steps,HFv1 MMLU,23.84,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_20k_steps,HFv1 TruthfulQA,50.99,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_20k_steps,HFv1 Winogrande,48.46,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_32k_steps,HF OpenLLM v1,28.59,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_32k_steps,HFv1 ARC,22.53,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_32k_steps,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_32k_steps,HFv1 HellaSwag,26.39,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_32k_steps,HFv1 MMLU,23.73,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_32k_steps,HFv1 TruthfulQA,49.25,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_32k_steps,HFv1 Winogrande,49.64,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_40k_steps,HF OpenLLM v1,28.55,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_40k_steps,HFv1 ARC,22.87,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_40k_steps,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_40k_steps,HFv1 HellaSwag,26.55,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_40k_steps,HFv1 MMLU,24.15,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_40k_steps,HFv1 TruthfulQA,47.84,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_40k_steps,HFv1 Winogrande,49.88,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_43k_steps,HF OpenLLM v1,28.79,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_43k_steps,HFv1 ARC,22.78,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_43k_steps,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv 
gpt_2_large_43k_steps,HFv1 HellaSwag,26.66,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_43k_steps,HFv1 MMLU,24.05,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_43k_steps,HFv1 TruthfulQA,48.32,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_43k_steps,HFv1 Winogrande,50.91,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_51k_steps,HF OpenLLM v1,28.65,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_51k_steps,HFv1 ARC,22.78,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_51k_steps,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_51k_steps,HFv1 HellaSwag,25.81,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_51k_steps,HFv1 MMLU,23.84,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_51k_steps,HFv1 TruthfulQA,50.99,,hf_open_llm_v1_240829_frozen.csv gpt_2_large_51k_steps,HFv1 Winogrande,48.46,,hf_open_llm_v1_240829_frozen.csv gpt_2_slimorcadeduped_airoboros_3_1_metamathqa_sft_124m,HF OpenLLM v1,28.3,,hf_open_llm_v1_240829_frozen.csv gpt_2_slimorcadeduped_airoboros_3_1_metamathqa_sft_124m,HFv1 ARC,24.57,,hf_open_llm_v1_240829_frozen.csv gpt_2_slimorcadeduped_airoboros_3_1_metamathqa_sft_124m,HFv1 GSM8K,2.12,,hf_open_llm_v1_240829_frozen.csv gpt_2_slimorcadeduped_airoboros_3_1_metamathqa_sft_124m,HFv1 HellaSwag,29.43,,hf_open_llm_v1_240829_frozen.csv gpt_2_slimorcadeduped_airoboros_3_1_metamathqa_sft_124m,HFv1 MMLU,25.82,,hf_open_llm_v1_240829_frozen.csv gpt_2_slimorcadeduped_airoboros_3_1_metamathqa_sft_124m,HFv1 TruthfulQA,38.84,,hf_open_llm_v1_240829_frozen.csv gpt_2_slimorcadeduped_airoboros_3_1_metamathqa_sft_124m,HFv1 Winogrande,49.01,,hf_open_llm_v1_240829_frozen.csv gpt_2_xl_camel_ai_physics,HF OpenLLM v1,33.96,,hf_open_llm_v1_240829_frozen.csv gpt_2_xl_camel_ai_physics,HFv1 ARC,29.52,,hf_open_llm_v1_240829_frozen.csv gpt_2_xl_camel_ai_physics,HFv1 GSM8K,0.15,,hf_open_llm_v1_240829_frozen.csv gpt_2_xl_camel_ai_physics,HFv1 HellaSwag,50.62,,hf_open_llm_v1_240829_frozen.csv gpt_2_xl_camel_ai_physics,HFv1 MMLU,26.79,,hf_open_llm_v1_240829_frozen.csv gpt_2_xl_camel_ai_physics,HFv1 TruthfulQA,39.12,,hf_open_llm_v1_240829_frozen.csv gpt_2_xl_camel_ai_physics,HFv1 Winogrande,57.54,,hf_open_llm_v1_240829_frozen.csv gpt_bigcode_santacoder,HF OpenLLM v1,28.49,,hf_open_llm_v1_240829_frozen.csv gpt_bigcode_santacoder,HFv1 ARC,21.16,,hf_open_llm_v1_240829_frozen.csv gpt_bigcode_santacoder,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv gpt_bigcode_santacoder,HFv1 HellaSwag,30.84,,hf_open_llm_v1_240829_frozen.csv gpt_bigcode_santacoder,HFv1 MMLU,24.97,,hf_open_llm_v1_240829_frozen.csv gpt_bigcode_santacoder,HFv1 TruthfulQA,45.64,,hf_open_llm_v1_240829_frozen.csv gpt_bigcode_santacoder,HFv1 Winogrande,47.83,,hf_open_llm_v1_240829_frozen.csv gpt_j_6b,HF OpenLLM v1,40.1,,hf_open_llm_v1_240829_frozen.csv gpt_j_6b,HFv1 ARC,41.38,,hf_open_llm_v1_240829_frozen.csv gpt_j_6b,HFv1 GSM8K,2.96,,hf_open_llm_v1_240829_frozen.csv gpt_j_6b,HFv1 HellaSwag,67.54,,hf_open_llm_v1_240829_frozen.csv gpt_j_6b,HFv1 MMLU,26.78,,hf_open_llm_v1_240829_frozen.csv gpt_j_6b,HFv1 TruthfulQA,35.96,,hf_open_llm_v1_240829_frozen.csv gpt_j_6b,HFv1 Winogrande,65.98,,hf_open_llm_v1_240829_frozen.csv gpt_neo_125m,HF OpenLLM v1,29.47,,hf_open_llm_v1_240829_frozen.csv gpt_neo_125m,HFv1 ARC,22.95,,hf_open_llm_v1_240829_frozen.csv gpt_neo_125m,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv gpt_neo_125m,HFv1 HellaSwag,30.26,,hf_open_llm_v1_240829_frozen.csv gpt_neo_125m,HFv1 MMLU,25.97,,hf_open_llm_v1_240829_frozen.csv gpt_neo_125m,HFv1 TruthfulQA,45.58,,hf_open_llm_v1_240829_frozen.csv gpt_neo_125m,HFv1 
Winogrande,51.78,,hf_open_llm_v1_240829_frozen.csv gpt_neo_1_3b,HF OpenLLM v1,33.58,,hf_open_llm_v1_240829_frozen.csv gpt_neo_1_3b,HFv1 ARC,31.23,,hf_open_llm_v1_240829_frozen.csv gpt_neo_1_3b,HFv1 GSM8K,0.45,,hf_open_llm_v1_240829_frozen.csv gpt_neo_1_3b,HFv1 HellaSwag,48.47,,hf_open_llm_v1_240829_frozen.csv gpt_neo_1_3b,HFv1 MMLU,24.82,,hf_open_llm_v1_240829_frozen.csv gpt_neo_1_3b,HFv1 TruthfulQA,39.63,,hf_open_llm_v1_240829_frozen.csv gpt_neo_1_3b,HFv1 Winogrande,56.91,,hf_open_llm_v1_240829_frozen.csv gpt_neo_2_7b,HF OpenLLM v1,36.2,,hf_open_llm_v1_240829_frozen.csv gpt_neo_2_7b,HFv1 ARC,33.36,,hf_open_llm_v1_240829_frozen.csv gpt_neo_2_7b,HFv1 GSM8K,1.29,,hf_open_llm_v1_240829_frozen.csv gpt_neo_2_7b,HFv1 HellaSwag,56.24,,hf_open_llm_v1_240829_frozen.csv gpt_neo_2_7b,HFv1 MMLU,26.45,,hf_open_llm_v1_240829_frozen.csv gpt_neo_2_7b,HFv1 TruthfulQA,39.78,,hf_open_llm_v1_240829_frozen.csv gpt_neo_2_7b,HFv1 Winogrande,60.06,,hf_open_llm_v1_240829_frozen.csv gpt_neox_20b,HF OpenLLM v1,41.69,,hf_open_llm_v1_240829_frozen.csv gpt_neox_20b,HFv1 ARC,45.73,,hf_open_llm_v1_240829_frozen.csv gpt_neox_20b,HFv1 GSM8K,5.46,,hf_open_llm_v1_240829_frozen.csv gpt_neox_20b,HFv1 HellaSwag,73.45,,hf_open_llm_v1_240829_frozen.csv gpt_neox_20b,HFv1 MMLU,25.0,,hf_open_llm_v1_240829_frozen.csv gpt_neox_20b,HFv1 TruthfulQA,31.61,,hf_open_llm_v1_240829_frozen.csv gpt_neox_20b,HFv1 Winogrande,68.9,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_126m,HF OpenLLM v1,28.49,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_126m,HFv1 ARC,22.18,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_126m,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_126m,HFv1 HellaSwag,29.56,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_126m,HFv1 MMLU,24.53,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_126m,HFv1 TruthfulQA,44.07,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_126m,HFv1 Winogrande,50.67,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_126m_instruct,HF OpenLLM v1,28.2,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_126m_instruct,HFv1 ARC,23.38,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_126m_instruct,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_126m_instruct,HFv1 HellaSwag,29.88,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_126m_instruct,HFv1 MMLU,23.78,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_126m_instruct,HFv1 TruthfulQA,42.65,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_126m_instruct,HFv1 Winogrande,48.54,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_1_3b,HF OpenLLM v1,34.31,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_1_3b,HFv1 ARC,30.38,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_1_3b,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_1_3b,HFv1 HellaSwag,50.4,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_1_3b,HFv1 MMLU,26.14,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_1_3b,HFv1 TruthfulQA,39.97,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_1_3b,HFv1 Winogrande,58.88,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_1_3b_instruct,HF OpenLLM v1,34.54,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_1_3b_instruct,HFv1 ARC,30.97,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_1_3b_instruct,HFv1 GSM8K,1.59,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_1_3b_instruct,HFv1 HellaSwag,51.42,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_1_3b_instruct,HFv1 MMLU,26.17,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_1_3b_instruct,HFv1 TruthfulQA,40.31,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_1_3b_instruct,HFv1 Winogrande,56.75,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_20b,HF OpenLLM v1,40.71,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_20b,HFv1 ARC,41.81,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_20b,HFv1 
GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_20b,HFv1 HellaSwag,68.75,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_20b,HFv1 MMLU,28.47,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_20b,HFv1 TruthfulQA,37.1,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_20b,HFv1 Winogrande,67.17,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_20b_instruct,HF OpenLLM v1,43.7,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_20b_instruct,HFv1 ARC,43.17,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_20b_instruct,HFv1 GSM8K,8.79,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_20b_instruct,HFv1 HellaSwag,71.09,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_20b_instruct,HFv1 MMLU,31.32,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_20b_instruct,HFv1 TruthfulQA,41.02,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_20b_instruct,HFv1 Winogrande,66.77,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_356m,HF OpenLLM v1,30.41,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_356m,HFv1 ARC,23.63,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_356m,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_356m,HFv1 HellaSwag,37.05,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_356m,HFv1 MMLU,25.93,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_356m,HFv1 TruthfulQA,42.55,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_356m,HFv1 Winogrande,53.04,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_356m_instruct,HF OpenLLM v1,30.93,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_356m_instruct,HFv1 ARC,26.96,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_356m_instruct,HFv1 GSM8K,1.74,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_356m_instruct,HFv1 HellaSwag,38.01,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_356m_instruct,HFv1 MMLU,25.53,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_356m_instruct,HFv1 TruthfulQA,40.74,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_356m_instruct,HFv1 Winogrande,52.57,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_40b,HF OpenLLM v1,43.42,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_40b,HFv1 ARC,43.0,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_40b,HFv1 GSM8K,4.7,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_40b,HFv1 HellaSwag,72.37,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_40b,HFv1 MMLU,34.97,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_40b,HFv1 TruthfulQA,37.52,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_40b,HFv1 Winogrande,67.96,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_6_7b,HF OpenLLM v1,37.23,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_6_7b,HFv1 ARC,36.35,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_6_7b,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_6_7b,HFv1 HellaSwag,60.75,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_6_7b,HFv1 MMLU,26.0,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_6_7b,HFv1 TruthfulQA,39.04,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_6_7b,HFv1 Winogrande,60.69,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_6_7b_v2,HF OpenLLM v1,39.49,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_6_7b_v2,HFv1 ARC,39.42,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_6_7b_v2,HFv1 GSM8K,1.21,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_6_7b_v2,HFv1 HellaSwag,66.39,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_6_7b_v2,HFv1 MMLU,30.09,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_6_7b_v2,HFv1 TruthfulQA,35.6,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_6_7b_v2,HFv1 Winogrande,64.25,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_6_7b_v2_instruct,HF OpenLLM v1,41.72,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_6_7b_v2_instruct,HFv1 ARC,40.78,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_6_7b_v2_instruct,HFv1 GSM8K,6.37,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_6_7b_v2_instruct,HFv1 HellaSwag,67.77,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_6_7b_v2_instruct,HFv1 
MMLU,31.57,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_6_7b_v2_instruct,HFv1 TruthfulQA,40.32,,hf_open_llm_v1_240829_frozen.csv gpt_sw3_6_7b_v2_instruct,HFv1 Winogrande,63.54,,hf_open_llm_v1_240829_frozen.csv gptneo350m_instruct_sft,HF OpenLLM v1,31.0,,hf_open_llm_v1_240829_frozen.csv gptneo350m_instruct_sft,HFv1 ARC,25.94,,hf_open_llm_v1_240829_frozen.csv gptneo350m_instruct_sft,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv gptneo350m_instruct_sft,HFv1 HellaSwag,38.55,,hf_open_llm_v1_240829_frozen.csv gptneo350m_instruct_sft,HFv1 MMLU,25.76,,hf_open_llm_v1_240829_frozen.csv gptneo350m_instruct_sft,HFv1 TruthfulQA,45.25,,hf_open_llm_v1_240829_frozen.csv gptneo350m_instruct_sft,HFv1 Winogrande,50.2,,hf_open_llm_v1_240829_frozen.csv greennodelm_7b_v1olet,HF OpenLLM v1,73.68,,hf_open_llm_v1_240829_frozen.csv greennodelm_7b_v1olet,HFv1 ARC,72.61,,hf_open_llm_v1_240829_frozen.csv greennodelm_7b_v1olet,HFv1 GSM8K,66.87,,hf_open_llm_v1_240829_frozen.csv greennodelm_7b_v1olet,HFv1 HellaSwag,87.7,,hf_open_llm_v1_240829_frozen.csv greennodelm_7b_v1olet,HFv1 MMLU,63.51,,hf_open_llm_v1_240829_frozen.csv greennodelm_7b_v1olet,HFv1 TruthfulQA,69.07,,hf_open_llm_v1_240829_frozen.csv greennodelm_7b_v1olet,HFv1 Winogrande,82.32,,hf_open_llm_v1_240829_frozen.csv greennodelm_7b_v2leo,HF OpenLLM v1,73.29,,hf_open_llm_v1_240829_frozen.csv greennodelm_7b_v2leo,HFv1 ARC,69.8,,hf_open_llm_v1_240829_frozen.csv greennodelm_7b_v2leo,HFv1 GSM8K,67.1,,hf_open_llm_v1_240829_frozen.csv greennodelm_7b_v2leo,HFv1 HellaSwag,88.02,,hf_open_llm_v1_240829_frozen.csv greennodelm_7b_v2leo,HFv1 MMLU,65.0,,hf_open_llm_v1_240829_frozen.csv greennodelm_7b_v2leo,HFv1 TruthfulQA,67.83,,hf_open_llm_v1_240829_frozen.csv greennodelm_7b_v2leo,HFv1 Winogrande,82.0,,hf_open_llm_v1_240829_frozen.csv greennodelm_7b_v4leo,HF OpenLLM v1,74.18,,hf_open_llm_v1_240829_frozen.csv greennodelm_7b_v4leo,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv greennodelm_7b_v4leo,HFv1 GSM8K,68.61,,hf_open_llm_v1_240829_frozen.csv greennodelm_7b_v4leo,HFv1 HellaSwag,88.24,,hf_open_llm_v1_240829_frozen.csv greennodelm_7b_v4leo,HFv1 MMLU,65.01,,hf_open_llm_v1_240829_frozen.csv greennodelm_7b_v4leo,HFv1 TruthfulQA,69.65,,hf_open_llm_v1_240829_frozen.csv greennodelm_7b_v4leo,HFv1 Winogrande,82.32,,hf_open_llm_v1_240829_frozen.csv griffin_c3t_8l_v0_02_fineweb,HF OpenLLM v1,29.2,,hf_open_llm_v1_240829_frozen.csv griffin_c3t_8l_v0_02_fineweb,HFv1 ARC,23.29,,hf_open_llm_v1_240829_frozen.csv griffin_c3t_8l_v0_02_fineweb,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv griffin_c3t_8l_v0_02_fineweb,HFv1 HellaSwag,25.72,,hf_open_llm_v1_240829_frozen.csv griffin_c3t_8l_v0_02_fineweb,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv griffin_c3t_8l_v0_02_fineweb,HFv1 TruthfulQA,52.11,,hf_open_llm_v1_240829_frozen.csv griffin_c3t_8l_v0_02_fineweb,HFv1 Winogrande,50.99,,hf_open_llm_v1_240829_frozen.csv griffin_llama3t_8l_v0_02_fineweb,HF OpenLLM v1,28.49,,hf_open_llm_v1_240829_frozen.csv griffin_llama3t_8l_v0_02_fineweb,HFv1 ARC,23.46,,hf_open_llm_v1_240829_frozen.csv griffin_llama3t_8l_v0_02_fineweb,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv griffin_llama3t_8l_v0_02_fineweb,HFv1 HellaSwag,25.48,,hf_open_llm_v1_240829_frozen.csv griffin_llama3t_8l_v0_02_fineweb,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv griffin_llama3t_8l_v0_02_fineweb,HFv1 TruthfulQA,50.32,,hf_open_llm_v1_240829_frozen.csv griffin_llama3t_8l_v0_02_fineweb,HFv1 Winogrande,48.54,,hf_open_llm_v1_240829_frozen.csv griffon_7b_model_stock,HF OpenLLM v1,74.83,,hf_open_llm_v1_240829_frozen.csv 
griffon_7b_model_stock,HFv1 ARC,72.95,,hf_open_llm_v1_240829_frozen.csv griffon_7b_model_stock,HFv1 GSM8K,67.93,,hf_open_llm_v1_240829_frozen.csv griffon_7b_model_stock,HFv1 HellaSwag,88.29,,hf_open_llm_v1_240829_frozen.csv griffon_7b_model_stock,HFv1 MMLU,64.65,,hf_open_llm_v1_240829_frozen.csv griffon_7b_model_stock,HFv1 TruthfulQA,71.48,,hf_open_llm_v1_240829_frozen.csv griffon_7b_model_stock,HFv1 Winogrande,83.66,,hf_open_llm_v1_240829_frozen.csv grindin,HF OpenLLM v1,72.18,,hf_open_llm_v1_240829_frozen.csv grindin,HFv1 ARC,69.88,,hf_open_llm_v1_240829_frozen.csv grindin,HFv1 GSM8K,70.96,,hf_open_llm_v1_240829_frozen.csv grindin,HFv1 HellaSwag,87.02,,hf_open_llm_v1_240829_frozen.csv grindin,HFv1 MMLU,64.98,,hf_open_llm_v1_240829_frozen.csv grindin,HFv1 TruthfulQA,59.34,,hf_open_llm_v1_240829_frozen.csv grindin,HFv1 Winogrande,80.9,,hf_open_llm_v1_240829_frozen.csv gzdx,HF OpenLLM v1,37.97,,hf_open_llm_v1_240829_frozen.csv gzdx,HFv1 ARC,35.75,,hf_open_llm_v1_240829_frozen.csv gzdx,HFv1 GSM8K,10.77,,hf_open_llm_v1_240829_frozen.csv gzdx,HFv1 HellaSwag,55.57,,hf_open_llm_v1_240829_frozen.csv gzdx,HFv1 MMLU,25.19,,hf_open_llm_v1_240829_frozen.csv gzdx,HFv1 TruthfulQA,42.03,,hf_open_llm_v1_240829_frozen.csv gzdx,HFv1 Winogrande,58.48,,hf_open_llm_v1_240829_frozen.csv gzdx_1_1b,HF OpenLLM v1,39.35,,hf_open_llm_v1_240829_frozen.csv gzdx_1_1b,HFv1 ARC,37.03,,hf_open_llm_v1_240829_frozen.csv gzdx_1_1b,HFv1 GSM8K,9.48,,hf_open_llm_v1_240829_frozen.csv gzdx_1_1b,HFv1 HellaSwag,54.67,,hf_open_llm_v1_240829_frozen.csv gzdx_1_1b,HFv1 MMLU,35.5,,hf_open_llm_v1_240829_frozen.csv gzdx_1_1b,HFv1 TruthfulQA,40.47,,hf_open_llm_v1_240829_frozen.csv gzdx_1_1b,HFv1 Winogrande,58.96,,hf_open_llm_v1_240829_frozen.csv h2o_danube2_1_8b_chat,HF OpenLLM v1,49.26,,hf_open_llm_v1_240829_frozen.csv h2o_danube2_1_8b_chat,HFv1 ARC,43.69,,hf_open_llm_v1_240829_frozen.csv h2o_danube2_1_8b_chat,HFv1 GSM8K,30.55,,hf_open_llm_v1_240829_frozen.csv h2o_danube2_1_8b_chat,HFv1 HellaSwag,73.95,,hf_open_llm_v1_240829_frozen.csv h2o_danube2_1_8b_chat,HFv1 MMLU,38.02,,hf_open_llm_v1_240829_frozen.csv h2o_danube2_1_8b_chat,HFv1 TruthfulQA,40.54,,hf_open_llm_v1_240829_frozen.csv h2o_danube2_1_8b_chat,HFv1 Winogrande,68.9,,hf_open_llm_v1_240829_frozen.csv h2o_danube_1_8b_base,HF OpenLLM v1,39.12,,hf_open_llm_v1_240829_frozen.csv h2o_danube_1_8b_base,HFv1 ARC,39.42,,hf_open_llm_v1_240829_frozen.csv h2o_danube_1_8b_base,HFv1 GSM8K,1.44,,hf_open_llm_v1_240829_frozen.csv h2o_danube_1_8b_base,HFv1 HellaSwag,69.58,,hf_open_llm_v1_240829_frozen.csv h2o_danube_1_8b_base,HFv1 MMLU,25.94,,hf_open_llm_v1_240829_frozen.csv h2o_danube_1_8b_base,HFv1 TruthfulQA,33.86,,hf_open_llm_v1_240829_frozen.csv h2o_danube_1_8b_base,HFv1 Winogrande,64.48,,hf_open_llm_v1_240829_frozen.csv h2o_danube_1_8b_chat,HF OpenLLM v1,44.49,,hf_open_llm_v1_240829_frozen.csv h2o_danube_1_8b_chat,HFv1 ARC,41.13,,hf_open_llm_v1_240829_frozen.csv h2o_danube_1_8b_chat,HFv1 GSM8K,17.36,,hf_open_llm_v1_240829_frozen.csv h2o_danube_1_8b_chat,HFv1 HellaSwag,68.06,,hf_open_llm_v1_240829_frozen.csv h2o_danube_1_8b_chat,HFv1 MMLU,33.41,,hf_open_llm_v1_240829_frozen.csv h2o_danube_1_8b_chat,HFv1 TruthfulQA,41.64,,hf_open_llm_v1_240829_frozen.csv h2o_danube_1_8b_chat,HFv1 Winogrande,65.35,,hf_open_llm_v1_240829_frozen.csv h4rmoniousanthea,HF OpenLLM v1,59.76,,hf_open_llm_v1_240829_frozen.csv h4rmoniousanthea,HFv1 ARC,65.87,,hf_open_llm_v1_240829_frozen.csv h4rmoniousanthea,HFv1 GSM8K,12.96,,hf_open_llm_v1_240829_frozen.csv h4rmoniousanthea,HFv1 
HellaSwag,84.09,,hf_open_llm_v1_240829_frozen.csv h4rmoniousanthea,HFv1 MMLU,63.67,,hf_open_llm_v1_240829_frozen.csv h4rmoniousanthea,HFv1 TruthfulQA,55.08,,hf_open_llm_v1_240829_frozen.csv h4rmoniousanthea,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv halu_oas_8b_llama3,HF OpenLLM v1,69.51,,hf_open_llm_v1_240829_frozen.csv halu_oas_8b_llama3,HFv1 ARC,64.08,,hf_open_llm_v1_240829_frozen.csv halu_oas_8b_llama3,HFv1 GSM8K,68.61,,hf_open_llm_v1_240829_frozen.csv halu_oas_8b_llama3,HFv1 HellaSwag,83.35,,hf_open_llm_v1_240829_frozen.csv halu_oas_8b_llama3,HFv1 MMLU,67.8,,hf_open_llm_v1_240829_frozen.csv halu_oas_8b_llama3,HFv1 TruthfulQA,53.45,,hf_open_llm_v1_240829_frozen.csv halu_oas_8b_llama3,HFv1 Winogrande,79.79,,hf_open_llm_v1_240829_frozen.csv han_llm_7b_v2,HF OpenLLM v1,59.06,,hf_open_llm_v1_240829_frozen.csv han_llm_7b_v2,HFv1 ARC,58.79,,hf_open_llm_v1_240829_frozen.csv han_llm_7b_v2,HFv1 GSM8K,33.51,,hf_open_llm_v1_240829_frozen.csv han_llm_7b_v2,HFv1 HellaSwag,81.75,,hf_open_llm_v1_240829_frozen.csv han_llm_7b_v2,HFv1 MMLU,59.93,,hf_open_llm_v1_240829_frozen.csv han_llm_7b_v2,HFv1 TruthfulQA,42.38,,hf_open_llm_v1_240829_frozen.csv han_llm_7b_v2,HFv1 Winogrande,77.98,,hf_open_llm_v1_240829_frozen.csv harpy_7b_model_stock,HF OpenLLM v1,75.51,,hf_open_llm_v1_240829_frozen.csv harpy_7b_model_stock,HFv1 ARC,73.21,,hf_open_llm_v1_240829_frozen.csv harpy_7b_model_stock,HFv1 GSM8K,69.45,,hf_open_llm_v1_240829_frozen.csv harpy_7b_model_stock,HFv1 HellaSwag,88.72,,hf_open_llm_v1_240829_frozen.csv harpy_7b_model_stock,HFv1 MMLU,65.07,,hf_open_llm_v1_240829_frozen.csv harpy_7b_model_stock,HFv1 TruthfulQA,71.35,,hf_open_llm_v1_240829_frozen.csv harpy_7b_model_stock,HFv1 Winogrande,85.24,,hf_open_llm_v1_240829_frozen.csv healix_1_1b_v1_chat_ddpo,HF OpenLLM v1,33.0,,hf_open_llm_v1_240829_frozen.csv healix_1_1b_v1_chat_ddpo,HFv1 ARC,30.55,,hf_open_llm_v1_240829_frozen.csv healix_1_1b_v1_chat_ddpo,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv healix_1_1b_v1_chat_ddpo,HFv1 HellaSwag,44.78,,hf_open_llm_v1_240829_frozen.csv healix_1_1b_v1_chat_ddpo,HFv1 MMLU,24.64,,hf_open_llm_v1_240829_frozen.csv healix_1_1b_v1_chat_ddpo,HFv1 TruthfulQA,41.55,,hf_open_llm_v1_240829_frozen.csv healix_1_1b_v1_chat_ddpo,HFv1 Winogrande,56.51,,hf_open_llm_v1_240829_frozen.csv healix_3b,HF OpenLLM v1,38.93,,hf_open_llm_v1_240829_frozen.csv healix_3b,HFv1 ARC,37.71,,hf_open_llm_v1_240829_frozen.csv healix_3b,HFv1 GSM8K,0.76,,hf_open_llm_v1_240829_frozen.csv healix_3b,HFv1 HellaSwag,65.94,,hf_open_llm_v1_240829_frozen.csv healix_3b,HFv1 MMLU,26.02,,hf_open_llm_v1_240829_frozen.csv healix_3b,HFv1 TruthfulQA,37.4,,hf_open_llm_v1_240829_frozen.csv healix_3b,HFv1 Winogrande,65.75,,hf_open_llm_v1_240829_frozen.csv helpingai_110m,HF OpenLLM v1,29.05,,hf_open_llm_v1_240829_frozen.csv helpingai_110m,HFv1 ARC,22.78,,hf_open_llm_v1_240829_frozen.csv helpingai_110m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv helpingai_110m,HFv1 HellaSwag,28.02,,hf_open_llm_v1_240829_frozen.csv helpingai_110m,HFv1 MMLU,23.66,,hf_open_llm_v1_240829_frozen.csv helpingai_110m,HFv1 TruthfulQA,48.25,,hf_open_llm_v1_240829_frozen.csv helpingai_110m,HFv1 Winogrande,51.62,,hf_open_llm_v1_240829_frozen.csv helpingai_3b,HF OpenLLM v1,55.59,,hf_open_llm_v1_240829_frozen.csv helpingai_3b,HFv1 ARC,50.6,,hf_open_llm_v1_240829_frozen.csv helpingai_3b,HFv1 GSM8K,36.09,,hf_open_llm_v1_240829_frozen.csv helpingai_3b,HFv1 HellaSwag,76.64,,hf_open_llm_v1_240829_frozen.csv helpingai_3b,HFv1 MMLU,46.82,,hf_open_llm_v1_240829_frozen.csv helpingai_3b,HFv1 
TruthfulQA,55.62,,hf_open_llm_v1_240829_frozen.csv helpingai_3b,HFv1 Winogrande,67.8,,hf_open_llm_v1_240829_frozen.csv helpingai_9b,HF OpenLLM v1,63.33,,hf_open_llm_v1_240829_frozen.csv helpingai_9b,HFv1 ARC,58.87,,hf_open_llm_v1_240829_frozen.csv helpingai_9b,HFv1 GSM8K,53.75,,hf_open_llm_v1_240829_frozen.csv helpingai_9b,HFv1 HellaSwag,79.16,,hf_open_llm_v1_240829_frozen.csv helpingai_9b,HFv1 MMLU,65.01,,hf_open_llm_v1_240829_frozen.csv helpingai_9b,HFv1 TruthfulQA,48.52,,hf_open_llm_v1_240829_frozen.csv helpingai_9b,HFv1 Winogrande,74.66,,hf_open_llm_v1_240829_frozen.csv hercules_1_0_mistral_7b,HF OpenLLM v1,58.95,,hf_open_llm_v1_240829_frozen.csv hercules_1_0_mistral_7b,HFv1 ARC,57.08,,hf_open_llm_v1_240829_frozen.csv hercules_1_0_mistral_7b,HFv1 GSM8K,29.87,,hf_open_llm_v1_240829_frozen.csv hercules_1_0_mistral_7b,HFv1 HellaSwag,81.13,,hf_open_llm_v1_240829_frozen.csv hercules_1_0_mistral_7b,HFv1 MMLU,58.98,,hf_open_llm_v1_240829_frozen.csv hercules_1_0_mistral_7b,HFv1 TruthfulQA,49.47,,hf_open_llm_v1_240829_frozen.csv hercules_1_0_mistral_7b,HFv1 Winogrande,77.19,,hf_open_llm_v1_240829_frozen.csv hercules_2_0_mistral_7b,HF OpenLLM v1,62.69,,hf_open_llm_v1_240829_frozen.csv hercules_2_0_mistral_7b,HFv1 ARC,61.09,,hf_open_llm_v1_240829_frozen.csv hercules_2_0_mistral_7b,HFv1 GSM8K,44.43,,hf_open_llm_v1_240829_frozen.csv hercules_2_0_mistral_7b,HFv1 HellaSwag,83.69,,hf_open_llm_v1_240829_frozen.csv hercules_2_0_mistral_7b,HFv1 MMLU,63.47,,hf_open_llm_v1_240829_frozen.csv hercules_2_0_mistral_7b,HFv1 TruthfulQA,43.97,,hf_open_llm_v1_240829_frozen.csv hercules_2_0_mistral_7b,HFv1 Winogrande,79.48,,hf_open_llm_v1_240829_frozen.csv hercules_2_5_mistral_7b,HF OpenLLM v1,63.59,,hf_open_llm_v1_240829_frozen.csv hercules_2_5_mistral_7b,HFv1 ARC,62.03,,hf_open_llm_v1_240829_frozen.csv hercules_2_5_mistral_7b,HFv1 GSM8K,49.05,,hf_open_llm_v1_240829_frozen.csv hercules_2_5_mistral_7b,HFv1 HellaSwag,83.79,,hf_open_llm_v1_240829_frozen.csv hercules_2_5_mistral_7b,HFv1 MMLU,63.49,,hf_open_llm_v1_240829_frozen.csv hercules_2_5_mistral_7b,HFv1 TruthfulQA,43.44,,hf_open_llm_v1_240829_frozen.csv hercules_2_5_mistral_7b,HFv1 Winogrande,79.72,,hf_open_llm_v1_240829_frozen.csv hercules_3_0_mistral_7b,HF OpenLLM v1,62.36,,hf_open_llm_v1_240829_frozen.csv hercules_3_0_mistral_7b,HFv1 ARC,61.26,,hf_open_llm_v1_240829_frozen.csv hercules_3_0_mistral_7b,HFv1 GSM8K,42.91,,hf_open_llm_v1_240829_frozen.csv hercules_3_0_mistral_7b,HFv1 HellaSwag,83.43,,hf_open_llm_v1_240829_frozen.csv hercules_3_0_mistral_7b,HFv1 MMLU,63.68,,hf_open_llm_v1_240829_frozen.csv hercules_3_0_mistral_7b,HFv1 TruthfulQA,43.42,,hf_open_llm_v1_240829_frozen.csv hercules_3_0_mistral_7b,HFv1 Winogrande,79.48,,hf_open_llm_v1_240829_frozen.csv hercules_3_1_mistral_7b,HF OpenLLM v1,62.09,,hf_open_llm_v1_240829_frozen.csv hercules_3_1_mistral_7b,HFv1 ARC,61.18,,hf_open_llm_v1_240829_frozen.csv hercules_3_1_mistral_7b,HFv1 GSM8K,42.3,,hf_open_llm_v1_240829_frozen.csv hercules_3_1_mistral_7b,HFv1 HellaSwag,83.55,,hf_open_llm_v1_240829_frozen.csv hercules_3_1_mistral_7b,HFv1 MMLU,63.65,,hf_open_llm_v1_240829_frozen.csv hercules_3_1_mistral_7b,HFv1 TruthfulQA,42.83,,hf_open_llm_v1_240829_frozen.csv hercules_3_1_mistral_7b,HFv1 Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv hercules_4_0_mistral_v0_2_7b,HF OpenLLM v1,61.53,,hf_open_llm_v1_240829_frozen.csv hercules_4_0_mistral_v0_2_7b,HFv1 ARC,58.96,,hf_open_llm_v1_240829_frozen.csv hercules_4_0_mistral_v0_2_7b,HFv1 GSM8K,45.41,,hf_open_llm_v1_240829_frozen.csv hercules_4_0_mistral_v0_2_7b,HFv1 
HellaSwag,82.6,,hf_open_llm_v1_240829_frozen.csv hercules_4_0_mistral_v0_2_7b,HFv1 MMLU,62.66,,hf_open_llm_v1_240829_frozen.csv hercules_4_0_mistral_v0_2_7b,HFv1 TruthfulQA,40.99,,hf_open_llm_v1_240829_frozen.csv hercules_4_0_mistral_v0_2_7b,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv hercules_4_0_yi_34b,HF OpenLLM v1,70.85,,hf_open_llm_v1_240829_frozen.csv hercules_4_0_yi_34b,HFv1 ARC,64.51,,hf_open_llm_v1_240829_frozen.csv hercules_4_0_yi_34b,HFv1 GSM8K,63.0,,hf_open_llm_v1_240829_frozen.csv hercules_4_0_yi_34b,HFv1 HellaSwag,85.22,,hf_open_llm_v1_240829_frozen.csv hercules_4_0_yi_34b,HFv1 MMLU,75.2,,hf_open_llm_v1_240829_frozen.csv hercules_4_0_yi_34b,HFv1 TruthfulQA,53.05,,hf_open_llm_v1_240829_frozen.csv hercules_4_0_yi_34b,HFv1 Winogrande,84.14,,hf_open_llm_v1_240829_frozen.csv hercules_mini_1_8b,HF OpenLLM v1,45.57,,hf_open_llm_v1_240829_frozen.csv hercules_mini_1_8b,HFv1 ARC,37.03,,hf_open_llm_v1_240829_frozen.csv hercules_mini_1_8b,HFv1 GSM8K,30.55,,hf_open_llm_v1_240829_frozen.csv hercules_mini_1_8b,HFv1 HellaSwag,59.53,,hf_open_llm_v1_240829_frozen.csv hercules_mini_1_8b,HFv1 MMLU,44.77,,hf_open_llm_v1_240829_frozen.csv hercules_mini_1_8b,HFv1 TruthfulQA,39.24,,hf_open_llm_v1_240829_frozen.csv hercules_mini_1_8b,HFv1 Winogrande,62.27,,hf_open_llm_v1_240829_frozen.csv hercules_qwen1_5_14b,HF OpenLLM v1,66.24,,hf_open_llm_v1_240829_frozen.csv hercules_qwen1_5_14b,HFv1 ARC,56.23,,hf_open_llm_v1_240829_frozen.csv hercules_qwen1_5_14b,HFv1 GSM8K,65.96,,hf_open_llm_v1_240829_frozen.csv hercules_qwen1_5_14b,HFv1 HellaSwag,80.6,,hf_open_llm_v1_240829_frozen.csv hercules_qwen1_5_14b,HFv1 MMLU,68.73,,hf_open_llm_v1_240829_frozen.csv hercules_qwen1_5_14b,HFv1 TruthfulQA,52.03,,hf_open_llm_v1_240829_frozen.csv hercules_qwen1_5_14b,HFv1 Winogrande,73.88,,hf_open_llm_v1_240829_frozen.csv hermes_2_pro_mistral_7b,HF OpenLLM v1,67.35,,hf_open_llm_v1_240829_frozen.csv hermes_2_pro_mistral_7b,HFv1 ARC,64.16,,hf_open_llm_v1_240829_frozen.csv hermes_2_pro_mistral_7b,HFv1 GSM8K,60.42,,hf_open_llm_v1_240829_frozen.csv hermes_2_pro_mistral_7b,HFv1 HellaSwag,82.73,,hf_open_llm_v1_240829_frozen.csv hermes_2_pro_mistral_7b,HFv1 MMLU,62.21,,hf_open_llm_v1_240829_frozen.csv hermes_2_pro_mistral_7b,HFv1 TruthfulQA,58.99,,hf_open_llm_v1_240829_frozen.csv hermes_2_pro_mistral_7b,HFv1 Winogrande,75.61,,hf_open_llm_v1_240829_frozen.csv hermesstar_orcawind_synth_11b,HF OpenLLM v1,66.59,,hf_open_llm_v1_240829_frozen.csv hermesstar_orcawind_synth_11b,HFv1 ARC,65.27,,hf_open_llm_v1_240829_frozen.csv hermesstar_orcawind_synth_11b,HFv1 GSM8K,56.63,,hf_open_llm_v1_240829_frozen.csv hermesstar_orcawind_synth_11b,HFv1 HellaSwag,83.69,,hf_open_llm_v1_240829_frozen.csv hermesstar_orcawind_synth_11b,HFv1 MMLU,65.31,,hf_open_llm_v1_240829_frozen.csv hermesstar_orcawind_synth_11b,HFv1 TruthfulQA,48.55,,hf_open_llm_v1_240829_frozen.csv hermesstar_orcawind_synth_11b,HFv1 Winogrande,80.11,,hf_open_llm_v1_240829_frozen.csv hf_checkpoint2_01052024,HF OpenLLM v1,30.4,,hf_open_llm_v1_240829_frozen.csv hf_checkpoint2_01052024,HFv1 ARC,24.91,,hf_open_llm_v1_240829_frozen.csv hf_checkpoint2_01052024,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv hf_checkpoint2_01052024,HFv1 HellaSwag,32.23,,hf_open_llm_v1_240829_frozen.csv hf_checkpoint2_01052024,HFv1 MMLU,27.01,,hf_open_llm_v1_240829_frozen.csv hf_checkpoint2_01052024,HFv1 TruthfulQA,45.77,,hf_open_llm_v1_240829_frozen.csv hf_checkpoint2_01052024,HFv1 Winogrande,51.93,,hf_open_llm_v1_240829_frozen.csv hope_for,HF OpenLLM v1,51.3,,hf_open_llm_v1_240829_frozen.csv 
hope_for,HFv1 ARC,51.28,,hf_open_llm_v1_240829_frozen.csv hope_for,HFv1 GSM8K,16.91,,hf_open_llm_v1_240829_frozen.csv hope_for,HFv1 HellaSwag,74.74,,hf_open_llm_v1_240829_frozen.csv hope_for,HFv1 MMLU,51.56,,hf_open_llm_v1_240829_frozen.csv hope_for,HFv1 TruthfulQA,40.73,,hf_open_llm_v1_240829_frozen.csv hope_for,HFv1 Winogrande,72.61,,hf_open_llm_v1_240829_frozen.csv hope_for_7b_1_0v,HF OpenLLM v1,51.16,,hf_open_llm_v1_240829_frozen.csv hope_for_7b_1_0v,HFv1 ARC,50.43,,hf_open_llm_v1_240829_frozen.csv hope_for_7b_1_0v,HFv1 GSM8K,16.53,,hf_open_llm_v1_240829_frozen.csv hope_for_7b_1_0v,HFv1 HellaSwag,76.44,,hf_open_llm_v1_240829_frozen.csv hope_for_7b_1_0v,HFv1 MMLU,49.68,,hf_open_llm_v1_240829_frozen.csv hope_for_7b_1_0v,HFv1 TruthfulQA,38.66,,hf_open_llm_v1_240829_frozen.csv hope_for_7b_1_0v,HFv1 Winogrande,75.22,,hf_open_llm_v1_240829_frozen.csv hope_for_7b_1_1v,HF OpenLLM v1,50.19,,hf_open_llm_v1_240829_frozen.csv hope_for_7b_1_1v,HFv1 ARC,49.49,,hf_open_llm_v1_240829_frozen.csv hope_for_7b_1_1v,HFv1 GSM8K,14.18,,hf_open_llm_v1_240829_frozen.csv hope_for_7b_1_1v,HFv1 HellaSwag,75.08,,hf_open_llm_v1_240829_frozen.csv hope_for_7b_1_1v,HFv1 MMLU,48.49,,hf_open_llm_v1_240829_frozen.csv hope_for_7b_1_1v,HFv1 TruthfulQA,40.26,,hf_open_llm_v1_240829_frozen.csv hope_for_7b_1_1v,HFv1 Winogrande,73.64,,hf_open_llm_v1_240829_frozen.csv huginn_13b_v4,HF OpenLLM v1,54.04,,hf_open_llm_v1_240829_frozen.csv huginn_13b_v4,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv huginn_13b_v4,HFv1 GSM8K,4.62,,hf_open_llm_v1_240829_frozen.csv huginn_13b_v4,HFv1 HellaSwag,82.34,,hf_open_llm_v1_240829_frozen.csv huginn_13b_v4,HFv1 MMLU,52.32,,hf_open_llm_v1_240829_frozen.csv huginn_13b_v4,HFv1 TruthfulQA,50.62,,hf_open_llm_v1_240829_frozen.csv huginn_13b_v4,HFv1 Winogrande,73.64,,hf_open_llm_v1_240829_frozen.csv huginn_13b_v4_5,HF OpenLLM v1,54.04,,hf_open_llm_v1_240829_frozen.csv huginn_13b_v4_5,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv huginn_13b_v4_5,HFv1 GSM8K,4.62,,hf_open_llm_v1_240829_frozen.csv huginn_13b_v4_5,HFv1 HellaSwag,82.34,,hf_open_llm_v1_240829_frozen.csv huginn_13b_v4_5,HFv1 MMLU,52.32,,hf_open_llm_v1_240829_frozen.csv huginn_13b_v4_5,HFv1 TruthfulQA,50.62,,hf_open_llm_v1_240829_frozen.csv huginn_13b_v4_5,HFv1 Winogrande,73.64,,hf_open_llm_v1_240829_frozen.csv huginn_19b_prototype,HF OpenLLM v1,52.99,,hf_open_llm_v1_240829_frozen.csv huginn_19b_prototype,HFv1 ARC,59.22,,hf_open_llm_v1_240829_frozen.csv huginn_19b_prototype,HFv1 GSM8K,4.4,,hf_open_llm_v1_240829_frozen.csv huginn_19b_prototype,HFv1 HellaSwag,81.03,,hf_open_llm_v1_240829_frozen.csv huginn_19b_prototype,HFv1 MMLU,55.73,,hf_open_llm_v1_240829_frozen.csv huginn_19b_prototype,HFv1 TruthfulQA,41.15,,hf_open_llm_v1_240829_frozen.csv huginn_19b_prototype,HFv1 Winogrande,76.4,,hf_open_llm_v1_240829_frozen.csv huginn_22b_prototype,HF OpenLLM v1,52.36,,hf_open_llm_v1_240829_frozen.csv huginn_22b_prototype,HFv1 ARC,57.68,,hf_open_llm_v1_240829_frozen.csv huginn_22b_prototype,HFv1 GSM8K,2.27,,hf_open_llm_v1_240829_frozen.csv huginn_22b_prototype,HFv1 HellaSwag,80.69,,hf_open_llm_v1_240829_frozen.csv huginn_22b_prototype,HFv1 MMLU,49.81,,hf_open_llm_v1_240829_frozen.csv huginn_22b_prototype,HFv1 TruthfulQA,52.11,,hf_open_llm_v1_240829_frozen.csv huginn_22b_prototype,HFv1 Winogrande,71.59,,hf_open_llm_v1_240829_frozen.csv huginn_v3_13b,HF OpenLLM v1,54.04,,hf_open_llm_v1_240829_frozen.csv huginn_v3_13b,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv huginn_v3_13b,HFv1 GSM8K,4.62,,hf_open_llm_v1_240829_frozen.csv huginn_v3_13b,HFv1 
HellaSwag,82.34,,hf_open_llm_v1_240829_frozen.csv huginn_v3_13b,HFv1 MMLU,52.32,,hf_open_llm_v1_240829_frozen.csv huginn_v3_13b,HFv1 TruthfulQA,50.62,,hf_open_llm_v1_240829_frozen.csv huginn_v3_13b,HFv1 Winogrande,73.64,,hf_open_llm_v1_240829_frozen.csv huginnv1_2,HF OpenLLM v1,55.98,,hf_open_llm_v1_240829_frozen.csv huginnv1_2,HFv1 ARC,62.37,,hf_open_llm_v1_240829_frozen.csv huginnv1_2,HFv1 GSM8K,9.17,,hf_open_llm_v1_240829_frozen.csv huginnv1_2,HFv1 HellaSwag,84.28,,hf_open_llm_v1_240829_frozen.csv huginnv1_2,HFv1 MMLU,57.02,,hf_open_llm_v1_240829_frozen.csv huginnv1_2,HFv1 TruthfulQA,47.81,,hf_open_llm_v1_240829_frozen.csv huginnv1_2,HFv1 Winogrande,75.22,,hf_open_llm_v1_240829_frozen.csv hyperion_1_5_mistral_7b,HF OpenLLM v1,61.43,,hf_open_llm_v1_240829_frozen.csv hyperion_1_5_mistral_7b,HFv1 ARC,60.49,,hf_open_llm_v1_240829_frozen.csv hyperion_1_5_mistral_7b,HFv1 GSM8K,40.49,,hf_open_llm_v1_240829_frozen.csv hyperion_1_5_mistral_7b,HFv1 HellaSwag,83.64,,hf_open_llm_v1_240829_frozen.csv hyperion_1_5_mistral_7b,HFv1 MMLU,63.57,,hf_open_llm_v1_240829_frozen.csv hyperion_1_5_mistral_7b,HFv1 TruthfulQA,41.78,,hf_open_llm_v1_240829_frozen.csv hyperion_1_5_mistral_7b,HFv1 Winogrande,78.61,,hf_open_llm_v1_240829_frozen.csv hyperion_2_0_mistral_7b,HF OpenLLM v1,61.88,,hf_open_llm_v1_240829_frozen.csv hyperion_2_0_mistral_7b,HFv1 ARC,61.09,,hf_open_llm_v1_240829_frozen.csv hyperion_2_0_mistral_7b,HFv1 GSM8K,41.77,,hf_open_llm_v1_240829_frozen.csv hyperion_2_0_mistral_7b,HFv1 HellaSwag,83.5,,hf_open_llm_v1_240829_frozen.csv hyperion_2_0_mistral_7b,HFv1 MMLU,63.68,,hf_open_llm_v1_240829_frozen.csv hyperion_2_0_mistral_7b,HFv1 TruthfulQA,41.97,,hf_open_llm_v1_240829_frozen.csv hyperion_2_0_mistral_7b,HFv1 Winogrande,79.24,,hf_open_llm_v1_240829_frozen.csv hyperion_2_0_yi_34b,HF OpenLLM v1,71.09,,hf_open_llm_v1_240829_frozen.csv hyperion_2_0_yi_34b,HFv1 ARC,64.33,,hf_open_llm_v1_240829_frozen.csv hyperion_2_0_yi_34b,HFv1 GSM8K,62.02,,hf_open_llm_v1_240829_frozen.csv hyperion_2_0_yi_34b,HFv1 HellaSwag,85.66,,hf_open_llm_v1_240829_frozen.csv hyperion_2_0_yi_34b,HFv1 MMLU,76.09,,hf_open_llm_v1_240829_frozen.csv hyperion_2_0_yi_34b,HFv1 TruthfulQA,55.3,,hf_open_llm_v1_240829_frozen.csv hyperion_2_0_yi_34b,HFv1 Winogrande,83.11,,hf_open_llm_v1_240829_frozen.csv hyperion_2_1_mistral_7b,HF OpenLLM v1,61.9,,hf_open_llm_v1_240829_frozen.csv hyperion_2_1_mistral_7b,HFv1 ARC,59.9,,hf_open_llm_v1_240829_frozen.csv hyperion_2_1_mistral_7b,HFv1 GSM8K,40.18,,hf_open_llm_v1_240829_frozen.csv hyperion_2_1_mistral_7b,HFv1 HellaSwag,83.3,,hf_open_llm_v1_240829_frozen.csv hyperion_2_1_mistral_7b,HFv1 MMLU,61.46,,hf_open_llm_v1_240829_frozen.csv hyperion_2_1_mistral_7b,HFv1 TruthfulQA,47.58,,hf_open_llm_v1_240829_frozen.csv hyperion_2_1_mistral_7b,HFv1 Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_mistral_7b_alpha,HF OpenLLM v1,61.52,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_mistral_7b_alpha,HFv1 ARC,59.98,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_mistral_7b_alpha,HFv1 GSM8K,41.55,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_mistral_7b_alpha,HFv1 HellaSwag,83.48,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_mistral_7b_alpha,HFv1 MMLU,62.5,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_mistral_7b_alpha,HFv1 TruthfulQA,42.82,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_mistral_7b_alpha,HFv1 Winogrande,78.77,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_mistral_7b_dpo,HF OpenLLM v1,63.03,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_mistral_7b_dpo,HFv1 
ARC,60.67,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_mistral_7b_dpo,HFv1 GSM8K,45.34,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_mistral_7b_dpo,HFv1 HellaSwag,83.95,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_mistral_7b_dpo,HFv1 MMLU,62.71,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_mistral_7b_dpo,HFv1 TruthfulQA,46.17,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_mistral_7b_dpo,HFv1 Winogrande,79.32,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_mixtral_3x7b,HF OpenLLM v1,61.84,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_mixtral_3x7b,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_mixtral_3x7b,HFv1 GSM8K,41.39,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_mixtral_3x7b,HFv1 HellaSwag,83.28,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_mixtral_3x7b,HFv1 MMLU,63.22,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_mixtral_3x7b,HFv1 TruthfulQA,43.46,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_mixtral_3x7b,HFv1 Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_yi_34b,HF OpenLLM v1,71.18,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_yi_34b,HFv1 ARC,64.59,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_yi_34b,HFv1 GSM8K,61.03,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_yi_34b,HFv1 HellaSwag,85.61,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_yi_34b,HFv1 MMLU,75.98,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_yi_34b,HFv1 TruthfulQA,56.38,,hf_open_llm_v1_240829_frozen.csv hyperion_3_0_yi_34b,HFv1 Winogrande,83.5,,hf_open_llm_v1_240829_frozen.csv hyperion_medium_preview,HF OpenLLM v1,61.67,,hf_open_llm_v1_240829_frozen.csv hyperion_medium_preview,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv hyperion_medium_preview,HFv1 GSM8K,40.49,,hf_open_llm_v1_240829_frozen.csv hyperion_medium_preview,HFv1 HellaSwag,83.67,,hf_open_llm_v1_240829_frozen.csv hyperion_medium_preview,HFv1 MMLU,63.73,,hf_open_llm_v1_240829_frozen.csv hyperion_medium_preview,HFv1 TruthfulQA,42.93,,hf_open_llm_v1_240829_frozen.csv hyperion_medium_preview,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv iambe_20b_dare_v2,HF OpenLLM v1,61.99,,hf_open_llm_v1_240829_frozen.csv iambe_20b_dare_v2,HFv1 ARC,62.8,,hf_open_llm_v1_240829_frozen.csv iambe_20b_dare_v2,HFv1 GSM8K,33.28,,hf_open_llm_v1_240829_frozen.csv iambe_20b_dare_v2,HFv1 HellaSwag,84.53,,hf_open_llm_v1_240829_frozen.csv iambe_20b_dare_v2,HFv1 MMLU,60.45,,hf_open_llm_v1_240829_frozen.csv iambe_20b_dare_v2,HFv1 TruthfulQA,53.85,,hf_open_llm_v1_240829_frozen.csv iambe_20b_dare_v2,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv iamsotired_7b_slerp,HF OpenLLM v1,72.37,,hf_open_llm_v1_240829_frozen.csv iamsotired_7b_slerp,HFv1 ARC,69.88,,hf_open_llm_v1_240829_frozen.csv iamsotired_7b_slerp,HFv1 GSM8K,66.19,,hf_open_llm_v1_240829_frozen.csv iamsotired_7b_slerp,HFv1 HellaSwag,87.15,,hf_open_llm_v1_240829_frozen.csv iamsotired_7b_slerp,HFv1 MMLU,64.85,,hf_open_llm_v1_240829_frozen.csv iamsotired_7b_slerp,HFv1 TruthfulQA,63.75,,hf_open_llm_v1_240829_frozen.csv iamsotired_7b_slerp,HFv1 Winogrande,82.4,,hf_open_llm_v1_240829_frozen.csv ice_grt,HF OpenLLM v1,61.39,,hf_open_llm_v1_240829_frozen.csv ice_grt,HFv1 ARC,62.88,,hf_open_llm_v1_240829_frozen.csv ice_grt,HFv1 GSM8K,31.69,,hf_open_llm_v1_240829_frozen.csv ice_grt,HFv1 HellaSwag,86.14,,hf_open_llm_v1_240829_frozen.csv ice_grt,HFv1 MMLU,57.34,,hf_open_llm_v1_240829_frozen.csv ice_grt,HFv1 TruthfulQA,53.17,,hf_open_llm_v1_240829_frozen.csv ice_grt,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv idus,HF OpenLLM v1,29.51,,hf_open_llm_v1_240829_frozen.csv 
idus,HFv1 ARC,27.73,,hf_open_llm_v1_240829_frozen.csv idus,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv idus,HFv1 HellaSwag,26.65,,hf_open_llm_v1_240829_frozen.csv idus,HFv1 MMLU,24.91,,hf_open_llm_v1_240829_frozen.csv idus,HFv1 TruthfulQA,48.58,,hf_open_llm_v1_240829_frozen.csv idus,HFv1 Winogrande,49.17,,hf_open_llm_v1_240829_frozen.csv idus_8layers,HF OpenLLM v1,58.38,,hf_open_llm_v1_240829_frozen.csv idus_8layers,HFv1 ARC,59.3,,hf_open_llm_v1_240829_frozen.csv idus_8layers,HFv1 GSM8K,29.57,,hf_open_llm_v1_240829_frozen.csv idus_8layers,HFv1 HellaSwag,81.34,,hf_open_llm_v1_240829_frozen.csv idus_8layers,HFv1 MMLU,63.22,,hf_open_llm_v1_240829_frozen.csv idus_8layers,HFv1 TruthfulQA,40.62,,hf_open_llm_v1_240829_frozen.csv idus_8layers,HFv1 Winogrande,76.24,,hf_open_llm_v1_240829_frozen.csv ignis_7b_dpo,HF OpenLLM v1,64.77,,hf_open_llm_v1_240829_frozen.csv ignis_7b_dpo,HFv1 ARC,66.3,,hf_open_llm_v1_240829_frozen.csv ignis_7b_dpo,HFv1 GSM8K,33.06,,hf_open_llm_v1_240829_frozen.csv ignis_7b_dpo,HFv1 HellaSwag,84.85,,hf_open_llm_v1_240829_frozen.csv ignis_7b_dpo,HFv1 MMLU,58.99,,hf_open_llm_v1_240829_frozen.csv ignis_7b_dpo,HFv1 TruthfulQA,65.46,,hf_open_llm_v1_240829_frozen.csv ignis_7b_dpo,HFv1 Winogrande,79.95,,hf_open_llm_v1_240829_frozen.csv ignis_7b_dpo_laser,HF OpenLLM v1,64.41,,hf_open_llm_v1_240829_frozen.csv ignis_7b_dpo_laser,HFv1 ARC,65.19,,hf_open_llm_v1_240829_frozen.csv ignis_7b_dpo_laser,HFv1 GSM8K,31.46,,hf_open_llm_v1_240829_frozen.csv ignis_7b_dpo_laser,HFv1 HellaSwag,84.57,,hf_open_llm_v1_240829_frozen.csv ignis_7b_dpo_laser,HFv1 MMLU,58.56,,hf_open_llm_v1_240829_frozen.csv ignis_7b_dpo_laser,HFv1 TruthfulQA,66.24,,hf_open_llm_v1_240829_frozen.csv ignis_7b_dpo_laser,HFv1 Winogrande,80.43,,hf_open_llm_v1_240829_frozen.csv inex12_7b,HF OpenLLM v1,76.66,,hf_open_llm_v1_240829_frozen.csv inex12_7b,HFv1 ARC,72.95,,hf_open_llm_v1_240829_frozen.csv inex12_7b,HFv1 GSM8K,70.2,,hf_open_llm_v1_240829_frozen.csv inex12_7b,HFv1 HellaSwag,89.14,,hf_open_llm_v1_240829_frozen.csv inex12_7b,HFv1 MMLU,64.4,,hf_open_llm_v1_240829_frozen.csv inex12_7b,HFv1 TruthfulQA,78.04,,hf_open_llm_v1_240829_frozen.csv inex12_7b,HFv1 Winogrande,85.24,,hf_open_llm_v1_240829_frozen.csv inex8_7b,HF OpenLLM v1,76.44,,hf_open_llm_v1_240829_frozen.csv inex8_7b,HFv1 ARC,73.29,,hf_open_llm_v1_240829_frozen.csv inex8_7b,HFv1 GSM8K,68.99,,hf_open_llm_v1_240829_frozen.csv inex8_7b,HFv1 HellaSwag,89.19,,hf_open_llm_v1_240829_frozen.csv inex8_7b,HFv1 MMLU,64.47,,hf_open_llm_v1_240829_frozen.csv inex8_7b,HFv1 TruthfulQA,77.83,,hf_open_llm_v1_240829_frozen.csv inex8_7b,HFv1 Winogrande,84.85,,hf_open_llm_v1_240829_frozen.csv init_model,HF OpenLLM v1,29.6,,hf_open_llm_v1_240829_frozen.csv init_model,HFv1 ARC,28.5,,hf_open_llm_v1_240829_frozen.csv init_model,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv init_model,HFv1 HellaSwag,25.4,,hf_open_llm_v1_240829_frozen.csv init_model,HFv1 MMLU,25.65,,hf_open_llm_v1_240829_frozen.csv init_model,HFv1 TruthfulQA,48.48,,hf_open_llm_v1_240829_frozen.csv init_model,HFv1 Winogrande,49.57,,hf_open_llm_v1_240829_frozen.csv instructpalmyra_20b,HF OpenLLM v1,42.91,,hf_open_llm_v1_240829_frozen.csv instructpalmyra_20b,HFv1 ARC,47.1,,hf_open_llm_v1_240829_frozen.csv instructpalmyra_20b,HFv1 GSM8K,2.58,,hf_open_llm_v1_240829_frozen.csv instructpalmyra_20b,HFv1 HellaSwag,73.0,,hf_open_llm_v1_240829_frozen.csv instructpalmyra_20b,HFv1 MMLU,28.26,,hf_open_llm_v1_240829_frozen.csv instructpalmyra_20b,HFv1 TruthfulQA,41.81,,hf_open_llm_v1_240829_frozen.csv instructpalmyra_20b,HFv1 
Winogrande,64.72,,hf_open_llm_v1_240829_frozen.csv internlm2_20b,HF OpenLLM v1,69.75,,hf_open_llm_v1_240829_frozen.csv internlm2_20b,HFv1 ARC,62.97,,hf_open_llm_v1_240829_frozen.csv internlm2_20b,HFv1 GSM8K,67.93,,hf_open_llm_v1_240829_frozen.csv internlm2_20b,HFv1 HellaSwag,83.21,,hf_open_llm_v1_240829_frozen.csv internlm2_20b,HFv1 MMLU,67.58,,hf_open_llm_v1_240829_frozen.csv internlm2_20b,HFv1 TruthfulQA,51.27,,hf_open_llm_v1_240829_frozen.csv internlm2_20b,HFv1 Winogrande,85.56,,hf_open_llm_v1_240829_frozen.csv internlm2_20b_llama,HF OpenLLM v1,70.66,,hf_open_llm_v1_240829_frozen.csv internlm2_20b_llama,HFv1 ARC,64.68,,hf_open_llm_v1_240829_frozen.csv internlm2_20b_llama,HFv1 GSM8K,70.66,,hf_open_llm_v1_240829_frozen.csv internlm2_20b_llama,HFv1 HellaSwag,83.16,,hf_open_llm_v1_240829_frozen.csv internlm2_20b_llama,HFv1 MMLU,67.27,,hf_open_llm_v1_240829_frozen.csv internlm2_20b_llama,HFv1 TruthfulQA,54.17,,hf_open_llm_v1_240829_frozen.csv internlm2_20b_llama,HFv1 Winogrande,84.29,,hf_open_llm_v1_240829_frozen.csv internlm2_base_20b_llama,HF OpenLLM v1,62.69,,hf_open_llm_v1_240829_frozen.csv internlm2_base_20b_llama,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv internlm2_base_20b_llama,HFv1 GSM8K,44.88,,hf_open_llm_v1_240829_frozen.csv internlm2_base_20b_llama,HFv1 HellaSwag,82.15,,hf_open_llm_v1_240829_frozen.csv internlm2_base_20b_llama,HFv1 MMLU,63.97,,hf_open_llm_v1_240829_frozen.csv internlm2_base_20b_llama,HFv1 TruthfulQA,44.11,,hf_open_llm_v1_240829_frozen.csv internlm2_base_20b_llama,HFv1 Winogrande,78.22,,hf_open_llm_v1_240829_frozen.csv internlm2_base_7b_llama,HF OpenLLM v1,53.62,,hf_open_llm_v1_240829_frozen.csv internlm2_base_7b_llama,HFv1 ARC,54.35,,hf_open_llm_v1_240829_frozen.csv internlm2_base_7b_llama,HFv1 GSM8K,19.18,,hf_open_llm_v1_240829_frozen.csv internlm2_base_7b_llama,HFv1 HellaSwag,79.47,,hf_open_llm_v1_240829_frozen.csv internlm2_base_7b_llama,HFv1 MMLU,54.05,,hf_open_llm_v1_240829_frozen.csv internlm2_base_7b_llama,HFv1 TruthfulQA,43.23,,hf_open_llm_v1_240829_frozen.csv internlm2_base_7b_llama,HFv1 Winogrande,71.43,,hf_open_llm_v1_240829_frozen.csv internlm2_chat_20b_llama,HF OpenLLM v1,62.56,,hf_open_llm_v1_240829_frozen.csv internlm2_chat_20b_llama,HFv1 ARC,63.65,,hf_open_llm_v1_240829_frozen.csv internlm2_chat_20b_llama,HFv1 GSM8K,33.97,,hf_open_llm_v1_240829_frozen.csv internlm2_chat_20b_llama,HFv1 HellaSwag,82.58,,hf_open_llm_v1_240829_frozen.csv internlm2_chat_20b_llama,HFv1 MMLU,66.89,,hf_open_llm_v1_240829_frozen.csv internlm2_chat_20b_llama,HFv1 TruthfulQA,48.74,,hf_open_llm_v1_240829_frozen.csv internlm2_chat_20b_llama,HFv1 Winogrande,79.56,,hf_open_llm_v1_240829_frozen.csv internlm2_chat_20b_llama_old,HF OpenLLM v1,62.53,,hf_open_llm_v1_240829_frozen.csv internlm2_chat_20b_llama_old,HFv1 ARC,63.65,,hf_open_llm_v1_240829_frozen.csv internlm2_chat_20b_llama_old,HFv1 GSM8K,34.04,,hf_open_llm_v1_240829_frozen.csv internlm2_chat_20b_llama_old,HFv1 HellaSwag,82.57,,hf_open_llm_v1_240829_frozen.csv internlm2_chat_20b_llama_old,HFv1 MMLU,66.85,,hf_open_llm_v1_240829_frozen.csv internlm2_chat_20b_llama_old,HFv1 TruthfulQA,48.75,,hf_open_llm_v1_240829_frozen.csv internlm2_chat_20b_llama_old,HFv1 Winogrande,79.32,,hf_open_llm_v1_240829_frozen.csv internlm2_chat_7b_sft_llama,HF OpenLLM v1,64.34,,hf_open_llm_v1_240829_frozen.csv internlm2_chat_7b_sft_llama,HFv1 ARC,57.17,,hf_open_llm_v1_240829_frozen.csv internlm2_chat_7b_sft_llama,HFv1 GSM8K,55.95,,hf_open_llm_v1_240829_frozen.csv internlm2_chat_7b_sft_llama,HFv1 
HellaSwag,80.16,,hf_open_llm_v1_240829_frozen.csv internlm2_chat_7b_sft_llama,HFv1 MMLU,63.92,,hf_open_llm_v1_240829_frozen.csv internlm2_chat_7b_sft_llama,HFv1 TruthfulQA,50.95,,hf_open_llm_v1_240829_frozen.csv internlm2_chat_7b_sft_llama,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv internlm_20b,HF OpenLLM v1,59.55,,hf_open_llm_v1_240829_frozen.csv internlm_20b,HFv1 ARC,60.49,,hf_open_llm_v1_240829_frozen.csv internlm_20b,HFv1 GSM8K,23.5,,hf_open_llm_v1_240829_frozen.csv internlm_20b,HFv1 HellaSwag,82.13,,hf_open_llm_v1_240829_frozen.csv internlm_20b,HFv1 MMLU,61.85,,hf_open_llm_v1_240829_frozen.csv internlm_20b,HFv1 TruthfulQA,52.61,,hf_open_llm_v1_240829_frozen.csv internlm_20b,HFv1 Winogrande,76.72,,hf_open_llm_v1_240829_frozen.csv internlm_20b_llama,HF OpenLLM v1,65.09,,hf_open_llm_v1_240829_frozen.csv internlm_20b_llama,HFv1 ARC,61.35,,hf_open_llm_v1_240829_frozen.csv internlm_20b_llama,HFv1 GSM8K,51.1,,hf_open_llm_v1_240829_frozen.csv internlm_20b_llama,HFv1 HellaSwag,82.08,,hf_open_llm_v1_240829_frozen.csv internlm_20b_llama,HFv1 MMLU,61.59,,hf_open_llm_v1_240829_frozen.csv internlm_20b_llama,HFv1 TruthfulQA,57.71,,hf_open_llm_v1_240829_frozen.csv internlm_20b_llama,HFv1 Winogrande,76.72,,hf_open_llm_v1_240829_frozen.csv internlm_20b_llamafied,HF OpenLLM v1,29.08,,hf_open_llm_v1_240829_frozen.csv internlm_20b_llamafied,HFv1 ARC,26.79,,hf_open_llm_v1_240829_frozen.csv internlm_20b_llamafied,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv internlm_20b_llamafied,HFv1 HellaSwag,26.4,,hf_open_llm_v1_240829_frozen.csv internlm_20b_llamafied,HFv1 MMLU,25.4,,hf_open_llm_v1_240829_frozen.csv internlm_20b_llamafied,HFv1 TruthfulQA,48.06,,hf_open_llm_v1_240829_frozen.csv internlm_20b_llamafied,HFv1 Winogrande,47.83,,hf_open_llm_v1_240829_frozen.csv ipo_test,HF OpenLLM v1,71.29,,hf_open_llm_v1_240829_frozen.csv ipo_test,HFv1 ARC,67.92,,hf_open_llm_v1_240829_frozen.csv ipo_test,HFv1 GSM8K,72.02,,hf_open_llm_v1_240829_frozen.csv ipo_test,HFv1 HellaSwag,85.99,,hf_open_llm_v1_240829_frozen.csv ipo_test,HFv1 MMLU,65.05,,hf_open_llm_v1_240829_frozen.csv ipo_test,HFv1 TruthfulQA,55.87,,hf_open_llm_v1_240829_frozen.csv ipo_test,HFv1 Winogrande,80.9,,hf_open_llm_v1_240829_frozen.csv iwillchangethenamelater,HF OpenLLM v1,74.56,,hf_open_llm_v1_240829_frozen.csv iwillchangethenamelater,HFv1 ARC,72.01,,hf_open_llm_v1_240829_frozen.csv iwillchangethenamelater,HFv1 GSM8K,68.54,,hf_open_llm_v1_240829_frozen.csv iwillchangethenamelater,HFv1 HellaSwag,88.23,,hf_open_llm_v1_240829_frozen.csv iwillchangethenamelater,HFv1 MMLU,64.97,,hf_open_llm_v1_240829_frozen.csv iwillchangethenamelater,HFv1 TruthfulQA,69.41,,hf_open_llm_v1_240829_frozen.csv iwillchangethenamelater,HFv1 Winogrande,84.21,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta10_7b_slerp,HF OpenLLM v1,67.99,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta10_7b_slerp,HFv1 ARC,63.48,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta10_7b_slerp,HFv1 GSM8K,61.03,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta10_7b_slerp,HFv1 HellaSwag,83.79,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta10_7b_slerp,HFv1 MMLU,63.16,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta10_7b_slerp,HFv1 TruthfulQA,56.88,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta10_7b_slerp,HFv1 Winogrande,80.11,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta11_7b_slerp,HF OpenLLM v1,68.64,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta11_7b_slerp,HFv1 ARC,64.68,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta11_7b_slerp,HFv1 
GSM8K,63.99,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta11_7b_slerp,HFv1 HellaSwag,85.01,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta11_7b_slerp,HFv1 MMLU,63.77,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta11_7b_slerp,HFv1 TruthfulQA,55.77,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta11_7b_slerp,HFv1 Winogrande,78.85,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta12_7b_slerp,HF OpenLLM v1,68.22,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta12_7b_slerp,HFv1 ARC,64.59,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta12_7b_slerp,HFv1 GSM8K,59.67,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta12_7b_slerp,HFv1 HellaSwag,83.98,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta12_7b_slerp,HFv1 MMLU,63.28,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta12_7b_slerp,HFv1 TruthfulQA,58.16,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta12_7b_slerp,HFv1 Winogrande,79.64,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta8_slerp,HF OpenLLM v1,61.56,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta8_slerp,HFv1 ARC,60.41,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta8_slerp,HFv1 GSM8K,36.09,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta8_slerp,HFv1 HellaSwag,83.66,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta8_slerp,HFv1 MMLU,62.35,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta8_slerp,HFv1 TruthfulQA,48.69,,hf_open_llm_v1_240829_frozen.csv j_o_s_i_e_3_beta8_slerp,HFv1 Winogrande,78.14,,hf_open_llm_v1_240829_frozen.csv jallabi_34b,HF OpenLLM v1,70.73,,hf_open_llm_v1_240829_frozen.csv jallabi_34b,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv jallabi_34b,HFv1 GSM8K,65.2,,hf_open_llm_v1_240829_frozen.csv jallabi_34b,HFv1 HellaSwag,83.81,,hf_open_llm_v1_240829_frozen.csv jallabi_34b,HFv1 MMLU,76.4,,hf_open_llm_v1_240829_frozen.csv jallabi_34b,HFv1 TruthfulQA,51.46,,hf_open_llm_v1_240829_frozen.csv jallabi_34b,HFv1 Winogrande,81.45,,hf_open_llm_v1_240829_frozen.csv japanese_gpt_neox_3_6b,HF OpenLLM v1,29.28,,hf_open_llm_v1_240829_frozen.csv japanese_gpt_neox_3_6b,HFv1 ARC,25.0,,hf_open_llm_v1_240829_frozen.csv japanese_gpt_neox_3_6b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv japanese_gpt_neox_3_6b,HFv1 HellaSwag,25.46,,hf_open_llm_v1_240829_frozen.csv japanese_gpt_neox_3_6b,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv japanese_gpt_neox_3_6b,HFv1 TruthfulQA,51.45,,hf_open_llm_v1_240829_frozen.csv japanese_gpt_neox_3_6b,HFv1 Winogrande,50.67,,hf_open_llm_v1_240829_frozen.csv japanese_stablelm_instruct_gamma_7b,HF OpenLLM v1,52.82,,hf_open_llm_v1_240829_frozen.csv japanese_stablelm_instruct_gamma_7b,HFv1 ARC,50.68,,hf_open_llm_v1_240829_frozen.csv japanese_stablelm_instruct_gamma_7b,HFv1 GSM8K,19.26,,hf_open_llm_v1_240829_frozen.csv japanese_stablelm_instruct_gamma_7b,HFv1 HellaSwag,78.68,,hf_open_llm_v1_240829_frozen.csv japanese_stablelm_instruct_gamma_7b,HFv1 MMLU,54.82,,hf_open_llm_v1_240829_frozen.csv japanese_stablelm_instruct_gamma_7b,HFv1 TruthfulQA,39.77,,hf_open_llm_v1_240829_frozen.csv japanese_stablelm_instruct_gamma_7b,HFv1 Winogrande,73.72,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo,HF OpenLLM v1,72.91,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo,HFv1 ARC,70.82,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo,HFv1 GSM8K,70.36,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo,HFv1 HellaSwag,87.02,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo,HFv1 MMLU,64.67,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo,HFv1 TruthfulQA,64.41,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo,HFv1 
Winogrande,80.19,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v2,HF OpenLLM v1,72.53,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v2,HFv1 ARC,69.28,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v2,HFv1 GSM8K,71.8,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v2,HFv1 HellaSwag,86.8,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v2,HFv1 MMLU,64.92,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v2,HFv1 TruthfulQA,61.64,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v2,HFv1 Winogrande,80.74,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v3_3,HF OpenLLM v1,76.12,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v3_3,HFv1 ARC,72.27,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v3_3,HFv1 GSM8K,67.85,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v3_3,HFv1 HellaSwag,88.89,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v3_3,HFv1 MMLU,64.34,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v3_3,HFv1 TruthfulQA,79.0,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v3_3,HFv1 Winogrande,84.37,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v4_1,HF OpenLLM v1,75.95,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v4_1,HFv1 ARC,72.95,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v4_1,HFv1 GSM8K,68.31,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v4_1,HFv1 HellaSwag,89.07,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v4_1,HFv1 MMLU,64.75,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v4_1,HFv1 TruthfulQA,75.92,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v4_1,HFv1 Winogrande,84.69,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v4_3,HF OpenLLM v1,76.35,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v4_3,HFv1 ARC,72.61,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v4_3,HFv1 GSM8K,69.07,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v4_3,HFv1 HellaSwag,89.09,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v4_3,HFv1 MMLU,64.29,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v4_3,HFv1 TruthfulQA,78.27,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_dpo_v4_3,HFv1 Winogrande,84.77,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_neuraldpo,HF OpenLLM v1,71.36,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_neuraldpo,HFv1 ARC,73.46,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_neuraldpo,HFv1 GSM8K,58.0,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_neuraldpo,HFv1 HellaSwag,88.16,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_neuraldpo,HFv1 MMLU,63.15,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_neuraldpo,HFv1 TruthfulQA,59.92,,hf_open_llm_v1_240829_frozen.csv jaskier_7b_neuraldpo,HFv1 Winogrande,85.48,,hf_open_llm_v1_240829_frozen.csv josie_beta_4_7b_slerp,HF OpenLLM v1,68.06,,hf_open_llm_v1_240829_frozen.csv josie_beta_4_7b_slerp,HFv1 ARC,63.57,,hf_open_llm_v1_240829_frozen.csv josie_beta_4_7b_slerp,HFv1 GSM8K,61.71,,hf_open_llm_v1_240829_frozen.csv josie_beta_4_7b_slerp,HFv1 HellaSwag,84.1,,hf_open_llm_v1_240829_frozen.csv josie_beta_4_7b_slerp,HFv1 MMLU,63.73,,hf_open_llm_v1_240829_frozen.csv josie_beta_4_7b_slerp,HFv1 TruthfulQA,55.93,,hf_open_llm_v1_240829_frozen.csv josie_beta_4_7b_slerp,HFv1 Winogrande,79.32,,hf_open_llm_v1_240829_frozen.csv justtosuffer_7b_slerp,HF OpenLLM v1,70.48,,hf_open_llm_v1_240829_frozen.csv justtosuffer_7b_slerp,HFv1 ARC,68.94,,hf_open_llm_v1_240829_frozen.csv justtosuffer_7b_slerp,HFv1 GSM8K,59.74,,hf_open_llm_v1_240829_frozen.csv justtosuffer_7b_slerp,HFv1 HellaSwag,86.79,,hf_open_llm_v1_240829_frozen.csv justtosuffer_7b_slerp,HFv1 MMLU,64.66,,hf_open_llm_v1_240829_frozen.csv justtosuffer_7b_slerp,HFv1 
TruthfulQA,62.69,,hf_open_llm_v1_240829_frozen.csv justtosuffer_7b_slerp,HFv1 Winogrande,80.03,,hf_open_llm_v1_240829_frozen.csv juud_mistral_7b,HF OpenLLM v1,61.72,,hf_open_llm_v1_240829_frozen.csv juud_mistral_7b,HFv1 ARC,66.72,,hf_open_llm_v1_240829_frozen.csv juud_mistral_7b,HFv1 GSM8K,23.12,,hf_open_llm_v1_240829_frozen.csv juud_mistral_7b,HFv1 HellaSwag,85.0,,hf_open_llm_v1_240829_frozen.csv juud_mistral_7b,HFv1 MMLU,63.38,,hf_open_llm_v1_240829_frozen.csv juud_mistral_7b,HFv1 TruthfulQA,54.12,,hf_open_llm_v1_240829_frozen.csv juud_mistral_7b,HFv1 Winogrande,77.98,,hf_open_llm_v1_240829_frozen.csv juud_mistral_7b_dpo,HF OpenLLM v1,60.89,,hf_open_llm_v1_240829_frozen.csv juud_mistral_7b_dpo,HFv1 ARC,66.81,,hf_open_llm_v1_240829_frozen.csv juud_mistral_7b_dpo,HFv1 GSM8K,18.8,,hf_open_llm_v1_240829_frozen.csv juud_mistral_7b_dpo,HFv1 HellaSwag,84.89,,hf_open_llm_v1_240829_frozen.csv juud_mistral_7b_dpo,HFv1 MMLU,63.03,,hf_open_llm_v1_240829_frozen.csv juud_mistral_7b_dpo,HFv1 TruthfulQA,53.51,,hf_open_llm_v1_240829_frozen.csv juud_mistral_7b_dpo,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv k2,HF OpenLLM v1,64.54,,hf_open_llm_v1_240829_frozen.csv k2,HFv1 ARC,63.99,,hf_open_llm_v1_240829_frozen.csv k2,HFv1 GSM8K,48.67,,hf_open_llm_v1_240829_frozen.csv k2,HFv1 HellaSwag,85.71,,hf_open_llm_v1_240829_frozen.csv k2,HFv1 MMLU,67.99,,hf_open_llm_v1_240829_frozen.csv k2,HFv1 TruthfulQA,40.77,,hf_open_llm_v1_240829_frozen.csv k2,HFv1 Winogrande,80.11,,hf_open_llm_v1_240829_frozen.csv karakuri_lm_70b_chat_v0_1,HF OpenLLM v1,62.36,,hf_open_llm_v1_240829_frozen.csv karakuri_lm_70b_chat_v0_1,HFv1 ARC,61.52,,hf_open_llm_v1_240829_frozen.csv karakuri_lm_70b_chat_v0_1,HFv1 GSM8K,40.41,,hf_open_llm_v1_240829_frozen.csv karakuri_lm_70b_chat_v0_1,HFv1 HellaSwag,83.13,,hf_open_llm_v1_240829_frozen.csv karakuri_lm_70b_chat_v0_1,HFv1 MMLU,59.35,,hf_open_llm_v1_240829_frozen.csv karakuri_lm_70b_chat_v0_1,HFv1 TruthfulQA,51.39,,hf_open_llm_v1_240829_frozen.csv karakuri_lm_70b_chat_v0_1,HFv1 Winogrande,78.37,,hf_open_llm_v1_240829_frozen.csv karen_theeditor_v2_strict_mistral_7b,HF OpenLLM v1,59.13,,hf_open_llm_v1_240829_frozen.csv karen_theeditor_v2_strict_mistral_7b,HFv1 ARC,59.56,,hf_open_llm_v1_240829_frozen.csv karen_theeditor_v2_strict_mistral_7b,HFv1 GSM8K,30.17,,hf_open_llm_v1_240829_frozen.csv karen_theeditor_v2_strict_mistral_7b,HFv1 HellaSwag,81.79,,hf_open_llm_v1_240829_frozen.csv karen_theeditor_v2_strict_mistral_7b,HFv1 MMLU,59.56,,hf_open_llm_v1_240829_frozen.csv karen_theeditor_v2_strict_mistral_7b,HFv1 TruthfulQA,49.36,,hf_open_llm_v1_240829_frozen.csv karen_theeditor_v2_strict_mistral_7b,HFv1 Winogrande,74.35,,hf_open_llm_v1_240829_frozen.csv kellemar_dpo_orca_distilled_7b_slerp,HF OpenLLM v1,73.71,,hf_open_llm_v1_240829_frozen.csv kellemar_dpo_orca_distilled_7b_slerp,HFv1 ARC,70.48,,hf_open_llm_v1_240829_frozen.csv kellemar_dpo_orca_distilled_7b_slerp,HFv1 GSM8K,72.02,,hf_open_llm_v1_240829_frozen.csv kellemar_dpo_orca_distilled_7b_slerp,HFv1 HellaSwag,87.56,,hf_open_llm_v1_240829_frozen.csv kellemar_dpo_orca_distilled_7b_slerp,HFv1 MMLU,65.33,,hf_open_llm_v1_240829_frozen.csv kellemar_dpo_orca_distilled_7b_slerp,HFv1 TruthfulQA,64.97,,hf_open_llm_v1_240829_frozen.csv kellemar_dpo_orca_distilled_7b_slerp,HFv1 Winogrande,81.93,,hf_open_llm_v1_240829_frozen.csv kellemar_krishnahercules_0_1_7b_slerp,HF OpenLLM v1,73.33,,hf_open_llm_v1_240829_frozen.csv kellemar_krishnahercules_0_1_7b_slerp,HFv1 ARC,70.22,,hf_open_llm_v1_240829_frozen.csv kellemar_krishnahercules_0_1_7b_slerp,HFv1 
GSM8K,71.57,,hf_open_llm_v1_240829_frozen.csv kellemar_krishnahercules_0_1_7b_slerp,HFv1 HellaSwag,87.29,,hf_open_llm_v1_240829_frozen.csv kellemar_krishnahercules_0_1_7b_slerp,HFv1 MMLU,65.61,,hf_open_llm_v1_240829_frozen.csv kellemar_krishnahercules_0_1_7b_slerp,HFv1 TruthfulQA,63.03,,hf_open_llm_v1_240829_frozen.csv kellemar_krishnahercules_0_1_7b_slerp,HFv1 Winogrande,82.24,,hf_open_llm_v1_240829_frozen.csv kindred_7b_slerp,HF OpenLLM v1,74.29,,hf_open_llm_v1_240829_frozen.csv kindred_7b_slerp,HFv1 ARC,71.76,,hf_open_llm_v1_240829_frozen.csv kindred_7b_slerp,HFv1 GSM8K,70.2,,hf_open_llm_v1_240829_frozen.csv kindred_7b_slerp,HFv1 HellaSwag,87.78,,hf_open_llm_v1_240829_frozen.csv kindred_7b_slerp,HFv1 MMLU,64.76,,hf_open_llm_v1_240829_frozen.csv kindred_7b_slerp,HFv1 TruthfulQA,68.12,,hf_open_llm_v1_240829_frozen.csv kindred_7b_slerp,HFv1 Winogrande,83.11,,hf_open_llm_v1_240829_frozen.csv kingnish_llama3_8b,HF OpenLLM v1,72.12,,hf_open_llm_v1_240829_frozen.csv kingnish_llama3_8b,HFv1 ARC,69.97,,hf_open_llm_v1_240829_frozen.csv kingnish_llama3_8b,HFv1 GSM8K,69.07,,hf_open_llm_v1_240829_frozen.csv kingnish_llama3_8b,HFv1 HellaSwag,85.83,,hf_open_llm_v1_240829_frozen.csv kingnish_llama3_8b,HFv1 MMLU,68.45,,hf_open_llm_v1_240829_frozen.csv kingnish_llama3_8b,HFv1 TruthfulQA,61.02,,hf_open_llm_v1_240829_frozen.csv kingnish_llama3_8b,HFv1 Winogrande,78.37,,hf_open_llm_v1_240829_frozen.csv kiqu_70b,HF OpenLLM v1,75.29,,hf_open_llm_v1_240829_frozen.csv kiqu_70b,HFv1 ARC,72.1,,hf_open_llm_v1_240829_frozen.csv kiqu_70b,HFv1 GSM8K,68.46,,hf_open_llm_v1_240829_frozen.csv kiqu_70b,HFv1 HellaSwag,87.94,,hf_open_llm_v1_240829_frozen.csv kiqu_70b,HFv1 MMLU,74.93,,hf_open_llm_v1_240829_frozen.csv kiqu_70b,HFv1 TruthfulQA,63.48,,hf_open_llm_v1_240829_frozen.csv kiqu_70b,HFv1 Winogrande,84.85,,hf_open_llm_v1_240829_frozen.csv knowledgeninja_litellama_460mx6moe_1t,HF OpenLLM v1,30.23,,hf_open_llm_v1_240829_frozen.csv knowledgeninja_litellama_460mx6moe_1t,HFv1 ARC,25.17,,hf_open_llm_v1_240829_frozen.csv knowledgeninja_litellama_460mx6moe_1t,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv knowledgeninja_litellama_460mx6moe_1t,HFv1 HellaSwag,38.45,,hf_open_llm_v1_240829_frozen.csv knowledgeninja_litellama_460mx6moe_1t,HFv1 MMLU,26.16,,hf_open_llm_v1_240829_frozen.csv knowledgeninja_litellama_460mx6moe_1t,HFv1 TruthfulQA,41.57,,hf_open_llm_v1_240829_frozen.csv knowledgeninja_litellama_460mx6moe_1t,HFv1 Winogrande,50.04,,hf_open_llm_v1_240829_frozen.csv ko_wand_136m,HF OpenLLM v1,28.29,,hf_open_llm_v1_240829_frozen.csv ko_wand_136m,HFv1 ARC,21.33,,hf_open_llm_v1_240829_frozen.csv ko_wand_136m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv ko_wand_136m,HFv1 HellaSwag,25.0,,hf_open_llm_v1_240829_frozen.csv ko_wand_136m,HFv1 MMLU,23.58,,hf_open_llm_v1_240829_frozen.csv ko_wand_136m,HFv1 TruthfulQA,50.68,,hf_open_llm_v1_240829_frozen.csv ko_wand_136m,HFv1 Winogrande,49.17,,hf_open_llm_v1_240829_frozen.csv koalpaca_korwkv_6b,HF OpenLLM v1,28.57,,hf_open_llm_v1_240829_frozen.csv koalpaca_korwkv_6b,HFv1 ARC,23.46,,hf_open_llm_v1_240829_frozen.csv koalpaca_korwkv_6b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv koalpaca_korwkv_6b,HFv1 HellaSwag,31.65,,hf_open_llm_v1_240829_frozen.csv koalpaca_korwkv_6b,HFv1 MMLU,24.89,,hf_open_llm_v1_240829_frozen.csv koalpaca_korwkv_6b,HFv1 TruthfulQA,39.83,,hf_open_llm_v1_240829_frozen.csv koalpaca_korwkv_6b,HFv1 Winogrande,51.62,,hf_open_llm_v1_240829_frozen.csv kollama2_7b_v2,HF OpenLLM v1,50.66,,hf_open_llm_v1_240829_frozen.csv kollama2_7b_v2,HFv1 
ARC,53.33,,hf_open_llm_v1_240829_frozen.csv kollama2_7b_v2,HFv1 GSM8K,6.52,,hf_open_llm_v1_240829_frozen.csv kollama2_7b_v2,HFv1 HellaSwag,78.5,,hf_open_llm_v1_240829_frozen.csv kollama2_7b_v2,HFv1 MMLU,43.61,,hf_open_llm_v1_240829_frozen.csv kollama2_7b_v2,HFv1 TruthfulQA,46.37,,hf_open_llm_v1_240829_frozen.csv kollama2_7b_v2,HFv1 Winogrande,75.61,,hf_open_llm_v1_240829_frozen.csv koopenchat_sft,HF OpenLLM v1,58.61,,hf_open_llm_v1_240829_frozen.csv koopenchat_sft,HFv1 ARC,59.81,,hf_open_llm_v1_240829_frozen.csv koopenchat_sft,HFv1 GSM8K,24.18,,hf_open_llm_v1_240829_frozen.csv koopenchat_sft,HFv1 HellaSwag,78.73,,hf_open_llm_v1_240829_frozen.csv koopenchat_sft,HFv1 MMLU,61.32,,hf_open_llm_v1_240829_frozen.csv koopenchat_sft,HFv1 TruthfulQA,51.24,,hf_open_llm_v1_240829_frozen.csv koopenchat_sft,HFv1 Winogrande,76.4,,hf_open_llm_v1_240829_frozen.csv korwkv_6b,HF OpenLLM v1,28.19,,hf_open_llm_v1_240829_frozen.csv korwkv_6b,HFv1 ARC,22.1,,hf_open_llm_v1_240829_frozen.csv korwkv_6b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv korwkv_6b,HFv1 HellaSwag,32.18,,hf_open_llm_v1_240829_frozen.csv korwkv_6b,HFv1 MMLU,24.69,,hf_open_llm_v1_240829_frozen.csv korwkv_6b,HFv1 TruthfulQA,39.05,,hf_open_llm_v1_240829_frozen.csv korwkv_6b,HFv1 Winogrande,51.14,,hf_open_llm_v1_240829_frozen.csv kosolar_10_7b_v0_2,HF OpenLLM v1,64.2,,hf_open_llm_v1_240829_frozen.csv kosolar_10_7b_v0_2,HFv1 ARC,61.35,,hf_open_llm_v1_240829_frozen.csv kosolar_10_7b_v0_2,HFv1 GSM8K,47.69,,hf_open_llm_v1_240829_frozen.csv kosolar_10_7b_v0_2,HFv1 HellaSwag,82.63,,hf_open_llm_v1_240829_frozen.csv kosolar_10_7b_v0_2,HFv1 MMLU,64.85,,hf_open_llm_v1_240829_frozen.csv kosolar_10_7b_v0_2,HFv1 TruthfulQA,47.94,,hf_open_llm_v1_240829_frozen.csv kosolar_10_7b_v0_2,HFv1 Winogrande,80.74,,hf_open_llm_v1_240829_frozen.csv kosolar_10_7b_v0_2_1_3_dedup_p,HF OpenLLM v1,65.43,,hf_open_llm_v1_240829_frozen.csv kosolar_10_7b_v0_2_1_3_dedup_p,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv kosolar_10_7b_v0_2_1_3_dedup_p,HFv1 GSM8K,48.07,,hf_open_llm_v1_240829_frozen.csv kosolar_10_7b_v0_2_1_3_dedup_p,HFv1 HellaSwag,83.63,,hf_open_llm_v1_240829_frozen.csv kosolar_10_7b_v0_2_1_3_dedup_p,HFv1 MMLU,64.61,,hf_open_llm_v1_240829_frozen.csv kosolar_10_7b_v0_2_1_3_dedup_p,HFv1 TruthfulQA,52.69,,hf_open_llm_v1_240829_frozen.csv kosolar_10_7b_v0_2_1_3_dedup_p,HFv1 Winogrande,80.51,,hf_open_llm_v1_240829_frozen.csv kosolar_10_7b_v0_3,HF OpenLLM v1,64.76,,hf_open_llm_v1_240829_frozen.csv kosolar_10_7b_v0_3,HFv1 ARC,62.8,,hf_open_llm_v1_240829_frozen.csv kosolar_10_7b_v0_3,HFv1 GSM8K,50.49,,hf_open_llm_v1_240829_frozen.csv kosolar_10_7b_v0_3,HFv1 HellaSwag,83.73,,hf_open_llm_v1_240829_frozen.csv kosolar_10_7b_v0_3,HFv1 MMLU,64.51,,hf_open_llm_v1_240829_frozen.csv kosolar_10_7b_v0_3,HFv1 TruthfulQA,44.57,,hf_open_llm_v1_240829_frozen.csv kosolar_10_7b_v0_3,HFv1 Winogrande,82.48,,hf_open_llm_v1_240829_frozen.csv kunomaid_7b_slerp,HF OpenLLM v1,69.21,,hf_open_llm_v1_240829_frozen.csv kunomaid_7b_slerp,HFv1 ARC,68.0,,hf_open_llm_v1_240829_frozen.csv kunomaid_7b_slerp,HFv1 GSM8K,61.64,,hf_open_llm_v1_240829_frozen.csv kunomaid_7b_slerp,HFv1 HellaSwag,86.34,,hf_open_llm_v1_240829_frozen.csv kunomaid_7b_slerp,HFv1 MMLU,64.82,,hf_open_llm_v1_240829_frozen.csv kunomaid_7b_slerp,HFv1 TruthfulQA,55.19,,hf_open_llm_v1_240829_frozen.csv kunomaid_7b_slerp,HFv1 Winogrande,79.24,,hf_open_llm_v1_240829_frozen.csv lamini_neo_1_3b_mental_health_lora,HF OpenLLM v1,29.3,,hf_open_llm_v1_240829_frozen.csv lamini_neo_1_3b_mental_health_lora,HFv1 
ARC,25.77,,hf_open_llm_v1_240829_frozen.csv lamini_neo_1_3b_mental_health_lora,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv lamini_neo_1_3b_mental_health_lora,HFv1 HellaSwag,25.67,,hf_open_llm_v1_240829_frozen.csv lamini_neo_1_3b_mental_health_lora,HFv1 MMLU,27.0,,hf_open_llm_v1_240829_frozen.csv lamini_neo_1_3b_mental_health_lora,HFv1 TruthfulQA,48.21,,hf_open_llm_v1_240829_frozen.csv lamini_neo_1_3b_mental_health_lora,HFv1 Winogrande,49.17,,hf_open_llm_v1_240829_frozen.csv laser_dolphin_mixtral_2x7b_dpo,HF OpenLLM v1,67.16,,hf_open_llm_v1_240829_frozen.csv laser_dolphin_mixtral_2x7b_dpo,HFv1 ARC,65.96,,hf_open_llm_v1_240829_frozen.csv laser_dolphin_mixtral_2x7b_dpo,HFv1 GSM8K,48.29,,hf_open_llm_v1_240829_frozen.csv laser_dolphin_mixtral_2x7b_dpo,HFv1 HellaSwag,85.8,,hf_open_llm_v1_240829_frozen.csv laser_dolphin_mixtral_2x7b_dpo,HFv1 MMLU,63.17,,hf_open_llm_v1_240829_frozen.csv laser_dolphin_mixtral_2x7b_dpo,HFv1 TruthfulQA,60.76,,hf_open_llm_v1_240829_frozen.csv laser_dolphin_mixtral_2x7b_dpo,HFv1 Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv lemur_70b_chat_v1,HF OpenLLM v1,65.38,,hf_open_llm_v1_240829_frozen.csv lemur_70b_chat_v1,HFv1 ARC,66.98,,hf_open_llm_v1_240829_frozen.csv lemur_70b_chat_v1,HFv1 GSM8K,35.33,,hf_open_llm_v1_240829_frozen.csv lemur_70b_chat_v1,HFv1 HellaSwag,85.73,,hf_open_llm_v1_240829_frozen.csv lemur_70b_chat_v1,HFv1 MMLU,65.99,,hf_open_llm_v1_240829_frozen.csv lemur_70b_chat_v1,HFv1 TruthfulQA,56.58,,hf_open_llm_v1_240829_frozen.csv lemur_70b_chat_v1,HFv1 Winogrande,81.69,,hf_open_llm_v1_240829_frozen.csv leo_hessianai_7b_chat,HF OpenLLM v1,49.29,,hf_open_llm_v1_240829_frozen.csv leo_hessianai_7b_chat,HFv1 ARC,52.56,,hf_open_llm_v1_240829_frozen.csv leo_hessianai_7b_chat,HFv1 GSM8K,5.16,,hf_open_llm_v1_240829_frozen.csv leo_hessianai_7b_chat,HFv1 HellaSwag,77.61,,hf_open_llm_v1_240829_frozen.csv leo_hessianai_7b_chat,HFv1 MMLU,45.58,,hf_open_llm_v1_240829_frozen.csv leo_hessianai_7b_chat,HFv1 TruthfulQA,44.89,,hf_open_llm_v1_240829_frozen.csv leo_hessianai_7b_chat,HFv1 Winogrande,69.93,,hf_open_llm_v1_240829_frozen.csv leo_hessianai_7b_chat_bilingual,HF OpenLLM v1,48.72,,hf_open_llm_v1_240829_frozen.csv leo_hessianai_7b_chat_bilingual,HFv1 ARC,51.02,,hf_open_llm_v1_240829_frozen.csv leo_hessianai_7b_chat_bilingual,HFv1 GSM8K,2.73,,hf_open_llm_v1_240829_frozen.csv leo_hessianai_7b_chat_bilingual,HFv1 HellaSwag,76.03,,hf_open_llm_v1_240829_frozen.csv leo_hessianai_7b_chat_bilingual,HFv1 MMLU,44.68,,hf_open_llm_v1_240829_frozen.csv leo_hessianai_7b_chat_bilingual,HFv1 TruthfulQA,47.16,,hf_open_llm_v1_240829_frozen.csv leo_hessianai_7b_chat_bilingual,HFv1 Winogrande,70.72,,hf_open_llm_v1_240829_frozen.csv leoscorpius_7b_chat_dpo,HF OpenLLM v1,73.92,,hf_open_llm_v1_240829_frozen.csv leoscorpius_7b_chat_dpo,HFv1 ARC,70.48,,hf_open_llm_v1_240829_frozen.csv leoscorpius_7b_chat_dpo,HFv1 GSM8K,69.07,,hf_open_llm_v1_240829_frozen.csv leoscorpius_7b_chat_dpo,HFv1 HellaSwag,87.97,,hf_open_llm_v1_240829_frozen.csv leoscorpius_7b_chat_dpo,HFv1 MMLU,65.08,,hf_open_llm_v1_240829_frozen.csv leoscorpius_7b_chat_dpo,HFv1 TruthfulQA,68.83,,hf_open_llm_v1_240829_frozen.csv leoscorpius_7b_chat_dpo,HFv1 Winogrande,82.08,,hf_open_llm_v1_240829_frozen.csv lexgpt_v3,HF OpenLLM v1,69.49,,hf_open_llm_v1_240829_frozen.csv lexgpt_v3,HFv1 ARC,66.47,,hf_open_llm_v1_240829_frozen.csv lexgpt_v3,HFv1 GSM8K,61.56,,hf_open_llm_v1_240829_frozen.csv lexgpt_v3,HFv1 HellaSwag,85.91,,hf_open_llm_v1_240829_frozen.csv lexgpt_v3,HFv1 MMLU,64.48,,hf_open_llm_v1_240829_frozen.csv lexgpt_v3,HFv1 
TruthfulQA,59.98,,hf_open_llm_v1_240829_frozen.csv lexgpt_v3,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv lhk,HF OpenLLM v1,68.74,,hf_open_llm_v1_240829_frozen.csv lhk,HFv1 ARC,66.38,,hf_open_llm_v1_240829_frozen.csv lhk,HFv1 GSM8K,56.33,,hf_open_llm_v1_240829_frozen.csv lhk,HFv1 HellaSwag,84.49,,hf_open_llm_v1_240829_frozen.csv lhk,HFv1 MMLU,65.13,,hf_open_llm_v1_240829_frozen.csv lhk,HFv1 TruthfulQA,59.12,,hf_open_llm_v1_240829_frozen.csv lhk,HFv1 Winogrande,80.98,,hf_open_llm_v1_240829_frozen.csv libra_19b,HF OpenLLM v1,53.83,,hf_open_llm_v1_240829_frozen.csv libra_19b,HFv1 ARC,60.58,,hf_open_llm_v1_240829_frozen.csv libra_19b,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv libra_19b,HFv1 HellaSwag,82.04,,hf_open_llm_v1_240829_frozen.csv libra_19b,HFv1 MMLU,55.57,,hf_open_llm_v1_240829_frozen.csv libra_19b,HFv1 TruthfulQA,48.41,,hf_open_llm_v1_240829_frozen.csv libra_19b,HFv1 Winogrande,76.32,,hf_open_llm_v1_240829_frozen.csv lil_c3po,HF OpenLLM v1,68.03,,hf_open_llm_v1_240829_frozen.csv lil_c3po,HFv1 ARC,65.02,,hf_open_llm_v1_240829_frozen.csv lil_c3po,HFv1 GSM8K,48.45,,hf_open_llm_v1_240829_frozen.csv lil_c3po,HFv1 HellaSwag,84.45,,hf_open_llm_v1_240829_frozen.csv lil_c3po,HFv1 MMLU,62.36,,hf_open_llm_v1_240829_frozen.csv lil_c3po,HFv1 TruthfulQA,68.73,,hf_open_llm_v1_240829_frozen.csv lil_c3po,HFv1 Winogrande,79.16,,hf_open_llm_v1_240829_frozen.csv lima2_13b,HF OpenLLM v1,52.98,,hf_open_llm_v1_240829_frozen.csv lima2_13b,HFv1 ARC,60.24,,hf_open_llm_v1_240829_frozen.csv lima2_13b,HFv1 GSM8K,5.76,,hf_open_llm_v1_240829_frozen.csv lima2_13b,HFv1 HellaSwag,83.69,,hf_open_llm_v1_240829_frozen.csv lima2_13b,HFv1 MMLU,53.17,,hf_open_llm_v1_240829_frozen.csv lima2_13b,HFv1 TruthfulQA,41.81,,hf_open_llm_v1_240829_frozen.csv lima2_13b,HFv1 Winogrande,73.24,,hf_open_llm_v1_240829_frozen.csv lima2_7b,HF OpenLLM v1,49.27,,hf_open_llm_v1_240829_frozen.csv lima2_7b,HFv1 ARC,53.24,,hf_open_llm_v1_240829_frozen.csv lima2_7b,HFv1 GSM8K,3.87,,hf_open_llm_v1_240829_frozen.csv lima2_7b,HFv1 HellaSwag,80.6,,hf_open_llm_v1_240829_frozen.csv lima2_7b,HFv1 MMLU,43.22,,hf_open_llm_v1_240829_frozen.csv lima2_7b,HFv1 TruthfulQA,44.74,,hf_open_llm_v1_240829_frozen.csv lima2_7b,HFv1 Winogrande,69.93,,hf_open_llm_v1_240829_frozen.csv lima_unchained_70b,HF OpenLLM v1,65.51,,hf_open_llm_v1_240829_frozen.csv lima_unchained_70b,HFv1 ARC,68.26,,hf_open_llm_v1_240829_frozen.csv lima_unchained_70b,HFv1 GSM8K,34.72,,hf_open_llm_v1_240829_frozen.csv lima_unchained_70b,HFv1 HellaSwag,87.65,,hf_open_llm_v1_240829_frozen.csv lima_unchained_70b,HFv1 MMLU,70.0,,hf_open_llm_v1_240829_frozen.csv lima_unchained_70b,HFv1 TruthfulQA,48.76,,hf_open_llm_v1_240829_frozen.csv lima_unchained_70b,HFv1 Winogrande,83.66,,hf_open_llm_v1_240829_frozen.csv limarp_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,54.46,,hf_open_llm_v1_240829_frozen.csv limarp_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,60.49,,hf_open_llm_v1_240829_frozen.csv limarp_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,6.07,,hf_open_llm_v1_240829_frozen.csv limarp_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,82.76,,hf_open_llm_v1_240829_frozen.csv limarp_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,56.52,,hf_open_llm_v1_240829_frozen.csv limarp_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,44.14,,hf_open_llm_v1_240829_frozen.csv limarp_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,76.8,,hf_open_llm_v1_240829_frozen.csv liph42,HF OpenLLM v1,62.12,,hf_open_llm_v1_240829_frozen.csv liph42,HFv1 ARC,62.03,,hf_open_llm_v1_240829_frozen.csv liph42,HFv1 
GSM8K,56.94,,hf_open_llm_v1_240829_frozen.csv liph42,HFv1 HellaSwag,75.87,,hf_open_llm_v1_240829_frozen.csv liph42,HFv1 MMLU,57.37,,hf_open_llm_v1_240829_frozen.csv liph42,HFv1 TruthfulQA,45.94,,hf_open_llm_v1_240829_frozen.csv liph42,HFv1 Winogrande,74.59,,hf_open_llm_v1_240829_frozen.csv litellama_460m_1t,HF OpenLLM v1,30.16,,hf_open_llm_v1_240829_frozen.csv litellama_460m_1t,HFv1 ARC,24.83,,hf_open_llm_v1_240829_frozen.csv litellama_460m_1t,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv litellama_460m_1t,HFv1 HellaSwag,38.39,,hf_open_llm_v1_240829_frozen.csv litellama_460m_1t,HFv1 MMLU,25.96,,hf_open_llm_v1_240829_frozen.csv litellama_460m_1t,HFv1 TruthfulQA,41.59,,hf_open_llm_v1_240829_frozen.csv litellama_460m_1t,HFv1 Winogrande,50.2,,hf_open_llm_v1_240829_frozen.csv llama2_13b_chinese_v2,HF OpenLLM v1,49.58,,hf_open_llm_v1_240829_frozen.csv llama2_13b_chinese_v2,HFv1 ARC,53.92,,hf_open_llm_v1_240829_frozen.csv llama2_13b_chinese_v2,HFv1 GSM8K,2.2,,hf_open_llm_v1_240829_frozen.csv llama2_13b_chinese_v2,HFv1 HellaSwag,74.64,,hf_open_llm_v1_240829_frozen.csv llama2_13b_chinese_v2,HFv1 MMLU,49.74,,hf_open_llm_v1_240829_frozen.csv llama2_13b_chinese_v2,HFv1 TruthfulQA,45.43,,hf_open_llm_v1_240829_frozen.csv llama2_13b_chinese_v2,HFv1 Winogrande,71.59,,hf_open_llm_v1_240829_frozen.csv llama2_13b_holomax,HF OpenLLM v1,54.52,,hf_open_llm_v1_240829_frozen.csv llama2_13b_holomax,HFv1 ARC,60.49,,hf_open_llm_v1_240829_frozen.csv llama2_13b_holomax,HFv1 GSM8K,11.45,,hf_open_llm_v1_240829_frozen.csv llama2_13b_holomax,HFv1 HellaSwag,82.86,,hf_open_llm_v1_240829_frozen.csv llama2_13b_holomax,HFv1 MMLU,54.67,,hf_open_llm_v1_240829_frozen.csv llama2_13b_holomax,HFv1 TruthfulQA,42.97,,hf_open_llm_v1_240829_frozen.csv llama2_13b_holomax,HFv1 Winogrande,74.66,,hf_open_llm_v1_240829_frozen.csv llama2_13b_platypus_ckpt_1000,HF OpenLLM v1,29.28,,hf_open_llm_v1_240829_frozen.csv llama2_13b_platypus_ckpt_1000,HFv1 ARC,28.16,,hf_open_llm_v1_240829_frozen.csv llama2_13b_platypus_ckpt_1000,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv llama2_13b_platypus_ckpt_1000,HFv1 HellaSwag,26.55,,hf_open_llm_v1_240829_frozen.csv llama2_13b_platypus_ckpt_1000,HFv1 MMLU,23.17,,hf_open_llm_v1_240829_frozen.csv llama2_13b_platypus_ckpt_1000,HFv1 TruthfulQA,48.79,,hf_open_llm_v1_240829_frozen.csv llama2_13b_platypus_ckpt_1000,HFv1 Winogrande,49.01,,hf_open_llm_v1_240829_frozen.csv llama2_13b_sharegpt4_orca_openplatypus_8w,HF OpenLLM v1,55.75,,hf_open_llm_v1_240829_frozen.csv llama2_13b_sharegpt4_orca_openplatypus_8w,HFv1 ARC,62.8,,hf_open_llm_v1_240829_frozen.csv llama2_13b_sharegpt4_orca_openplatypus_8w,HFv1 GSM8K,11.75,,hf_open_llm_v1_240829_frozen.csv llama2_13b_sharegpt4_orca_openplatypus_8w,HFv1 HellaSwag,84.04,,hf_open_llm_v1_240829_frozen.csv llama2_13b_sharegpt4_orca_openplatypus_8w,HFv1 MMLU,55.13,,hf_open_llm_v1_240829_frozen.csv llama2_13b_sharegpt4_orca_openplatypus_8w,HFv1 TruthfulQA,45.66,,hf_open_llm_v1_240829_frozen.csv llama2_13b_sharegpt4_orca_openplatypus_8w,HFv1 Winogrande,75.14,,hf_open_llm_v1_240829_frozen.csv llama2_13b_sharegpt4_test,HF OpenLLM v1,55.69,,hf_open_llm_v1_240829_frozen.csv llama2_13b_sharegpt4_test,HFv1 ARC,58.02,,hf_open_llm_v1_240829_frozen.csv llama2_13b_sharegpt4_test,HFv1 GSM8K,13.12,,hf_open_llm_v1_240829_frozen.csv llama2_13b_sharegpt4_test,HFv1 HellaSwag,82.65,,hf_open_llm_v1_240829_frozen.csv llama2_13b_sharegpt4_test,HFv1 MMLU,55.99,,hf_open_llm_v1_240829_frozen.csv llama2_13b_sharegpt4_test,HFv1 TruthfulQA,48.27,,hf_open_llm_v1_240829_frozen.csv 
llama2_13b_sharegpt4_test,HFv1 Winogrande,76.09,,hf_open_llm_v1_240829_frozen.csv llama2_7b_instruction_lora,HF OpenLLM v1,51.54,,hf_open_llm_v1_240829_frozen.csv llama2_7b_instruction_lora,HFv1 ARC,55.38,,hf_open_llm_v1_240829_frozen.csv llama2_7b_instruction_lora,HFv1 GSM8K,9.86,,hf_open_llm_v1_240829_frozen.csv llama2_7b_instruction_lora,HFv1 HellaSwag,78.57,,hf_open_llm_v1_240829_frozen.csv llama2_7b_instruction_lora,HFv1 MMLU,49.39,,hf_open_llm_v1_240829_frozen.csv llama2_7b_instruction_lora,HFv1 TruthfulQA,41.83,,hf_open_llm_v1_240829_frozen.csv llama2_7b_instruction_lora,HFv1 Winogrande,74.19,,hf_open_llm_v1_240829_frozen.csv llama2_7b_openorca_mc_v1,HF OpenLLM v1,52.24,,hf_open_llm_v1_240829_frozen.csv llama2_7b_openorca_mc_v1,HFv1 ARC,55.63,,hf_open_llm_v1_240829_frozen.csv llama2_7b_openorca_mc_v1,HFv1 GSM8K,4.09,,hf_open_llm_v1_240829_frozen.csv llama2_7b_openorca_mc_v1,HFv1 HellaSwag,80.17,,hf_open_llm_v1_240829_frozen.csv llama2_7b_openorca_mc_v1,HFv1 MMLU,48.44,,hf_open_llm_v1_240829_frozen.csv llama2_7b_openorca_mc_v1,HFv1 TruthfulQA,51.62,,hf_open_llm_v1_240829_frozen.csv llama2_7b_openorca_mc_v1,HFv1 Winogrande,73.48,,hf_open_llm_v1_240829_frozen.csv llama2_7b_openorca_mc_v2_dpo,HF OpenLLM v1,52.32,,hf_open_llm_v1_240829_frozen.csv llama2_7b_openorca_mc_v2_dpo,HFv1 ARC,54.78,,hf_open_llm_v1_240829_frozen.csv llama2_7b_openorca_mc_v2_dpo,HFv1 GSM8K,4.47,,hf_open_llm_v1_240829_frozen.csv llama2_7b_openorca_mc_v2_dpo,HFv1 HellaSwag,81.48,,hf_open_llm_v1_240829_frozen.csv llama2_7b_openorca_mc_v2_dpo,HFv1 MMLU,47.2,,hf_open_llm_v1_240829_frozen.csv llama2_7b_openorca_mc_v2_dpo,HFv1 TruthfulQA,53.13,,hf_open_llm_v1_240829_frozen.csv llama2_7b_openorca_mc_v2_dpo,HFv1 Winogrande,72.85,,hf_open_llm_v1_240829_frozen.csv llama2_megamerge_dare_13b_v2,HF OpenLLM v1,57.94,,hf_open_llm_v1_240829_frozen.csv llama2_megamerge_dare_13b_v2,HFv1 ARC,59.39,,hf_open_llm_v1_240829_frozen.csv llama2_megamerge_dare_13b_v2,HFv1 GSM8K,29.26,,hf_open_llm_v1_240829_frozen.csv llama2_megamerge_dare_13b_v2,HFv1 HellaSwag,80.93,,hf_open_llm_v1_240829_frozen.csv llama2_megamerge_dare_13b_v2,HFv1 MMLU,55.26,,hf_open_llm_v1_240829_frozen.csv llama2_megamerge_dare_13b_v2,HFv1 TruthfulQA,47.27,,hf_open_llm_v1_240829_frozen.csv llama2_megamerge_dare_13b_v2,HFv1 Winogrande,75.53,,hf_open_llm_v1_240829_frozen.csv llama2_xs_460m_experimental,HF OpenLLM v1,30.17,,hf_open_llm_v1_240829_frozen.csv llama2_xs_460m_experimental,HFv1 ARC,24.91,,hf_open_llm_v1_240829_frozen.csv llama2_xs_460m_experimental,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv llama2_xs_460m_experimental,HFv1 HellaSwag,38.47,,hf_open_llm_v1_240829_frozen.csv llama2_xs_460m_experimental,HFv1 MMLU,26.17,,hf_open_llm_v1_240829_frozen.csv llama2_xs_460m_experimental,HFv1 TruthfulQA,41.59,,hf_open_llm_v1_240829_frozen.csv llama2_xs_460m_experimental,HFv1 Winogrande,49.88,,hf_open_llm_v1_240829_frozen.csv llama3,HF OpenLLM v1,37.78,,hf_open_llm_v1_240829_frozen.csv llama3,HFv1 ARC,37.71,,hf_open_llm_v1_240829_frozen.csv llama3,HFv1 GSM8K,5.23,,hf_open_llm_v1_240829_frozen.csv llama3,HFv1 HellaSwag,58.93,,hf_open_llm_v1_240829_frozen.csv llama3,HFv1 MMLU,25.33,,hf_open_llm_v1_240829_frozen.csv llama3,HFv1 TruthfulQA,42.79,,hf_open_llm_v1_240829_frozen.csv llama3,HFv1 Winogrande,56.67,,hf_open_llm_v1_240829_frozen.csv llama30b,HF OpenLLM v1,56.94,,hf_open_llm_v1_240829_frozen.csv llama30b,HFv1 ARC,61.26,,hf_open_llm_v1_240829_frozen.csv llama30b,HFv1 GSM8K,14.86,,hf_open_llm_v1_240829_frozen.csv llama30b,HFv1 
HellaSwag,84.73,,hf_open_llm_v1_240829_frozen.csv llama30b,HFv1 MMLU,58.47,,hf_open_llm_v1_240829_frozen.csv llama30b,HFv1 TruthfulQA,42.27,,hf_open_llm_v1_240829_frozen.csv llama30b,HFv1 Winogrande,80.03,,hf_open_llm_v1_240829_frozen.csv llama33b_instructed,HF OpenLLM v1,58.18,,hf_open_llm_v1_240829_frozen.csv llama33b_instructed,HFv1 ARC,64.59,,hf_open_llm_v1_240829_frozen.csv llama33b_instructed,HFv1 GSM8K,14.4,,hf_open_llm_v1_240829_frozen.csv llama33b_instructed,HFv1 HellaSwag,86.17,,hf_open_llm_v1_240829_frozen.csv llama33b_instructed,HFv1 MMLU,60.5,,hf_open_llm_v1_240829_frozen.csv llama33b_instructed,HFv1 TruthfulQA,44.12,,hf_open_llm_v1_240829_frozen.csv llama33b_instructed,HFv1 Winogrande,79.32,,hf_open_llm_v1_240829_frozen.csv llama39m,HF OpenLLM v1,28.45,,hf_open_llm_v1_240829_frozen.csv llama39m,HFv1 ARC,24.06,,hf_open_llm_v1_240829_frozen.csv llama39m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv llama39m,HFv1 HellaSwag,25.57,,hf_open_llm_v1_240829_frozen.csv llama39m,HFv1 MMLU,24.31,,hf_open_llm_v1_240829_frozen.csv llama39m,HFv1 TruthfulQA,47.19,,hf_open_llm_v1_240829_frozen.csv llama39m,HFv1 Winogrande,49.57,,hf_open_llm_v1_240829_frozen.csv llama3_13b,HF OpenLLM v1,54.61,,hf_open_llm_v1_240829_frozen.csv llama3_13b,HFv1 ARC,52.99,,hf_open_llm_v1_240829_frozen.csv llama3_13b,HFv1 GSM8K,21.91,,hf_open_llm_v1_240829_frozen.csv llama3_13b,HFv1 HellaSwag,80.66,,hf_open_llm_v1_240829_frozen.csv llama3_13b,HFv1 MMLU,62.12,,hf_open_llm_v1_240829_frozen.csv llama3_13b,HFv1 TruthfulQA,39.28,,hf_open_llm_v1_240829_frozen.csv llama3_13b,HFv1 Winogrande,70.72,,hf_open_llm_v1_240829_frozen.csv llama3_70b,HF OpenLLM v1,73.96,,hf_open_llm_v1_240829_frozen.csv llama3_70b,HFv1 ARC,68.77,,hf_open_llm_v1_240829_frozen.csv llama3_70b,HFv1 GSM8K,76.88,,hf_open_llm_v1_240829_frozen.csv llama3_70b,HFv1 HellaSwag,87.98,,hf_open_llm_v1_240829_frozen.csv llama3_70b,HFv1 MMLU,79.23,,hf_open_llm_v1_240829_frozen.csv llama3_70b,HFv1 TruthfulQA,45.56,,hf_open_llm_v1_240829_frozen.csv llama3_70b,HFv1 Winogrande,85.32,,hf_open_llm_v1_240829_frozen.csv llama3_70b_chinese_chat,HF OpenLLM v1,77.34,,hf_open_llm_v1_240829_frozen.csv llama3_70b_chinese_chat,HFv1 ARC,70.39,,hf_open_llm_v1_240829_frozen.csv llama3_70b_chinese_chat,HFv1 GSM8K,83.24,,hf_open_llm_v1_240829_frozen.csv llama3_70b_chinese_chat,HFv1 HellaSwag,85.81,,hf_open_llm_v1_240829_frozen.csv llama3_70b_chinese_chat,HFv1 MMLU,79.74,,hf_open_llm_v1_240829_frozen.csv llama3_70b_chinese_chat,HFv1 TruthfulQA,61.1,,hf_open_llm_v1_240829_frozen.csv llama3_70b_chinese_chat,HFv1 Winogrande,83.74,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct,HF OpenLLM v1,77.88,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct,HFv1 ARC,71.42,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct,HFv1 GSM8K,85.44,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct,HFv1 HellaSwag,85.69,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct,HFv1 MMLU,80.06,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct,HFv1 TruthfulQA,61.81,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct,HFv1 Winogrande,82.87,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_1,HF OpenLLM v1,78.11,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_1,HFv1 ARC,71.67,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_1,HFv1 GSM8K,86.05,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_1,HFv1 HellaSwag,85.83,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_1,HFv1 MMLU,80.12,,hf_open_llm_v1_240829_frozen.csv 
llama3_70b_instruct_dpo_v0_1,HFv1 TruthfulQA,62.11,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_1,HFv1 Winogrande,82.87,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_2,HF OpenLLM v1,78.96,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_2,HFv1 ARC,72.53,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_2,HFv1 GSM8K,88.25,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_2,HFv1 HellaSwag,86.22,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_2,HFv1 MMLU,80.41,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_2,HFv1 TruthfulQA,63.57,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_2,HFv1 Winogrande,82.79,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_3,HF OpenLLM v1,78.74,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_3,HFv1 ARC,72.35,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_3,HFv1 GSM8K,87.19,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_3,HFv1 HellaSwag,86.0,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_3,HFv1 MMLU,80.47,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_3,HFv1 TruthfulQA,63.45,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_3,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_4,HF OpenLLM v1,78.89,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_4,HFv1 ARC,72.61,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_4,HFv1 GSM8K,87.34,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_4,HFv1 HellaSwag,86.03,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_4,HFv1 MMLU,80.5,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_4,HFv1 TruthfulQA,63.26,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_dpo_v0_4,HFv1 Winogrande,83.58,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_gradient_524k,HF OpenLLM v1,73.97,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_gradient_524k,HFv1 ARC,66.81,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_gradient_524k,HFv1 GSM8K,78.85,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_gradient_524k,HFv1 HellaSwag,85.46,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_gradient_524k,HFv1 MMLU,76.37,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_gradient_524k,HFv1 TruthfulQA,53.73,,hf_open_llm_v1_240829_frozen.csv llama3_70b_instruct_gradient_524k,HFv1 Winogrande,82.64,,hf_open_llm_v1_240829_frozen.csv llama3_70b_japanese_suzume_vector_v0_1,HF OpenLLM v1,78.6,,hf_open_llm_v1_240829_frozen.csv llama3_70b_japanese_suzume_vector_v0_1,HFv1 ARC,72.35,,hf_open_llm_v1_240829_frozen.csv llama3_70b_japanese_suzume_vector_v0_1,HFv1 GSM8K,87.41,,hf_open_llm_v1_240829_frozen.csv llama3_70b_japanese_suzume_vector_v0_1,HFv1 HellaSwag,85.81,,hf_open_llm_v1_240829_frozen.csv llama3_70b_japanese_suzume_vector_v0_1,HFv1 MMLU,80.28,,hf_open_llm_v1_240829_frozen.csv llama3_70b_japanese_suzume_vector_v0_1,HFv1 TruthfulQA,62.93,,hf_open_llm_v1_240829_frozen.csv llama3_70b_japanese_suzume_vector_v0_1,HFv1 Winogrande,82.79,,hf_open_llm_v1_240829_frozen.csv llama3_70b_orpo_v0_1,HF OpenLLM v1,74.67,,hf_open_llm_v1_240829_frozen.csv llama3_70b_orpo_v0_1,HFv1 ARC,68.69,,hf_open_llm_v1_240829_frozen.csv llama3_70b_orpo_v0_1,HFv1 GSM8K,76.8,,hf_open_llm_v1_240829_frozen.csv llama3_70b_orpo_v0_1,HFv1 HellaSwag,88.01,,hf_open_llm_v1_240829_frozen.csv llama3_70b_orpo_v0_1,HFv1 MMLU,79.39,,hf_open_llm_v1_240829_frozen.csv llama3_70b_orpo_v0_1,HFv1 
TruthfulQA,49.62,,hf_open_llm_v1_240829_frozen.csv llama3_70b_orpo_v0_1,HFv1 Winogrande,85.48,,hf_open_llm_v1_240829_frozen.csv llama3_7b,HF OpenLLM v1,63.96,,hf_open_llm_v1_240829_frozen.csv llama3_7b,HFv1 ARC,60.24,,hf_open_llm_v1_240829_frozen.csv llama3_7b,HFv1 GSM8K,47.61,,hf_open_llm_v1_240829_frozen.csv llama3_7b,HFv1 HellaSwag,77.38,,hf_open_llm_v1_240829_frozen.csv llama3_7b,HFv1 MMLU,64.09,,hf_open_llm_v1_240829_frozen.csv llama3_7b,HFv1 TruthfulQA,59.56,,hf_open_llm_v1_240829_frozen.csv llama3_7b,HFv1 Winogrande,74.9,,hf_open_llm_v1_240829_frozen.csv llama3_8b,HF OpenLLM v1,62.62,,hf_open_llm_v1_240829_frozen.csv llama3_8b,HFv1 ARC,60.24,,hf_open_llm_v1_240829_frozen.csv llama3_8b,HFv1 GSM8K,45.34,,hf_open_llm_v1_240829_frozen.csv llama3_8b,HFv1 HellaSwag,82.23,,hf_open_llm_v1_240829_frozen.csv llama3_8b,HFv1 MMLU,66.7,,hf_open_llm_v1_240829_frozen.csv llama3_8b,HFv1 TruthfulQA,43.95,,hf_open_llm_v1_240829_frozen.csv llama3_8b,HFv1 Winogrande,78.45,,hf_open_llm_v1_240829_frozen.csv llama3_8b_chinese_chat,HF OpenLLM v1,67.1,,hf_open_llm_v1_240829_frozen.csv llama3_8b_chinese_chat,HFv1 ARC,61.77,,hf_open_llm_v1_240829_frozen.csv llama3_8b_chinese_chat,HFv1 GSM8K,67.17,,hf_open_llm_v1_240829_frozen.csv llama3_8b_chinese_chat,HFv1 HellaSwag,80.07,,hf_open_llm_v1_240829_frozen.csv llama3_8b_chinese_chat,HFv1 MMLU,66.97,,hf_open_llm_v1_240829_frozen.csv llama3_8b_chinese_chat,HFv1 TruthfulQA,51.41,,hf_open_llm_v1_240829_frozen.csv llama3_8b_chinese_chat,HFv1 Winogrande,75.22,,hf_open_llm_v1_240829_frozen.csv llama3_8b_chinese_chat_v2_nightly,HF OpenLLM v1,66.79,,hf_open_llm_v1_240829_frozen.csv llama3_8b_chinese_chat_v2_nightly,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv llama3_8b_chinese_chat_v2_nightly,HFv1 GSM8K,67.17,,hf_open_llm_v1_240829_frozen.csv llama3_8b_chinese_chat_v2_nightly,HFv1 HellaSwag,79.94,,hf_open_llm_v1_240829_frozen.csv llama3_8b_chinese_chat_v2_nightly,HFv1 MMLU,66.46,,hf_open_llm_v1_240829_frozen.csv llama3_8b_chinese_chat_v2_nightly,HFv1 TruthfulQA,50.89,,hf_open_llm_v1_240829_frozen.csv llama3_8b_chinese_chat_v2_nightly,HFv1 Winogrande,75.45,,hf_open_llm_v1_240829_frozen.csv llama3_8b_chinese_chat_v2_nightly_v2,HF OpenLLM v1,67.1,,hf_open_llm_v1_240829_frozen.csv llama3_8b_chinese_chat_v2_nightly_v2,HFv1 ARC,61.77,,hf_open_llm_v1_240829_frozen.csv llama3_8b_chinese_chat_v2_nightly_v2,HFv1 GSM8K,67.17,,hf_open_llm_v1_240829_frozen.csv llama3_8b_chinese_chat_v2_nightly_v2,HFv1 HellaSwag,80.07,,hf_open_llm_v1_240829_frozen.csv llama3_8b_chinese_chat_v2_nightly_v2,HFv1 MMLU,66.97,,hf_open_llm_v1_240829_frozen.csv llama3_8b_chinese_chat_v2_nightly_v2,HFv1 TruthfulQA,51.41,,hf_open_llm_v1_240829_frozen.csv llama3_8b_chinese_chat_v2_nightly_v2,HFv1 Winogrande,75.22,,hf_open_llm_v1_240829_frozen.csv llama3_8b_claudstruct_v1,HF OpenLLM v1,65.9,,hf_open_llm_v1_240829_frozen.csv llama3_8b_claudstruct_v1,HFv1 ARC,59.73,,hf_open_llm_v1_240829_frozen.csv llama3_8b_claudstruct_v1,HFv1 GSM8K,64.52,,hf_open_llm_v1_240829_frozen.csv llama3_8b_claudstruct_v1,HFv1 HellaSwag,79.94,,hf_open_llm_v1_240829_frozen.csv llama3_8b_claudstruct_v1,HFv1 MMLU,64.98,,hf_open_llm_v1_240829_frozen.csv llama3_8b_claudstruct_v1,HFv1 TruthfulQA,51.82,,hf_open_llm_v1_240829_frozen.csv llama3_8b_claudstruct_v1,HFv1 Winogrande,74.43,,hf_open_llm_v1_240829_frozen.csv llama3_8b_claudstruct_v2,HF OpenLLM v1,66.32,,hf_open_llm_v1_240829_frozen.csv llama3_8b_claudstruct_v2,HFv1 ARC,59.9,,hf_open_llm_v1_240829_frozen.csv llama3_8b_claudstruct_v2,HFv1 GSM8K,65.73,,hf_open_llm_v1_240829_frozen.csv 
llama3_8b_claudstruct_v2,HFv1 HellaSwag,80.01,,hf_open_llm_v1_240829_frozen.csv llama3_8b_claudstruct_v2,HFv1 MMLU,64.8,,hf_open_llm_v1_240829_frozen.csv llama3_8b_claudstruct_v2,HFv1 TruthfulQA,51.87,,hf_open_llm_v1_240829_frozen.csv llama3_8b_claudstruct_v2,HFv1 Winogrande,75.61,,hf_open_llm_v1_240829_frozen.csv llama3_8b_claudstruct_v3,HF OpenLLM v1,65.62,,hf_open_llm_v1_240829_frozen.csv llama3_8b_claudstruct_v3,HFv1 ARC,58.96,,hf_open_llm_v1_240829_frozen.csv llama3_8b_claudstruct_v3,HFv1 GSM8K,64.22,,hf_open_llm_v1_240829_frozen.csv llama3_8b_claudstruct_v3,HFv1 HellaSwag,80.05,,hf_open_llm_v1_240829_frozen.csv llama3_8b_claudstruct_v3,HFv1 MMLU,64.55,,hf_open_llm_v1_240829_frozen.csv llama3_8b_claudstruct_v3,HFv1 TruthfulQA,51.76,,hf_open_llm_v1_240829_frozen.csv llama3_8b_claudstruct_v3,HFv1 Winogrande,74.19,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct,HF OpenLLM v1,66.87,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct,HFv1 ARC,60.75,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct,HFv1 GSM8K,68.69,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct,HFv1 HellaSwag,78.55,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct,HFv1 MMLU,67.07,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct,HFv1 TruthfulQA,51.65,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct,HFv1 Winogrande,74.51,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_abliterated_dpomix,HF OpenLLM v1,68.11,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_abliterated_dpomix,HFv1 ARC,62.88,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_abliterated_dpomix,HFv1 GSM8K,69.75,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_abliterated_dpomix,HFv1 HellaSwag,79.52,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_abliterated_dpomix,HFv1 MMLU,67.0,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_abliterated_dpomix,HFv1 TruthfulQA,54.21,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_abliterated_dpomix,HFv1 Winogrande,75.3,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_dpo_v0_2,HF OpenLLM v1,68.36,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_dpo_v0_2,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_dpo_v0_2,HFv1 GSM8K,70.81,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_dpo_v0_2,HFv1 HellaSwag,79.5,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_dpo_v0_2,HFv1 MMLU,68.21,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_dpo_v0_2,HFv1 TruthfulQA,53.27,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_dpo_v0_2,HFv1 Winogrande,75.93,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_dpo_v0_3,HF OpenLLM v1,68.23,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_dpo_v0_3,HFv1 ARC,62.63,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_dpo_v0_3,HFv1 GSM8K,70.58,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_dpo_v0_3,HFv1 HellaSwag,79.2,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_dpo_v0_3,HFv1 MMLU,68.33,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_dpo_v0_3,HFv1 TruthfulQA,53.29,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_dpo_v0_3,HFv1 Winogrande,75.37,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_dpo_v0_4,HF OpenLLM v1,68.49,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_dpo_v0_4,HFv1 ARC,62.54,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_dpo_v0_4,HFv1 GSM8K,71.04,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_dpo_v0_4,HFv1 HellaSwag,79.73,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_dpo_v0_4,HFv1 MMLU,68.08,,hf_open_llm_v1_240829_frozen.csv 
llama3_8b_instruct_dpo_v0_4,HFv1 TruthfulQA,53.94,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_dpo_v0_4,HFv1 Winogrande,75.61,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_orpo_qlora,HF OpenLLM v1,64.46,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_orpo_qlora,HFv1 ARC,58.19,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_orpo_qlora,HFv1 GSM8K,58.61,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_orpo_qlora,HFv1 HellaSwag,79.42,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_orpo_qlora,HFv1 MMLU,65.59,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_orpo_qlora,HFv1 TruthfulQA,48.38,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_orpo_qlora,HFv1 Winogrande,76.56,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_10fail_1000total,HF OpenLLM v1,65.68,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_10fail_1000total,HFv1 ARC,60.15,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_10fail_1000total,HFv1 GSM8K,66.79,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_10fail_1000total,HFv1 HellaSwag,77.83,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_10fail_1000total,HFv1 MMLU,64.68,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_10fail_1000total,HFv1 TruthfulQA,50.02,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_10fail_1000total,HFv1 Winogrande,74.59,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_2fail_128total,HF OpenLLM v1,66.12,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_2fail_128total,HFv1 ARC,60.49,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_2fail_128total,HFv1 GSM8K,67.25,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_2fail_128total,HFv1 HellaSwag,77.92,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_2fail_128total,HFv1 MMLU,66.36,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_2fail_128total,HFv1 TruthfulQA,50.05,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_2fail_128total,HFv1 Winogrande,74.66,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_5fail_3000total_bf16,HF OpenLLM v1,65.62,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_5fail_3000total_bf16,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_5fail_3000total_bf16,HFv1 GSM8K,65.5,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_5fail_3000total_bf16,HFv1 HellaSwag,77.32,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_5fail_3000total_bf16,HFv1 MMLU,65.62,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_5fail_3000total_bf16,HFv1 TruthfulQA,50.16,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_5fail_3000total_bf16,HFv1 Winogrande,74.27,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_5fail_500total,HF OpenLLM v1,65.19,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_5fail_500total,HFv1 ARC,61.09,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_5fail_500total,HFv1 GSM8K,61.26,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_5fail_500total,HFv1 HellaSwag,77.51,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_5fail_500total,HFv1 MMLU,65.82,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_5fail_500total,HFv1 TruthfulQA,50.07,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_ortho_baukit_5fail_500total,HFv1 
Winogrande,75.37,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_1,HF OpenLLM v1,68.32,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_1,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_1,HFv1 GSM8K,70.74,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_1,HFv1 HellaSwag,79.37,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_1,HFv1 MMLU,68.25,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_1,HFv1 TruthfulQA,53.4,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_1,HFv1 Winogrande,75.69,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_2,HF OpenLLM v1,68.18,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_2,HFv1 ARC,62.2,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_2,HFv1 GSM8K,70.96,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_2,HFv1 HellaSwag,79.27,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_2,HFv1 MMLU,67.96,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_2,HFv1 TruthfulQA,53.02,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_2,HFv1 Winogrande,75.69,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_3,HF OpenLLM v1,68.22,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_3,HFv1 ARC,62.37,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_3,HFv1 GSM8K,69.98,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_3,HFv1 HellaSwag,79.55,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_3,HFv1 MMLU,68.13,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_3,HFv1 TruthfulQA,53.77,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_3,HFv1 Winogrande,75.53,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_4,HF OpenLLM v1,70.3,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_4,HFv1 ARC,67.24,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_4,HFv1 GSM8K,69.45,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_4,HFv1 HellaSwag,83.23,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_4,HFv1 MMLU,67.77,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_4,HFv1 TruthfulQA,56.75,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_4,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_5,HF OpenLLM v1,68.37,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_5,HFv1 ARC,62.37,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_5,HFv1 GSM8K,71.34,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_5,HFv1 HellaSwag,79.41,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_5,HFv1 MMLU,68.16,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_5,HFv1 TruthfulQA,53.26,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_5,HFv1 Winogrande,75.69,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_7,HF OpenLLM v1,69.35,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_7,HFv1 ARC,65.36,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_7,HFv1 GSM8K,69.83,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_7,HFv1 HellaSwag,81.82,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_7,HFv1 MMLU,67.67,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_7,HFv1 TruthfulQA,55.18,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_7,HFv1 Winogrande,76.24,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_8,HF OpenLLM v1,73.2,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_8,HFv1 ARC,71.93,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_8,HFv1 GSM8K,68.46,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_8,HFv1 
HellaSwag,87.77,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_8,HFv1 MMLU,68.3,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_8,HFv1 TruthfulQA,63.94,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_8,HFv1 Winogrande,79.08,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_9,HF OpenLLM v1,73.29,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_9,HFv1 ARC,72.35,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_9,HFv1 GSM8K,66.49,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_9,HFv1 HellaSwag,88.17,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_9,HFv1 MMLU,68.1,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_9,HFv1 TruthfulQA,64.67,,hf_open_llm_v1_240829_frozen.csv llama3_8b_instruct_v0_9,HFv1 Winogrande,79.95,,hf_open_llm_v1_240829_frozen.csv llama3_8b_nola,HF OpenLLM v1,62.48,,hf_open_llm_v1_240829_frozen.csv llama3_8b_nola,HFv1 ARC,60.15,,hf_open_llm_v1_240829_frozen.csv llama3_8b_nola,HFv1 GSM8K,44.66,,hf_open_llm_v1_240829_frozen.csv llama3_8b_nola,HFv1 HellaSwag,82.21,,hf_open_llm_v1_240829_frozen.csv llama3_8b_nola,HFv1 MMLU,66.69,,hf_open_llm_v1_240829_frozen.csv llama3_8b_nola,HFv1 TruthfulQA,42.93,,hf_open_llm_v1_240829_frozen.csv llama3_8b_nola,HFv1 Winogrande,78.22,,hf_open_llm_v1_240829_frozen.csv llama3_8b_okay,HF OpenLLM v1,68.85,,hf_open_llm_v1_240829_frozen.csv llama3_8b_okay,HFv1 ARC,63.14,,hf_open_llm_v1_240829_frozen.csv llama3_8b_okay,HFv1 GSM8K,70.05,,hf_open_llm_v1_240829_frozen.csv llama3_8b_okay,HFv1 HellaSwag,81.19,,hf_open_llm_v1_240829_frozen.csv llama3_8b_okay,HFv1 MMLU,68.8,,hf_open_llm_v1_240829_frozen.csv llama3_8b_okay,HFv1 TruthfulQA,52.88,,hf_open_llm_v1_240829_frozen.csv llama3_8b_okay,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv llama3_8b_orpo,HF OpenLLM v1,62.13,,hf_open_llm_v1_240829_frozen.csv llama3_8b_orpo,HFv1 ARC,56.23,,hf_open_llm_v1_240829_frozen.csv llama3_8b_orpo,HFv1 GSM8K,44.05,,hf_open_llm_v1_240829_frozen.csv llama3_8b_orpo,HFv1 HellaSwag,82.37,,hf_open_llm_v1_240829_frozen.csv llama3_8b_orpo,HFv1 MMLU,65.74,,hf_open_llm_v1_240829_frozen.csv llama3_8b_orpo,HFv1 TruthfulQA,46.81,,hf_open_llm_v1_240829_frozen.csv llama3_8b_orpo,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv llama3_8b_orpo_v0_1,HF OpenLLM v1,64.67,,hf_open_llm_v1_240829_frozen.csv llama3_8b_orpo_v0_1,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv llama3_8b_orpo_v0_1,HFv1 GSM8K,48.75,,hf_open_llm_v1_240829_frozen.csv llama3_8b_orpo_v0_1,HFv1 HellaSwag,82.56,,hf_open_llm_v1_240829_frozen.csv llama3_8b_orpo_v0_1,HFv1 MMLU,66.59,,hf_open_llm_v1_240829_frozen.csv llama3_8b_orpo_v0_1,HFv1 TruthfulQA,50.47,,hf_open_llm_v1_240829_frozen.csv llama3_8b_orpo_v0_1,HFv1 Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv llama3_8b_ortho_v2,HF OpenLLM v1,64.93,,hf_open_llm_v1_240829_frozen.csv llama3_8b_ortho_v2,HFv1 ARC,59.04,,hf_open_llm_v1_240829_frozen.csv llama3_8b_ortho_v2,HFv1 GSM8K,62.55,,hf_open_llm_v1_240829_frozen.csv llama3_8b_ortho_v2,HFv1 HellaSwag,78.35,,hf_open_llm_v1_240829_frozen.csv llama3_8b_ortho_v2,HFv1 MMLU,64.39,,hf_open_llm_v1_240829_frozen.csv llama3_8b_ortho_v2,HFv1 TruthfulQA,49.37,,hf_open_llm_v1_240829_frozen.csv llama3_8b_ortho_v2,HFv1 Winogrande,75.85,,hf_open_llm_v1_240829_frozen.csv llama3_8b_wangchanx_sft_demo,HF OpenLLM v1,63.22,,hf_open_llm_v1_240829_frozen.csv llama3_8b_wangchanx_sft_demo,HFv1 ARC,60.41,,hf_open_llm_v1_240829_frozen.csv llama3_8b_wangchanx_sft_demo,HFv1 GSM8K,52.24,,hf_open_llm_v1_240829_frozen.csv llama3_8b_wangchanx_sft_demo,HFv1 
HellaSwag,83.12,,hf_open_llm_v1_240829_frozen.csv llama3_8b_wangchanx_sft_demo,HFv1 MMLU,65.48,,hf_open_llm_v1_240829_frozen.csv llama3_8b_wangchanx_sft_demo,HFv1 TruthfulQA,41.05,,hf_open_llm_v1_240829_frozen.csv llama3_8b_wangchanx_sft_demo,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv llama3_chinese_8b_instruct,HF OpenLLM v1,63.21,,hf_open_llm_v1_240829_frozen.csv llama3_chinese_8b_instruct,HFv1 ARC,61.26,,hf_open_llm_v1_240829_frozen.csv llama3_chinese_8b_instruct,HFv1 GSM8K,44.43,,hf_open_llm_v1_240829_frozen.csv llama3_chinese_8b_instruct,HFv1 HellaSwag,80.24,,hf_open_llm_v1_240829_frozen.csv llama3_chinese_8b_instruct,HFv1 MMLU,63.1,,hf_open_llm_v1_240829_frozen.csv llama3_chinese_8b_instruct,HFv1 TruthfulQA,55.15,,hf_open_llm_v1_240829_frozen.csv llama3_chinese_8b_instruct,HFv1 Winogrande,75.06,,hf_open_llm_v1_240829_frozen.csv llama3_chinese_8b_instruct_v2,HF OpenLLM v1,66.68,,hf_open_llm_v1_240829_frozen.csv llama3_chinese_8b_instruct_v2,HFv1 ARC,62.63,,hf_open_llm_v1_240829_frozen.csv llama3_chinese_8b_instruct_v2,HFv1 GSM8K,60.58,,hf_open_llm_v1_240829_frozen.csv llama3_chinese_8b_instruct_v2,HFv1 HellaSwag,79.72,,hf_open_llm_v1_240829_frozen.csv llama3_chinese_8b_instruct_v2,HFv1 MMLU,66.48,,hf_open_llm_v1_240829_frozen.csv llama3_chinese_8b_instruct_v2,HFv1 TruthfulQA,53.93,,hf_open_llm_v1_240829_frozen.csv llama3_chinese_8b_instruct_v2,HFv1 Winogrande,76.72,,hf_open_llm_v1_240829_frozen.csv llama3_chinese_8b_instruct_v3,HF OpenLLM v1,66.81,,hf_open_llm_v1_240829_frozen.csv llama3_chinese_8b_instruct_v3,HFv1 ARC,63.4,,hf_open_llm_v1_240829_frozen.csv llama3_chinese_8b_instruct_v3,HFv1 GSM8K,59.21,,hf_open_llm_v1_240829_frozen.csv llama3_chinese_8b_instruct_v3,HFv1 HellaSwag,80.51,,hf_open_llm_v1_240829_frozen.csv llama3_chinese_8b_instruct_v3,HFv1 MMLU,67.9,,hf_open_llm_v1_240829_frozen.csv llama3_chinese_8b_instruct_v3,HFv1 TruthfulQA,53.57,,hf_open_llm_v1_240829_frozen.csv llama3_chinese_8b_instruct_v3,HFv1 Winogrande,76.24,,hf_open_llm_v1_240829_frozen.csv llama3_neural_chat_v1_8b,HF OpenLLM v1,66.5,,hf_open_llm_v1_240829_frozen.csv llama3_neural_chat_v1_8b,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv llama3_neural_chat_v1_8b,HFv1 GSM8K,54.81,,hf_open_llm_v1_240829_frozen.csv llama3_neural_chat_v1_8b,HFv1 HellaSwag,84.13,,hf_open_llm_v1_240829_frozen.csv llama3_neural_chat_v1_8b,HFv1 MMLU,64.69,,hf_open_llm_v1_240829_frozen.csv llama3_neural_chat_v1_8b,HFv1 TruthfulQA,56.34,,hf_open_llm_v1_240829_frozen.csv llama3_neural_chat_v1_8b,HFv1 Winogrande,78.22,,hf_open_llm_v1_240829_frozen.csv llama3_neural_chat_v2_2_8b,HF OpenLLM v1,65.44,,hf_open_llm_v1_240829_frozen.csv llama3_neural_chat_v2_2_8b,HFv1 ARC,61.6,,hf_open_llm_v1_240829_frozen.csv llama3_neural_chat_v2_2_8b,HFv1 GSM8K,53.07,,hf_open_llm_v1_240829_frozen.csv llama3_neural_chat_v2_2_8b,HFv1 HellaSwag,83.04,,hf_open_llm_v1_240829_frozen.csv llama3_neural_chat_v2_2_8b,HFv1 MMLU,64.97,,hf_open_llm_v1_240829_frozen.csv llama3_neural_chat_v2_2_8b,HFv1 TruthfulQA,51.88,,hf_open_llm_v1_240829_frozen.csv llama3_neural_chat_v2_2_8b,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv llama3_neurona_8b,HF OpenLLM v1,65.83,,hf_open_llm_v1_240829_frozen.csv llama3_neurona_8b,HFv1 ARC,58.02,,hf_open_llm_v1_240829_frozen.csv llama3_neurona_8b,HFv1 GSM8K,66.79,,hf_open_llm_v1_240829_frozen.csv llama3_neurona_8b,HFv1 HellaSwag,79.7,,hf_open_llm_v1_240829_frozen.csv llama3_neurona_8b,HFv1 MMLU,62.0,,hf_open_llm_v1_240829_frozen.csv llama3_neurona_8b,HFv1 TruthfulQA,53.36,,hf_open_llm_v1_240829_frozen.csv 
llama3_neurona_8b,HFv1 Winogrande,75.14,,hf_open_llm_v1_240829_frozen.csv llama3_orca_2_0_8b,HF OpenLLM v1,64.89,,hf_open_llm_v1_240829_frozen.csv llama3_orca_2_0_8b,HFv1 ARC,59.64,,hf_open_llm_v1_240829_frozen.csv llama3_orca_2_0_8b,HFv1 GSM8K,53.15,,hf_open_llm_v1_240829_frozen.csv llama3_orca_2_0_8b,HFv1 HellaSwag,82.18,,hf_open_llm_v1_240829_frozen.csv llama3_orca_2_0_8b,HFv1 MMLU,65.03,,hf_open_llm_v1_240829_frozen.csv llama3_orca_2_0_8b,HFv1 TruthfulQA,51.1,,hf_open_llm_v1_240829_frozen.csv llama3_orca_2_0_8b,HFv1 Winogrande,78.22,,hf_open_llm_v1_240829_frozen.csv llama3_orpo_v1_merged_16bit,HF OpenLLM v1,30.44,,hf_open_llm_v1_240829_frozen.csv llama3_orpo_v1_merged_16bit,HFv1 ARC,23.29,,hf_open_llm_v1_240829_frozen.csv llama3_orpo_v1_merged_16bit,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv llama3_orpo_v1_merged_16bit,HFv1 HellaSwag,41.14,,hf_open_llm_v1_240829_frozen.csv llama3_orpo_v1_merged_16bit,HFv1 MMLU,24.59,,hf_open_llm_v1_240829_frozen.csv llama3_orpo_v1_merged_16bit,HFv1 TruthfulQA,40.92,,hf_open_llm_v1_240829_frozen.csv llama3_orpo_v1_merged_16bit,HFv1 Winogrande,52.72,,hf_open_llm_v1_240829_frozen.csv llama3_ruozhiba_8b,HF OpenLLM v1,66.65,,hf_open_llm_v1_240829_frozen.csv llama3_ruozhiba_8b,HFv1 ARC,60.58,,hf_open_llm_v1_240829_frozen.csv llama3_ruozhiba_8b,HFv1 GSM8K,68.23,,hf_open_llm_v1_240829_frozen.csv llama3_ruozhiba_8b,HFv1 HellaSwag,78.85,,hf_open_llm_v1_240829_frozen.csv llama3_ruozhiba_8b,HFv1 MMLU,66.8,,hf_open_llm_v1_240829_frozen.csv llama3_ruozhiba_8b,HFv1 TruthfulQA,49.97,,hf_open_llm_v1_240829_frozen.csv llama3_ruozhiba_8b,HFv1 Winogrande,75.45,,hf_open_llm_v1_240829_frozen.csv llama3_soliloquy_8b,HF OpenLLM v1,59.72,,hf_open_llm_v1_240829_frozen.csv llama3_soliloquy_8b,HFv1 ARC,58.11,,hf_open_llm_v1_240829_frozen.csv llama3_soliloquy_8b,HFv1 GSM8K,46.17,,hf_open_llm_v1_240829_frozen.csv llama3_soliloquy_8b,HFv1 HellaSwag,78.06,,hf_open_llm_v1_240829_frozen.csv llama3_soliloquy_8b,HFv1 MMLU,57.11,,hf_open_llm_v1_240829_frozen.csv llama3_soliloquy_8b,HFv1 TruthfulQA,47.68,,hf_open_llm_v1_240829_frozen.csv llama3_soliloquy_8b,HFv1 Winogrande,71.19,,hf_open_llm_v1_240829_frozen.csv llama3_tenyxchat_70b,HF OpenLLM v1,78.4,,hf_open_llm_v1_240829_frozen.csv llama3_tenyxchat_70b,HFv1 ARC,72.1,,hf_open_llm_v1_240829_frozen.csv llama3_tenyxchat_70b,HFv1 GSM8K,86.28,,hf_open_llm_v1_240829_frozen.csv llama3_tenyxchat_70b,HFv1 HellaSwag,86.21,,hf_open_llm_v1_240829_frozen.csv llama3_tenyxchat_70b,HFv1 MMLU,80.04,,hf_open_llm_v1_240829_frozen.csv llama3_tenyxchat_70b,HFv1 TruthfulQA,62.85,,hf_open_llm_v1_240829_frozen.csv llama3_tenyxchat_70b,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv llama3_youko_8b,HF OpenLLM v1,57.55,,hf_open_llm_v1_240829_frozen.csv llama3_youko_8b,HFv1 ARC,54.44,,hf_open_llm_v1_240829_frozen.csv llama3_youko_8b,HFv1 GSM8K,34.42,,hf_open_llm_v1_240829_frozen.csv llama3_youko_8b,HFv1 HellaSwag,79.91,,hf_open_llm_v1_240829_frozen.csv llama3_youko_8b,HFv1 MMLU,60.9,,hf_open_llm_v1_240829_frozen.csv llama3_youko_8b,HFv1 TruthfulQA,41.05,,hf_open_llm_v1_240829_frozen.csv llama3_youko_8b,HFv1 Winogrande,74.59,,hf_open_llm_v1_240829_frozen.csv llama_13b,HF OpenLLM v1,51.36,,hf_open_llm_v1_240829_frozen.csv llama_13b,HFv1 ARC,56.23,,hf_open_llm_v1_240829_frozen.csv llama_13b,HFv1 GSM8K,7.58,,hf_open_llm_v1_240829_frozen.csv llama_13b,HFv1 HellaSwag,80.93,,hf_open_llm_v1_240829_frozen.csv llama_13b,HFv1 MMLU,47.67,,hf_open_llm_v1_240829_frozen.csv llama_13b,HFv1 TruthfulQA,39.48,,hf_open_llm_v1_240829_frozen.csv llama_13b,HFv1 
Winogrande,76.24,,hf_open_llm_v1_240829_frozen.csv llama_160m,HF OpenLLM v1,29.55,,hf_open_llm_v1_240829_frozen.csv llama_160m,HFv1 ARC,24.83,,hf_open_llm_v1_240829_frozen.csv llama_160m,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv llama_160m,HFv1 HellaSwag,35.23,,hf_open_llm_v1_240829_frozen.csv llama_160m,HFv1 MMLU,24.26,,hf_open_llm_v1_240829_frozen.csv llama_160m,HFv1 TruthfulQA,42.08,,hf_open_llm_v1_240829_frozen.csv llama_160m,HFv1 Winogrande,50.83,,hf_open_llm_v1_240829_frozen.csv llama_160m_chat_v1,HF OpenLLM v1,30.28,,hf_open_llm_v1_240829_frozen.csv llama_160m_chat_v1,HFv1 ARC,24.74,,hf_open_llm_v1_240829_frozen.csv llama_160m_chat_v1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv llama_160m_chat_v1,HFv1 HellaSwag,35.32,,hf_open_llm_v1_240829_frozen.csv llama_160m_chat_v1,HFv1 MMLU,26.14,,hf_open_llm_v1_240829_frozen.csv llama_160m_chat_v1,HFv1 TruthfulQA,44.16,,hf_open_llm_v1_240829_frozen.csv llama_160m_chat_v1,HFv1 Winogrande,51.3,,hf_open_llm_v1_240829_frozen.csv llama_2_13b,HF OpenLLM v1,55.69,,hf_open_llm_v1_240829_frozen.csv llama_2_13b,HFv1 ARC,59.39,,hf_open_llm_v1_240829_frozen.csv llama_2_13b,HFv1 GSM8K,22.82,,hf_open_llm_v1_240829_frozen.csv llama_2_13b,HFv1 HellaSwag,82.13,,hf_open_llm_v1_240829_frozen.csv llama_2_13b,HFv1 MMLU,55.77,,hf_open_llm_v1_240829_frozen.csv llama_2_13b,HFv1 TruthfulQA,37.38,,hf_open_llm_v1_240829_frozen.csv llama_2_13b,HFv1 Winogrande,76.64,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_beluga_qlora,HF OpenLLM v1,54.09,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_beluga_qlora,HFv1 ARC,59.22,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_beluga_qlora,HFv1 GSM8K,1.29,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_beluga_qlora,HFv1 HellaSwag,81.92,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_beluga_qlora,HFv1 MMLU,56.67,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_beluga_qlora,HFv1 TruthfulQA,48.23,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_beluga_qlora,HFv1 Winogrande,77.19,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_chat,HF OpenLLM v1,54.91,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_chat,HFv1 ARC,59.04,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_chat,HFv1 GSM8K,15.24,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_chat,HFv1 HellaSwag,81.94,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_chat,HFv1 MMLU,54.64,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_chat,HFv1 TruthfulQA,44.12,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_chat,HFv1 Winogrande,74.51,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_chat_dutch,HF OpenLLM v1,53.69,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_chat_dutch,HFv1 ARC,59.3,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_chat_dutch,HFv1 GSM8K,10.69,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_chat_dutch,HFv1 HellaSwag,81.45,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_chat_dutch,HFv1 MMLU,55.82,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_chat_dutch,HFv1 TruthfulQA,38.23,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_chat_dutch,HFv1 Winogrande,76.64,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_chat_platypus,HF OpenLLM v1,53.92,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_chat_platypus,HFv1 ARC,53.84,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_chat_platypus,HFv1 GSM8K,12.36,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_chat_platypus,HFv1 HellaSwag,80.67,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_chat_platypus,HFv1 MMLU,54.44,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_chat_platypus,HFv1 TruthfulQA,46.23,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_chat_platypus,HFv1 
Winogrande,76.01,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_eli5_1024_r_64_alpha_16,HF OpenLLM v1,54.61,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_eli5_1024_r_64_alpha_16,HFv1 ARC,60.41,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_eli5_1024_r_64_alpha_16,HFv1 GSM8K,8.49,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_eli5_1024_r_64_alpha_16,HFv1 HellaSwag,82.58,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_eli5_1024_r_64_alpha_16,HFv1 MMLU,55.86,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_eli5_1024_r_64_alpha_16,HFv1 TruthfulQA,43.61,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_eli5_1024_r_64_alpha_16,HFv1 Winogrande,76.72,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_eli5_1024_r_64_alpha_16_merged,HF OpenLLM v1,54.16,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_eli5_1024_r_64_alpha_16_merged,HFv1 ARC,59.13,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_eli5_1024_r_64_alpha_16_merged,HFv1 GSM8K,8.11,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_eli5_1024_r_64_alpha_16_merged,HFv1 HellaSwag,82.13,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_eli5_1024_r_64_alpha_16_merged,HFv1 MMLU,54.98,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_eli5_1024_r_64_alpha_16_merged,HFv1 TruthfulQA,44.23,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_eli5_1024_r_64_alpha_16_merged,HFv1 Winogrande,76.4,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_wiki_1024_full_r_64_alpha_16,HF OpenLLM v1,53.14,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_wiki_1024_full_r_64_alpha_16,HFv1 ARC,59.04,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_wiki_1024_full_r_64_alpha_16,HFv1 GSM8K,10.01,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_wiki_1024_full_r_64_alpha_16,HFv1 HellaSwag,82.33,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_wiki_1024_full_r_64_alpha_16,HFv1 MMLU,55.36,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_wiki_1024_full_r_64_alpha_16,HFv1 TruthfulQA,35.75,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_wiki_1024_full_r_64_alpha_16,HFv1 Winogrande,76.32,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_wiki_1024_full_r_64_alpha_16_merged,HF OpenLLM v1,52.94,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_wiki_1024_full_r_64_alpha_16_merged,HFv1 ARC,58.45,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_wiki_1024_full_r_64_alpha_16_merged,HFv1 GSM8K,10.69,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_wiki_1024_full_r_64_alpha_16_merged,HFv1 HellaSwag,81.97,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_wiki_1024_full_r_64_alpha_16_merged,HFv1 MMLU,55.02,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_wiki_1024_full_r_64_alpha_16_merged,HFv1 TruthfulQA,35.85,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ds_wiki_1024_full_r_64_alpha_16_merged,HFv1 Winogrande,75.69,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_eli5_wiki_1024_r_64_alpha_16,HF OpenLLM v1,54.14,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_eli5_wiki_1024_r_64_alpha_16,HFv1 ARC,59.98,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_eli5_wiki_1024_r_64_alpha_16,HFv1 GSM8K,10.54,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_eli5_wiki_1024_r_64_alpha_16,HFv1 HellaSwag,82.43,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_eli5_wiki_1024_r_64_alpha_16,HFv1 MMLU,55.41,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_eli5_wiki_1024_r_64_alpha_16,HFv1 TruthfulQA,39.9,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_eli5_wiki_1024_r_64_alpha_16,HFv1 Winogrande,76.56,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_eli5_wiki_1024_r_64_alpha_16_merged,HF OpenLLM v1,53.57,,hf_open_llm_v1_240829_frozen.csv 
llama_2_13b_eli5_wiki_1024_r_64_alpha_16_merged,HFv1 ARC,58.96,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_eli5_wiki_1024_r_64_alpha_16_merged,HFv1 GSM8K,8.72,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_eli5_wiki_1024_r_64_alpha_16_merged,HFv1 HellaSwag,81.94,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_eli5_wiki_1024_r_64_alpha_16_merged,HFv1 MMLU,55.0,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_eli5_wiki_1024_r_64_alpha_16_merged,HFv1 TruthfulQA,40.26,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_eli5_wiki_1024_r_64_alpha_16_merged,HFv1 Winogrande,76.56,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_fp16,HF OpenLLM v1,53.67,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_fp16,HFv1 ARC,59.3,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_fp16,HFv1 GSM8K,10.84,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_fp16,HFv1 HellaSwag,82.15,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_fp16,HFv1 MMLU,55.67,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_fp16,HFv1 TruthfulQA,37.39,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_fp16,HFv1 Winogrande,76.64,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ft_instruct_es,HF OpenLLM v1,52.89,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ft_instruct_es,HFv1 ARC,59.39,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ft_instruct_es,HFv1 GSM8K,8.57,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ft_instruct_es,HFv1 HellaSwag,81.51,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ft_instruct_es,HFv1 MMLU,54.31,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ft_instruct_es,HFv1 TruthfulQA,37.81,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_ft_instruct_es,HFv1 Winogrande,75.77,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_german_orpo,HF OpenLLM v1,53.44,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_german_orpo,HFv1 ARC,54.78,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_german_orpo,HFv1 GSM8K,17.29,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_german_orpo,HFv1 HellaSwag,79.05,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_german_orpo,HFv1 MMLU,53.45,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_german_orpo,HFv1 TruthfulQA,42.44,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_german_orpo,HFv1 Winogrande,73.64,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_guanaco_qlora,HF OpenLLM v1,55.31,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_guanaco_qlora,HFv1 ARC,61.09,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_guanaco_qlora,HFv1 GSM8K,10.99,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_guanaco_qlora,HFv1 HellaSwag,82.99,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_guanaco_qlora,HFv1 MMLU,55.47,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_guanaco_qlora,HFv1 TruthfulQA,44.12,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_guanaco_qlora,HFv1 Winogrande,77.19,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_instruct_v0_2,HF OpenLLM v1,55.14,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_instruct_v0_2,HFv1 ARC,60.58,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_instruct_v0_2,HFv1 GSM8K,9.33,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_instruct_v0_2,HFv1 HellaSwag,81.96,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_instruct_v0_2,HFv1 MMLU,55.46,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_instruct_v0_2,HFv1 TruthfulQA,45.71,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_instruct_v0_2,HFv1 Winogrande,77.82,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_instructed,HF OpenLLM v1,54.63,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_instructed,HFv1 ARC,59.39,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_instructed,HFv1 GSM8K,8.04,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_instructed,HFv1 
HellaSwag,83.88,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_instructed,HFv1 MMLU,55.57,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_instructed,HFv1 TruthfulQA,46.89,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_instructed,HFv1 Winogrande,74.03,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_platypus,HF OpenLLM v1,54.22,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_platypus,HFv1 ARC,58.87,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_platypus,HFv1 GSM8K,9.4,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_platypus,HFv1 HellaSwag,82.14,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_platypus,HFv1 MMLU,54.98,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_platypus,HFv1 TruthfulQA,42.84,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_platypus,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_platypus_vicuna_wizard,HF OpenLLM v1,52.9,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_platypus_vicuna_wizard,HFv1 ARC,61.26,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_platypus_vicuna_wizard,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_platypus_vicuna_wizard,HFv1 HellaSwag,82.31,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_platypus_vicuna_wizard,HFv1 MMLU,55.21,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_platypus_vicuna_wizard,HFv1 TruthfulQA,41.91,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_platypus_vicuna_wizard,HFv1 Winogrande,75.77,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_qlora,HF OpenLLM v1,53.87,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_qlora,HFv1 ARC,58.02,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_qlora,HFv1 GSM8K,3.26,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_qlora,HFv1 HellaSwag,82.33,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_qlora,HFv1 MMLU,55.8,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_qlora,HFv1 TruthfulQA,46.23,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_qlora,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_vicuna_wizard,HF OpenLLM v1,51.94,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_vicuna_wizard,HFv1 ARC,57.76,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_vicuna_wizard,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_vicuna_wizard,HFv1 HellaSwag,82.16,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_vicuna_wizard,HFv1 MMLU,54.68,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_vicuna_wizard,HFv1 TruthfulQA,41.11,,hf_open_llm_v1_240829_frozen.csv llama_2_13b_vicuna_wizard,HFv1 Winogrande,74.98,,hf_open_llm_v1_240829_frozen.csv llama_2_16b_nastychat,HF OpenLLM v1,55.04,,hf_open_llm_v1_240829_frozen.csv llama_2_16b_nastychat,HFv1 ARC,57.42,,hf_open_llm_v1_240829_frozen.csv llama_2_16b_nastychat,HFv1 GSM8K,8.11,,hf_open_llm_v1_240829_frozen.csv llama_2_16b_nastychat,HFv1 HellaSwag,80.59,,hf_open_llm_v1_240829_frozen.csv llama_2_16b_nastychat,HFv1 MMLU,55.99,,hf_open_llm_v1_240829_frozen.csv llama_2_16b_nastychat,HFv1 TruthfulQA,53.45,,hf_open_llm_v1_240829_frozen.csv llama_2_16b_nastychat,HFv1 Winogrande,74.66,,hf_open_llm_v1_240829_frozen.csv llama_2_26b_trenchcoat_stack,HF OpenLLM v1,51.13,,hf_open_llm_v1_240829_frozen.csv llama_2_26b_trenchcoat_stack,HFv1 ARC,55.03,,hf_open_llm_v1_240829_frozen.csv llama_2_26b_trenchcoat_stack,HFv1 GSM8K,2.88,,hf_open_llm_v1_240829_frozen.csv llama_2_26b_trenchcoat_stack,HFv1 HellaSwag,79.9,,hf_open_llm_v1_240829_frozen.csv llama_2_26b_trenchcoat_stack,HFv1 MMLU,53.73,,hf_open_llm_v1_240829_frozen.csv llama_2_26b_trenchcoat_stack,HFv1 TruthfulQA,40.48,,hf_open_llm_v1_240829_frozen.csv llama_2_26b_trenchcoat_stack,HFv1 Winogrande,74.74,,hf_open_llm_v1_240829_frozen.csv 
llama_2_3b,HF OpenLLM v1,29.53,,hf_open_llm_v1_240829_frozen.csv llama_2_3b,HFv1 ARC,26.96,,hf_open_llm_v1_240829_frozen.csv llama_2_3b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv llama_2_3b,HFv1 HellaSwag,26.52,,hf_open_llm_v1_240829_frozen.csv llama_2_3b,HFv1 MMLU,23.33,,hf_open_llm_v1_240829_frozen.csv llama_2_3b,HFv1 TruthfulQA,50.71,,hf_open_llm_v1_240829_frozen.csv llama_2_3b,HFv1 Winogrande,49.64,,hf_open_llm_v1_240829_frozen.csv llama_2_70b,HF OpenLLM v1,67.87,,hf_open_llm_v1_240829_frozen.csv llama_2_70b,HFv1 ARC,67.32,,hf_open_llm_v1_240829_frozen.csv llama_2_70b,HFv1 GSM8K,54.06,,hf_open_llm_v1_240829_frozen.csv llama_2_70b,HFv1 HellaSwag,87.33,,hf_open_llm_v1_240829_frozen.csv llama_2_70b,HFv1 MMLU,69.83,,hf_open_llm_v1_240829_frozen.csv llama_2_70b,HFv1 TruthfulQA,44.92,,hf_open_llm_v1_240829_frozen.csv llama_2_70b,HFv1 Winogrande,83.74,,hf_open_llm_v1_240829_frozen.csv llama_2_70b_chat,HF OpenLLM v1,62.4,,hf_open_llm_v1_240829_frozen.csv llama_2_70b_chat,HFv1 ARC,64.59,,hf_open_llm_v1_240829_frozen.csv llama_2_70b_chat,HFv1 GSM8K,26.69,,hf_open_llm_v1_240829_frozen.csv llama_2_70b_chat,HFv1 HellaSwag,85.88,,hf_open_llm_v1_240829_frozen.csv llama_2_70b_chat,HFv1 MMLU,63.91,,hf_open_llm_v1_240829_frozen.csv llama_2_70b_chat,HFv1 TruthfulQA,52.8,,hf_open_llm_v1_240829_frozen.csv llama_2_70b_chat,HFv1 Winogrande,80.51,,hf_open_llm_v1_240829_frozen.csv llama_2_70b_ia3_guanaco,HF OpenLLM v1,62.61,,hf_open_llm_v1_240829_frozen.csv llama_2_70b_ia3_guanaco,HFv1 ARC,68.52,,hf_open_llm_v1_240829_frozen.csv llama_2_70b_ia3_guanaco,HFv1 GSM8K,28.73,,hf_open_llm_v1_240829_frozen.csv llama_2_70b_ia3_guanaco,HFv1 HellaSwag,85.67,,hf_open_llm_v1_240829_frozen.csv llama_2_70b_ia3_guanaco,HFv1 MMLU,67.03,,hf_open_llm_v1_240829_frozen.csv llama_2_70b_ia3_guanaco,HFv1 TruthfulQA,43.47,,hf_open_llm_v1_240829_frozen.csv llama_2_70b_ia3_guanaco,HFv1 Winogrande,82.24,,hf_open_llm_v1_240829_frozen.csv llama_2_7b,HF OpenLLM v1,50.97,,hf_open_llm_v1_240829_frozen.csv llama_2_7b,HFv1 ARC,53.07,,hf_open_llm_v1_240829_frozen.csv llama_2_7b,HFv1 GSM8K,14.48,,hf_open_llm_v1_240829_frozen.csv llama_2_7b,HFv1 HellaSwag,78.59,,hf_open_llm_v1_240829_frozen.csv llama_2_7b,HFv1 MMLU,46.87,,hf_open_llm_v1_240829_frozen.csv llama_2_7b,HFv1 TruthfulQA,38.76,,hf_open_llm_v1_240829_frozen.csv llama_2_7b,HFv1 Winogrande,74.03,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_32k_instruct,HF OpenLLM v1,49.65,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_32k_instruct,HFv1 ARC,51.37,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_32k_instruct,HFv1 GSM8K,4.7,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_32k_instruct,HFv1 HellaSwag,78.47,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_32k_instruct,HFv1 MMLU,45.53,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_32k_instruct,HFv1 TruthfulQA,45.01,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_32k_instruct,HFv1 Winogrande,72.85,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_alpaca_gpt4,HF OpenLLM v1,51.75,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_alpaca_gpt4,HFv1 ARC,53.16,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_alpaca_gpt4,HFv1 GSM8K,9.7,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_alpaca_gpt4,HFv1 HellaSwag,79.24,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_alpaca_gpt4,HFv1 MMLU,44.15,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_alpaca_gpt4,HFv1 TruthfulQA,52.43,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_alpaca_gpt4,HFv1 Winogrande,71.82,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat,HF OpenLLM v1,50.74,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat,HFv1 
ARC,52.9,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat,HFv1 GSM8K,7.35,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat,HFv1 HellaSwag,78.55,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat,HFv1 MMLU,48.32,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat,HFv1 TruthfulQA,45.57,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat,HFv1 Winogrande,71.74,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_10_attention_sparsity,HF OpenLLM v1,52.52,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_10_attention_sparsity,HFv1 ARC,52.9,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_10_attention_sparsity,HFv1 GSM8K,19.11,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_10_attention_sparsity,HFv1 HellaSwag,78.18,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_10_attention_sparsity,HFv1 MMLU,48.1,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_10_attention_sparsity,HFv1 TruthfulQA,45.4,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_10_attention_sparsity,HFv1 Winogrande,71.43,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_10_sparsity,HF OpenLLM v1,52.48,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_10_sparsity,HFv1 ARC,53.16,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_10_sparsity,HFv1 GSM8K,18.42,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_10_sparsity,HFv1 HellaSwag,78.26,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_10_sparsity,HFv1 MMLU,48.18,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_10_sparsity,HFv1 TruthfulQA,45.29,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_10_sparsity,HFv1 Winogrande,71.59,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_20_attention_sparsity,HF OpenLLM v1,52.19,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_20_attention_sparsity,HFv1 ARC,53.41,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_20_attention_sparsity,HFv1 GSM8K,17.74,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_20_attention_sparsity,HFv1 HellaSwag,77.91,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_20_attention_sparsity,HFv1 MMLU,47.49,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_20_attention_sparsity,HFv1 TruthfulQA,45.84,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_20_attention_sparsity,HFv1 Winogrande,70.72,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_20_sparsity,HF OpenLLM v1,52.01,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_20_sparsity,HFv1 ARC,52.47,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_20_sparsity,HFv1 GSM8K,17.82,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_20_sparsity,HFv1 HellaSwag,77.91,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_20_sparsity,HFv1 MMLU,47.27,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_20_sparsity,HFv1 TruthfulQA,45.88,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_20_sparsity,HFv1 Winogrande,70.72,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_30_attention_sparsity,HF OpenLLM v1,51.8,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_30_attention_sparsity,HFv1 ARC,53.41,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_30_attention_sparsity,HFv1 GSM8K,17.44,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_30_attention_sparsity,HFv1 HellaSwag,76.87,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_30_attention_sparsity,HFv1 MMLU,47.04,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_30_attention_sparsity,HFv1 TruthfulQA,45.02,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_30_attention_sparsity,HFv1 Winogrande,71.03,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_30_sparsity,HF OpenLLM v1,51.02,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_30_sparsity,HFv1 
ARC,52.47,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_30_sparsity,HFv1 GSM8K,17.06,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_30_sparsity,HFv1 HellaSwag,76.58,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_30_sparsity,HFv1 MMLU,45.57,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_30_sparsity,HFv1 TruthfulQA,44.82,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_30_sparsity,HFv1 Winogrande,69.61,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_100step_flan,HF OpenLLM v1,52.88,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_100step_flan,HFv1 ARC,52.9,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_100step_flan,HFv1 GSM8K,19.48,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_100step_flan,HFv1 HellaSwag,78.44,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_100step_flan,HFv1 MMLU,48.4,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_100step_flan,HFv1 TruthfulQA,45.67,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_100step_flan,HFv1 Winogrande,72.38,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_100step_flan_v2,HF OpenLLM v1,52.92,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_100step_flan_v2,HFv1 ARC,53.24,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_100step_flan_v2,HFv1 GSM8K,19.48,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_100step_flan_v2,HFv1 HellaSwag,78.43,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_100step_flan_v2,HFv1 MMLU,48.43,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_100step_flan_v2,HFv1 TruthfulQA,45.66,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_100step_flan_v2,HFv1 Winogrande,72.3,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_100step_v2,HF OpenLLM v1,50.89,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_100step_v2,HFv1 ARC,52.65,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_100step_v2,HFv1 GSM8K,8.49,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_100step_v2,HFv1 HellaSwag,78.25,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_100step_v2,HFv1 MMLU,48.47,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_100step_v2,HFv1 TruthfulQA,45.18,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_100step_v2,HFv1 Winogrande,72.3,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_flan,HF OpenLLM v1,52.62,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_flan,HFv1 ARC,52.47,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_flan,HFv1 GSM8K,18.65,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_flan,HFv1 HellaSwag,78.02,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_flan,HFv1 MMLU,48.42,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_flan,HFv1 TruthfulQA,45.47,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_flan,HFv1 Winogrande,72.69,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_flan_v2,HF OpenLLM v1,52.75,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_flan_v2,HFv1 ARC,52.65,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_flan_v2,HFv1 GSM8K,18.95,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_flan_v2,HFv1 HellaSwag,78.04,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_flan_v2,HFv1 MMLU,48.51,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_flan_v2,HFv1 TruthfulQA,45.42,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_flan_v2,HFv1 Winogrande,72.93,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_merged,HF OpenLLM 
v1,52.26,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_merged,HFv1 ARC,52.05,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_merged,HFv1 GSM8K,18.95,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_merged,HFv1 HellaSwag,77.38,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_merged,HFv1 MMLU,48.65,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_merged,HFv1 TruthfulQA,44.6,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_merged,HFv1 Winogrande,71.9,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_v2,HF OpenLLM v1,50.21,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_v2,HFv1 ARC,51.79,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_v2,HFv1 GSM8K,7.88,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_v2,HFv1 HellaSwag,77.41,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_v2,HFv1 MMLU,48.55,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_v2,HFv1 TruthfulQA,43.69,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_200step_v2,HFv1 Winogrande,71.9,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_300step_flan_v2,HF OpenLLM v1,52.41,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_300step_flan_v2,HFv1 ARC,52.56,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_300step_flan_v2,HFv1 GSM8K,17.97,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_300step_flan_v2,HFv1 HellaSwag,77.76,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_300step_flan_v2,HFv1 MMLU,48.51,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_300step_flan_v2,HFv1 TruthfulQA,45.14,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_300step_flan_v2,HFv1 Winogrande,72.53,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_441step_flan_v2,HF OpenLLM v1,52.28,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_441step_flan_v2,HFv1 ARC,52.13,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_441step_flan_v2,HFv1 GSM8K,17.82,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_441step_flan_v2,HFv1 HellaSwag,77.63,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_441step_flan_v2,HFv1 MMLU,48.52,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_441step_flan_v2,HFv1 TruthfulQA,45.02,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_afr_441step_flan_v2,HFv1 Winogrande,72.53,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_flan2022_1_2m,HF OpenLLM v1,47.89,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_flan2022_1_2m,HFv1 ARC,49.57,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_flan2022_1_2m,HFv1 GSM8K,1.52,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_flan2022_1_2m,HFv1 HellaSwag,76.25,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_flan2022_1_2m,HFv1 MMLU,45.99,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_flan2022_1_2m,HFv1 TruthfulQA,42.17,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_flan2022_1_2m,HFv1 Winogrande,71.82,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco,HF OpenLLM v1,48.01,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco,HFv1 ARC,46.93,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco,HFv1 GSM8K,9.93,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco,HFv1 HellaSwag,74.11,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco,HFv1 MMLU,47.25,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco,HFv1 TruthfulQA,45.99,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco,HFv1 Winogrande,63.85,,hf_open_llm_v1_240829_frozen.csv 
llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj,HF OpenLLM v1,49.94,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj,HFv1 ARC,51.45,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj,HFv1 GSM8K,11.68,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj,HFv1 HellaSwag,76.99,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj,HFv1 MMLU,47.13,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj,HFv1 TruthfulQA,43.15,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj,HFv1 Winogrande,69.22,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj_lora,HF OpenLLM v1,49.51,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj_lora,HFv1 ARC,51.54,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj_lora,HFv1 GSM8K,11.3,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj_lora,HFv1 HellaSwag,76.52,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj_lora,HFv1 MMLU,46.92,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj_lora,HFv1 TruthfulQA,42.51,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj_lora,HFv1 Winogrande,68.27,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco_lora,HF OpenLLM v1,52.38,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco_lora,HFv1 ARC,52.65,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco_lora,HFv1 GSM8K,18.8,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco_lora,HFv1 HellaSwag,76.68,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco_lora,HFv1 MMLU,48.91,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco_lora,HFv1 TruthfulQA,43.82,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_chat_guanaco_lora,HFv1 Winogrande,73.4,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_eli5_cleaned_1024_qlora_merged,HF OpenLLM v1,50.4,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_eli5_cleaned_1024_qlora_merged,HFv1 ARC,53.67,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_eli5_cleaned_1024_qlora_merged,HFv1 GSM8K,4.7,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_eli5_cleaned_1024_qlora_merged,HFv1 HellaSwag,78.21,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_eli5_cleaned_1024_qlora_merged,HFv1 MMLU,45.9,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_eli5_cleaned_1024_qlora_merged,HFv1 TruthfulQA,46.13,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_eli5_cleaned_1024_qlora_merged,HFv1 Winogrande,73.8,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_eli5_cleaned_wiki65k_1024_qlora_merged,HF OpenLLM v1,49.71,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_eli5_cleaned_wiki65k_1024_qlora_merged,HFv1 ARC,53.67,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_eli5_cleaned_wiki65k_1024_qlora_merged,HFv1 GSM8K,5.61,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_eli5_cleaned_wiki65k_1024_qlora_merged,HFv1 HellaSwag,78.09,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_eli5_cleaned_wiki65k_1024_qlora_merged,HFv1 MMLU,45.63,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_eli5_cleaned_wiki65k_1024_qlora_merged,HFv1 TruthfulQA,41.72,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_eli5_cleaned_wiki65k_1024_qlora_merged,HFv1 Winogrande,73.56,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_flan2022_1_2m,HF OpenLLM v1,43.68,,hf_open_llm_v1_240829_frozen.csv 
llama_2_7b_flan2022_1_2m,HFv1 ARC,23.29,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_flan2022_1_2m,HFv1 GSM8K,4.47,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_flan2022_1_2m,HFv1 HellaSwag,78.46,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_flan2022_1_2m,HFv1 MMLU,42.33,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_flan2022_1_2m,HFv1 TruthfulQA,37.97,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_flan2022_1_2m,HFv1 Winogrande,75.53,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_gptq,HF OpenLLM v1,48.48,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_gptq,HFv1 ARC,52.05,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_gptq,HFv1 GSM8K,5.0,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_gptq,HFv1 HellaSwag,77.59,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_gptq,HFv1 MMLU,43.99,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_gptq,HFv1 TruthfulQA,39.32,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_gptq,HFv1 Winogrande,72.93,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_guanaco_instruct_sharded,HF OpenLLM v1,50.58,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_guanaco_instruct_sharded,HFv1 ARC,53.75,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_guanaco_instruct_sharded,HFv1 GSM8K,7.81,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_guanaco_instruct_sharded,HFv1 HellaSwag,78.69,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_guanaco_instruct_sharded,HFv1 MMLU,46.65,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_guanaco_instruct_sharded,HFv1 TruthfulQA,43.93,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_guanaco_instruct_sharded,HFv1 Winogrande,72.61,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_open_platypus,HF OpenLLM v1,49.73,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_open_platypus,HFv1 ARC,51.45,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_open_platypus,HFv1 GSM8K,6.6,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_open_platypus,HFv1 HellaSwag,78.63,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_open_platypus,HFv1 MMLU,43.6,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_open_platypus,HFv1 TruthfulQA,43.71,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_open_platypus,HFv1 Winogrande,74.43,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_physics,HF OpenLLM v1,51.22,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_physics,HFv1 ARC,52.9,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_physics,HFv1 GSM8K,7.05,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_physics,HFv1 HellaSwag,77.71,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_physics,HFv1 MMLU,48.83,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_physics,HFv1 TruthfulQA,48.93,,hf_open_llm_v1_240829_frozen.csv llama_2_7b_physics,HFv1 Winogrande,71.9,,hf_open_llm_v1_240829_frozen.csv llama_2_peanutbutter_v10_7b,HF OpenLLM v1,50.75,,hf_open_llm_v1_240829_frozen.csv llama_2_peanutbutter_v10_7b,HFv1 ARC,55.29,,hf_open_llm_v1_240829_frozen.csv llama_2_peanutbutter_v10_7b,HFv1 GSM8K,5.91,,hf_open_llm_v1_240829_frozen.csv llama_2_peanutbutter_v10_7b,HFv1 HellaSwag,81.69,,hf_open_llm_v1_240829_frozen.csv llama_2_peanutbutter_v10_7b,HFv1 MMLU,46.97,,hf_open_llm_v1_240829_frozen.csv llama_2_peanutbutter_v10_7b,HFv1 TruthfulQA,43.78,,hf_open_llm_v1_240829_frozen.csv llama_2_peanutbutter_v10_7b,HFv1 Winogrande,70.88,,hf_open_llm_v1_240829_frozen.csv llama_2_peanutbutter_v18_a_7b,HF OpenLLM v1,49.88,,hf_open_llm_v1_240829_frozen.csv llama_2_peanutbutter_v18_a_7b,HFv1 ARC,53.16,,hf_open_llm_v1_240829_frozen.csv llama_2_peanutbutter_v18_a_7b,HFv1 GSM8K,7.2,,hf_open_llm_v1_240829_frozen.csv llama_2_peanutbutter_v18_a_7b,HFv1 HellaSwag,78.11,,hf_open_llm_v1_240829_frozen.csv llama_2_peanutbutter_v18_a_7b,HFv1 
MMLU,45.54,,hf_open_llm_v1_240829_frozen.csv llama_2_peanutbutter_v18_a_7b,HFv1 TruthfulQA,40.37,,hf_open_llm_v1_240829_frozen.csv llama_2_peanutbutter_v18_a_7b,HFv1 Winogrande,74.9,,hf_open_llm_v1_240829_frozen.csv llama_2_peanutbutter_v18_b_7b,HF OpenLLM v1,50.94,,hf_open_llm_v1_240829_frozen.csv llama_2_peanutbutter_v18_b_7b,HFv1 ARC,54.61,,hf_open_llm_v1_240829_frozen.csv llama_2_peanutbutter_v18_b_7b,HFv1 GSM8K,6.52,,hf_open_llm_v1_240829_frozen.csv llama_2_peanutbutter_v18_b_7b,HFv1 HellaSwag,81.0,,hf_open_llm_v1_240829_frozen.csv llama_2_peanutbutter_v18_b_7b,HFv1 MMLU,47.07,,hf_open_llm_v1_240829_frozen.csv llama_2_peanutbutter_v18_b_7b,HFv1 TruthfulQA,41.93,,hf_open_llm_v1_240829_frozen.csv llama_2_peanutbutter_v18_b_7b,HFv1 Winogrande,74.51,,hf_open_llm_v1_240829_frozen.csv llama_2_wizard_70b_qlora,HF OpenLLM v1,66.47,,hf_open_llm_v1_240829_frozen.csv llama_2_wizard_70b_qlora,HFv1 ARC,67.58,,hf_open_llm_v1_240829_frozen.csv llama_2_wizard_70b_qlora,HFv1 GSM8K,30.48,,hf_open_llm_v1_240829_frozen.csv llama_2_wizard_70b_qlora,HFv1 HellaSwag,87.52,,hf_open_llm_v1_240829_frozen.csv llama_2_wizard_70b_qlora,HFv1 MMLU,69.11,,hf_open_llm_v1_240829_frozen.csv llama_2_wizard_70b_qlora,HFv1 TruthfulQA,61.79,,hf_open_llm_v1_240829_frozen.csv llama_2_wizard_70b_qlora,HFv1 Winogrande,82.32,,hf_open_llm_v1_240829_frozen.csv llama_65b,HF OpenLLM v1,62.79,,hf_open_llm_v1_240829_frozen.csv llama_65b,HFv1 ARC,63.48,,hf_open_llm_v1_240829_frozen.csv llama_65b,HFv1 GSM8K,37.23,,hf_open_llm_v1_240829_frozen.csv llama_65b,HFv1 HellaSwag,86.09,,hf_open_llm_v1_240829_frozen.csv llama_65b,HFv1 MMLU,63.93,,hf_open_llm_v1_240829_frozen.csv llama_65b,HFv1 TruthfulQA,43.43,,hf_open_llm_v1_240829_frozen.csv llama_65b,HFv1 Winogrande,82.56,,hf_open_llm_v1_240829_frozen.csv llama_68m_chat_v1,HF OpenLLM v1,29.72,,hf_open_llm_v1_240829_frozen.csv llama_68m_chat_v1,HFv1 ARC,23.29,,hf_open_llm_v1_240829_frozen.csv llama_68m_chat_v1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv llama_68m_chat_v1,HFv1 HellaSwag,28.27,,hf_open_llm_v1_240829_frozen.csv llama_68m_chat_v1,HFv1 MMLU,25.18,,hf_open_llm_v1_240829_frozen.csv llama_68m_chat_v1,HFv1 TruthfulQA,47.27,,hf_open_llm_v1_240829_frozen.csv llama_68m_chat_v1,HFv1 Winogrande,54.3,,hf_open_llm_v1_240829_frozen.csv llama_7b,HF OpenLLM v1,45.65,,hf_open_llm_v1_240829_frozen.csv llama_7b,HFv1 ARC,51.02,,hf_open_llm_v1_240829_frozen.csv llama_7b,HFv1 GSM8K,3.56,,hf_open_llm_v1_240829_frozen.csv llama_7b,HFv1 HellaSwag,77.82,,hf_open_llm_v1_240829_frozen.csv llama_7b,HFv1 MMLU,35.71,,hf_open_llm_v1_240829_frozen.csv llama_7b,HFv1 TruthfulQA,34.33,,hf_open_llm_v1_240829_frozen.csv llama_7b,HFv1 Winogrande,71.43,,hf_open_llm_v1_240829_frozen.csv llama_7b_sft_ds_wiki65k_1024_r_64_alpha_16_merged,HF OpenLLM v1,48.82,,hf_open_llm_v1_240829_frozen.csv llama_7b_sft_ds_wiki65k_1024_r_64_alpha_16_merged,HFv1 ARC,54.35,,hf_open_llm_v1_240829_frozen.csv llama_7b_sft_ds_wiki65k_1024_r_64_alpha_16_merged,HFv1 GSM8K,4.62,,hf_open_llm_v1_240829_frozen.csv llama_7b_sft_ds_wiki65k_1024_r_64_alpha_16_merged,HFv1 HellaSwag,78.06,,hf_open_llm_v1_240829_frozen.csv llama_7b_sft_ds_wiki65k_1024_r_64_alpha_16_merged,HFv1 MMLU,45.35,,hf_open_llm_v1_240829_frozen.csv llama_7b_sft_ds_wiki65k_1024_r_64_alpha_16_merged,HFv1 TruthfulQA,37.11,,hf_open_llm_v1_240829_frozen.csv llama_7b_sft_ds_wiki65k_1024_r_64_alpha_16_merged,HFv1 Winogrande,73.4,,hf_open_llm_v1_240829_frozen.csv llama_7b_sft_qlora_eli5_wiki_dpo_ds_rm_top_2_1024_r_64_alpha_16,HF OpenLLM v1,49.98,,hf_open_llm_v1_240829_frozen.csv 
llama_7b_sft_qlora_eli5_wiki_dpo_ds_rm_top_2_1024_r_64_alpha_16,HFv1 ARC,54.1,,hf_open_llm_v1_240829_frozen.csv llama_7b_sft_qlora_eli5_wiki_dpo_ds_rm_top_2_1024_r_64_alpha_16,HFv1 GSM8K,4.55,,hf_open_llm_v1_240829_frozen.csv llama_7b_sft_qlora_eli5_wiki_dpo_ds_rm_top_2_1024_r_64_alpha_16,HFv1 HellaSwag,78.74,,hf_open_llm_v1_240829_frozen.csv llama_7b_sft_qlora_eli5_wiki_dpo_ds_rm_top_2_1024_r_64_alpha_16,HFv1 MMLU,45.44,,hf_open_llm_v1_240829_frozen.csv llama_7b_sft_qlora_eli5_wiki_dpo_ds_rm_top_2_1024_r_64_alpha_16,HFv1 TruthfulQA,43.4,,hf_open_llm_v1_240829_frozen.csv llama_7b_sft_qlora_eli5_wiki_dpo_ds_rm_top_2_1024_r_64_alpha_16,HFv1 Winogrande,73.64,,hf_open_llm_v1_240829_frozen.csv llama_base_7b,HF OpenLLM v1,45.62,,hf_open_llm_v1_240829_frozen.csv llama_base_7b,HFv1 ARC,50.94,,hf_open_llm_v1_240829_frozen.csv llama_base_7b,HFv1 GSM8K,3.56,,hf_open_llm_v1_240829_frozen.csv llama_base_7b,HFv1 HellaSwag,77.8,,hf_open_llm_v1_240829_frozen.csv llama_base_7b,HFv1 MMLU,35.67,,hf_open_llm_v1_240829_frozen.csv llama_base_7b,HFv1 TruthfulQA,34.34,,hf_open_llm_v1_240829_frozen.csv llama_base_7b,HFv1 Winogrande,71.43,,hf_open_llm_v1_240829_frozen.csv llama_megamerge_dare_13b,HF OpenLLM v1,58.15,,hf_open_llm_v1_240829_frozen.csv llama_megamerge_dare_13b,HFv1 ARC,60.58,,hf_open_llm_v1_240829_frozen.csv llama_megamerge_dare_13b,HFv1 GSM8K,28.51,,hf_open_llm_v1_240829_frozen.csv llama_megamerge_dare_13b,HFv1 HellaSwag,83.0,,hf_open_llm_v1_240829_frozen.csv llama_megamerge_dare_13b,HFv1 MMLU,54.91,,hf_open_llm_v1_240829_frozen.csv llama_megamerge_dare_13b,HFv1 TruthfulQA,45.76,,hf_open_llm_v1_240829_frozen.csv llama_megamerge_dare_13b,HFv1 Winogrande,76.16,,hf_open_llm_v1_240829_frozen.csv llama_pile_350b,HF OpenLLM v1,35.0,,hf_open_llm_v1_240829_frozen.csv llama_pile_350b,HFv1 ARC,33.19,,hf_open_llm_v1_240829_frozen.csv llama_pile_350b,HFv1 GSM8K,0.76,,hf_open_llm_v1_240829_frozen.csv llama_pile_350b,HFv1 HellaSwag,56.6,,hf_open_llm_v1_240829_frozen.csv llama_pile_350b,HFv1 MMLU,24.66,,hf_open_llm_v1_240829_frozen.csv llama_pile_350b,HFv1 TruthfulQA,36.28,,hf_open_llm_v1_240829_frozen.csv llama_pile_350b,HFv1 Winogrande,58.48,,hf_open_llm_v1_240829_frozen.csv llama_v2_7b_32kc_security,HF OpenLLM v1,49.19,,hf_open_llm_v1_240829_frozen.csv llama_v2_7b_32kc_security,HFv1 ARC,49.83,,hf_open_llm_v1_240829_frozen.csv llama_v2_7b_32kc_security,HFv1 GSM8K,3.87,,hf_open_llm_v1_240829_frozen.csv llama_v2_7b_32kc_security,HFv1 HellaSwag,77.33,,hf_open_llm_v1_240829_frozen.csv llama_v2_7b_32kc_security,HFv1 MMLU,44.41,,hf_open_llm_v1_240829_frozen.csv llama_v2_7b_32kc_security,HFv1 TruthfulQA,47.96,,hf_open_llm_v1_240829_frozen.csv llama_v2_7b_32kc_security,HFv1 Winogrande,71.74,,hf_open_llm_v1_240829_frozen.csv llamacorn_1_1b_chat,HF OpenLLM v1,36.73,,hf_open_llm_v1_240829_frozen.csv llamacorn_1_1b_chat,HFv1 ARC,33.79,,hf_open_llm_v1_240829_frozen.csv llamacorn_1_1b_chat,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv llamacorn_1_1b_chat,HFv1 HellaSwag,59.24,,hf_open_llm_v1_240829_frozen.csv llamacorn_1_1b_chat,HFv1 MMLU,29.01,,hf_open_llm_v1_240829_frozen.csv llamacorn_1_1b_chat,HFv1 TruthfulQA,36.86,,hf_open_llm_v1_240829_frozen.csv llamacorn_1_1b_chat,HFv1 Winogrande,61.48,,hf_open_llm_v1_240829_frozen.csv llamarada_3_orpo_v2_8b,HF OpenLLM v1,63.53,,hf_open_llm_v1_240829_frozen.csv llamarada_3_orpo_v2_8b,HFv1 ARC,59.9,,hf_open_llm_v1_240829_frozen.csv llamarada_3_orpo_v2_8b,HFv1 GSM8K,47.16,,hf_open_llm_v1_240829_frozen.csv llamarada_3_orpo_v2_8b,HFv1 HellaSwag,82.22,,hf_open_llm_v1_240829_frozen.csv 
llamarada_3_orpo_v2_8b,HFv1 MMLU,66.59,,hf_open_llm_v1_240829_frozen.csv llamarada_3_orpo_v2_8b,HFv1 TruthfulQA,47.67,,hf_open_llm_v1_240829_frozen.csv llamarada_3_orpo_v2_8b,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv llamaragdrama,HF OpenLLM v1,74.65,,hf_open_llm_v1_240829_frozen.csv llamaragdrama,HFv1 ARC,72.01,,hf_open_llm_v1_240829_frozen.csv llamaragdrama,HFv1 GSM8K,65.66,,hf_open_llm_v1_240829_frozen.csv llamaragdrama,HFv1 HellaSwag,88.83,,hf_open_llm_v1_240829_frozen.csv llamaragdrama,HFv1 MMLU,64.5,,hf_open_llm_v1_240829_frozen.csv llamaragdrama,HFv1 TruthfulQA,70.24,,hf_open_llm_v1_240829_frozen.csv llamaragdrama,HFv1 Winogrande,86.66,,hf_open_llm_v1_240829_frozen.csv llamaster_8b_v0_1,HF OpenLLM v1,66.84,,hf_open_llm_v1_240829_frozen.csv llamaster_8b_v0_1,HFv1 ARC,60.92,,hf_open_llm_v1_240829_frozen.csv llamaster_8b_v0_1,HFv1 GSM8K,68.69,,hf_open_llm_v1_240829_frozen.csv llamaster_8b_v0_1,HFv1 HellaSwag,78.33,,hf_open_llm_v1_240829_frozen.csv llamaster_8b_v0_1,HFv1 MMLU,67.11,,hf_open_llm_v1_240829_frozen.csv llamaster_8b_v0_1,HFv1 TruthfulQA,51.55,,hf_open_llm_v1_240829_frozen.csv llamaster_8b_v0_1,HFv1 Winogrande,74.43,,hf_open_llm_v1_240829_frozen.csv llamion_14b_base,HF OpenLLM v1,59.1,,hf_open_llm_v1_240829_frozen.csv llamion_14b_base,HFv1 ARC,55.2,,hf_open_llm_v1_240829_frozen.csv llamion_14b_base,HFv1 GSM8K,37.98,,hf_open_llm_v1_240829_frozen.csv llamion_14b_base,HFv1 HellaSwag,80.57,,hf_open_llm_v1_240829_frozen.csv llamion_14b_base,HFv1 MMLU,67.89,,hf_open_llm_v1_240829_frozen.csv llamion_14b_base,HFv1 TruthfulQA,39.79,,hf_open_llm_v1_240829_frozen.csv llamion_14b_base,HFv1 Winogrande,73.16,,hf_open_llm_v1_240829_frozen.csv llamion_14b_chat,HF OpenLLM v1,58.31,,hf_open_llm_v1_240829_frozen.csv llamion_14b_chat,HFv1 ARC,54.61,,hf_open_llm_v1_240829_frozen.csv llamion_14b_chat,HFv1 GSM8K,36.69,,hf_open_llm_v1_240829_frozen.csv llamion_14b_chat,HFv1 HellaSwag,78.99,,hf_open_llm_v1_240829_frozen.csv llamion_14b_chat,HFv1 MMLU,63.28,,hf_open_llm_v1_240829_frozen.csv llamion_14b_chat,HFv1 TruthfulQA,42.98,,hf_open_llm_v1_240829_frozen.csv llamion_14b_chat,HFv1 Winogrande,73.32,,hf_open_llm_v1_240829_frozen.csv llava_v1_5_7b_vicuna,HF OpenLLM v1,52.28,,hf_open_llm_v1_240829_frozen.csv llava_v1_5_7b_vicuna,HFv1 ARC,52.65,,hf_open_llm_v1_240829_frozen.csv llava_v1_5_7b_vicuna,HFv1 GSM8K,15.31,,hf_open_llm_v1_240829_frozen.csv llava_v1_5_7b_vicuna,HFv1 HellaSwag,76.09,,hf_open_llm_v1_240829_frozen.csv llava_v1_5_7b_vicuna,HFv1 MMLU,51.68,,hf_open_llm_v1_240829_frozen.csv llava_v1_5_7b_vicuna,HFv1 TruthfulQA,45.86,,hf_open_llm_v1_240829_frozen.csv llava_v1_5_7b_vicuna,HFv1 Winogrande,72.06,,hf_open_llm_v1_240829_frozen.csv llm_jp_13b_instruct_full_jaster_dolly_oasst_v1_0,HF OpenLLM v1,31.77,,hf_open_llm_v1_240829_frozen.csv llm_jp_13b_instruct_full_jaster_dolly_oasst_v1_0,HFv1 ARC,26.88,,hf_open_llm_v1_240829_frozen.csv llm_jp_13b_instruct_full_jaster_dolly_oasst_v1_0,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv llm_jp_13b_instruct_full_jaster_dolly_oasst_v1_0,HFv1 HellaSwag,44.78,,hf_open_llm_v1_240829_frozen.csv llm_jp_13b_instruct_full_jaster_dolly_oasst_v1_0,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv llm_jp_13b_instruct_full_jaster_dolly_oasst_v1_0,HFv1 TruthfulQA,45.19,,hf_open_llm_v1_240829_frozen.csv llm_jp_13b_instruct_full_jaster_dolly_oasst_v1_0,HFv1 Winogrande,50.67,,hf_open_llm_v1_240829_frozen.csv llm_jp_13b_instruct_full_jaster_v1_0,HF OpenLLM v1,31.63,,hf_open_llm_v1_240829_frozen.csv llm_jp_13b_instruct_full_jaster_v1_0,HFv1 
ARC,27.22,,hf_open_llm_v1_240829_frozen.csv llm_jp_13b_instruct_full_jaster_v1_0,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv llm_jp_13b_instruct_full_jaster_v1_0,HFv1 HellaSwag,44.7,,hf_open_llm_v1_240829_frozen.csv llm_jp_13b_instruct_full_jaster_v1_0,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv llm_jp_13b_instruct_full_jaster_v1_0,HFv1 TruthfulQA,44.69,,hf_open_llm_v1_240829_frozen.csv llm_jp_13b_instruct_full_jaster_v1_0,HFv1 Winogrande,50.04,,hf_open_llm_v1_240829_frozen.csv llongma_3b_lima,HF OpenLLM v1,38.51,,hf_open_llm_v1_240829_frozen.csv llongma_3b_lima,HFv1 ARC,39.08,,hf_open_llm_v1_240829_frozen.csv llongma_3b_lima,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv llongma_3b_lima,HFv1 HellaSwag,67.15,,hf_open_llm_v1_240829_frozen.csv llongma_3b_lima,HFv1 MMLU,26.43,,hf_open_llm_v1_240829_frozen.csv llongma_3b_lima,HFv1 TruthfulQA,34.71,,hf_open_llm_v1_240829_frozen.csv llongma_3b_lima,HFv1 Winogrande,63.38,,hf_open_llm_v1_240829_frozen.csv llongorca_7b_16k,HF OpenLLM v1,53.02,,hf_open_llm_v1_240829_frozen.csv llongorca_7b_16k,HFv1 ARC,57.51,,hf_open_llm_v1_240829_frozen.csv llongorca_7b_16k,HFv1 GSM8K,7.51,,hf_open_llm_v1_240829_frozen.csv llongorca_7b_16k,HFv1 HellaSwag,79.44,,hf_open_llm_v1_240829_frozen.csv llongorca_7b_16k,HFv1 MMLU,49.35,,hf_open_llm_v1_240829_frozen.csv llongorca_7b_16k,HFv1 TruthfulQA,49.84,,hf_open_llm_v1_240829_frozen.csv llongorca_7b_16k,HFv1 Winogrande,74.51,,hf_open_llm_v1_240829_frozen.csv losslessmegacoder_llama2_13b_mini,HF OpenLLM v1,56.92,,hf_open_llm_v1_240829_frozen.csv losslessmegacoder_llama2_13b_mini,HFv1 ARC,60.58,,hf_open_llm_v1_240829_frozen.csv losslessmegacoder_llama2_13b_mini,HFv1 GSM8K,15.92,,hf_open_llm_v1_240829_frozen.csv losslessmegacoder_llama2_13b_mini,HFv1 HellaSwag,81.26,,hf_open_llm_v1_240829_frozen.csv losslessmegacoder_llama2_13b_mini,HFv1 MMLU,57.92,,hf_open_llm_v1_240829_frozen.csv losslessmegacoder_llama2_13b_mini,HFv1 TruthfulQA,48.89,,hf_open_llm_v1_240829_frozen.csv losslessmegacoder_llama2_13b_mini,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv losslessmegacoder_llama2_7b_mini,HF OpenLLM v1,51.66,,hf_open_llm_v1_240829_frozen.csv losslessmegacoder_llama2_7b_mini,HFv1 ARC,53.5,,hf_open_llm_v1_240829_frozen.csv losslessmegacoder_llama2_7b_mini,HFv1 GSM8K,9.55,,hf_open_llm_v1_240829_frozen.csv losslessmegacoder_llama2_7b_mini,HFv1 HellaSwag,77.38,,hf_open_llm_v1_240829_frozen.csv losslessmegacoder_llama2_7b_mini,HFv1 MMLU,49.72,,hf_open_llm_v1_240829_frozen.csv losslessmegacoder_llama2_7b_mini,HFv1 TruthfulQA,45.77,,hf_open_llm_v1_240829_frozen.csv losslessmegacoder_llama2_7b_mini,HFv1 Winogrande,74.03,,hf_open_llm_v1_240829_frozen.csv loyal_piano_m7,HF OpenLLM v1,68.67,,hf_open_llm_v1_240829_frozen.csv loyal_piano_m7,HFv1 ARC,66.72,,hf_open_llm_v1_240829_frozen.csv loyal_piano_m7,HFv1 GSM8K,56.71,,hf_open_llm_v1_240829_frozen.csv loyal_piano_m7,HFv1 HellaSwag,85.03,,hf_open_llm_v1_240829_frozen.csv loyal_piano_m7,HFv1 MMLU,64.43,,hf_open_llm_v1_240829_frozen.csv loyal_piano_m7,HFv1 TruthfulQA,60.03,,hf_open_llm_v1_240829_frozen.csv loyal_piano_m7,HFv1 Winogrande,79.08,,hf_open_llm_v1_240829_frozen.csv loyal_piano_m7_cdpo,HF OpenLLM v1,69.08,,hf_open_llm_v1_240829_frozen.csv loyal_piano_m7_cdpo,HFv1 ARC,67.15,,hf_open_llm_v1_240829_frozen.csv loyal_piano_m7_cdpo,HFv1 GSM8K,56.48,,hf_open_llm_v1_240829_frozen.csv loyal_piano_m7_cdpo,HFv1 HellaSwag,85.39,,hf_open_llm_v1_240829_frozen.csv loyal_piano_m7_cdpo,HFv1 MMLU,64.52,,hf_open_llm_v1_240829_frozen.csv loyal_piano_m7_cdpo,HFv1 
TruthfulQA,61.53,,hf_open_llm_v1_240829_frozen.csv loyal_piano_m7_cdpo,HFv1 Winogrande,79.4,,hf_open_llm_v1_240829_frozen.csv lr_experiment1_7b,HF OpenLLM v1,62.77,,hf_open_llm_v1_240829_frozen.csv lr_experiment1_7b,HFv1 ARC,60.75,,hf_open_llm_v1_240829_frozen.csv lr_experiment1_7b,HFv1 GSM8K,45.34,,hf_open_llm_v1_240829_frozen.csv lr_experiment1_7b,HFv1 HellaSwag,83.73,,hf_open_llm_v1_240829_frozen.csv lr_experiment1_7b,HFv1 MMLU,63.25,,hf_open_llm_v1_240829_frozen.csv lr_experiment1_7b,HFv1 TruthfulQA,44.07,,hf_open_llm_v1_240829_frozen.csv lr_experiment1_7b,HFv1 Winogrande,79.48,,hf_open_llm_v1_240829_frozen.csv luban_marcoroni_13b,HF OpenLLM v1,57.98,,hf_open_llm_v1_240829_frozen.csv luban_marcoroni_13b,HFv1 ARC,63.65,,hf_open_llm_v1_240829_frozen.csv luban_marcoroni_13b,HFv1 GSM8K,10.01,,hf_open_llm_v1_240829_frozen.csv luban_marcoroni_13b,HFv1 HellaSwag,82.92,,hf_open_llm_v1_240829_frozen.csv luban_marcoroni_13b,HFv1 MMLU,58.7,,hf_open_llm_v1_240829_frozen.csv luban_marcoroni_13b,HFv1 TruthfulQA,55.55,,hf_open_llm_v1_240829_frozen.csv luban_marcoroni_13b,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv luban_marcoroni_13b_v2,HF OpenLLM v1,57.92,,hf_open_llm_v1_240829_frozen.csv luban_marcoroni_13b_v2,HFv1 ARC,63.48,,hf_open_llm_v1_240829_frozen.csv luban_marcoroni_13b_v2,HFv1 GSM8K,9.93,,hf_open_llm_v1_240829_frozen.csv luban_marcoroni_13b_v2,HFv1 HellaSwag,82.89,,hf_open_llm_v1_240829_frozen.csv luban_marcoroni_13b_v2,HFv1 MMLU,58.72,,hf_open_llm_v1_240829_frozen.csv luban_marcoroni_13b_v2,HFv1 TruthfulQA,55.56,,hf_open_llm_v1_240829_frozen.csv luban_marcoroni_13b_v2,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv luban_marcoroni_13b_v3,HF OpenLLM v1,57.94,,hf_open_llm_v1_240829_frozen.csv luban_marcoroni_13b_v3,HFv1 ARC,63.74,,hf_open_llm_v1_240829_frozen.csv luban_marcoroni_13b_v3,HFv1 GSM8K,9.93,,hf_open_llm_v1_240829_frozen.csv luban_marcoroni_13b_v3,HFv1 HellaSwag,82.88,,hf_open_llm_v1_240829_frozen.csv luban_marcoroni_13b_v3,HFv1 MMLU,58.64,,hf_open_llm_v1_240829_frozen.csv luban_marcoroni_13b_v3,HFv1 TruthfulQA,55.56,,hf_open_llm_v1_240829_frozen.csv luban_marcoroni_13b_v3,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv luban_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,55.34,,hf_open_llm_v1_240829_frozen.csv luban_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,60.24,,hf_open_llm_v1_240829_frozen.csv luban_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv luban_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,82.22,,hf_open_llm_v1_240829_frozen.csv luban_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,58.03,,hf_open_llm_v1_240829_frozen.csv luban_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,55.26,,hf_open_llm_v1_240829_frozen.csv luban_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,75.37,,hf_open_llm_v1_240829_frozen.csv lucie_7b_v0_2_16bit,HF OpenLLM v1,65.3,,hf_open_llm_v1_240829_frozen.csv lucie_7b_v0_2_16bit,HFv1 ARC,62.12,,hf_open_llm_v1_240829_frozen.csv lucie_7b_v0_2_16bit,HFv1 GSM8K,39.88,,hf_open_llm_v1_240829_frozen.csv lucie_7b_v0_2_16bit,HFv1 HellaSwag,84.83,,hf_open_llm_v1_240829_frozen.csv lucie_7b_v0_2_16bit,HFv1 MMLU,60.45,,hf_open_llm_v1_240829_frozen.csv lucie_7b_v0_2_16bit,HFv1 TruthfulQA,67.65,,hf_open_llm_v1_240829_frozen.csv lucie_7b_v0_2_16bit,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv lumosia_moe_4x10_7,HF OpenLLM v1,69.61,,hf_open_llm_v1_240829_frozen.csv lumosia_moe_4x10_7,HFv1 ARC,68.34,,hf_open_llm_v1_240829_frozen.csv lumosia_moe_4x10_7,HFv1 GSM8K,51.02,,hf_open_llm_v1_240829_frozen.csv 
lumosia_moe_4x10_7,HFv1 HellaSwag,87.13,,hf_open_llm_v1_240829_frozen.csv lumosia_moe_4x10_7,HFv1 MMLU,64.38,,hf_open_llm_v1_240829_frozen.csv lumosia_moe_4x10_7,HFv1 TruthfulQA,63.81,,hf_open_llm_v1_240829_frozen.csv lumosia_moe_4x10_7,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv m_b_4_32,HF OpenLLM v1,60.02,,hf_open_llm_v1_240829_frozen.csv m_b_4_32,HFv1 ARC,59.56,,hf_open_llm_v1_240829_frozen.csv m_b_4_32,HFv1 GSM8K,30.78,,hf_open_llm_v1_240829_frozen.csv m_b_4_32,HFv1 HellaSwag,83.03,,hf_open_llm_v1_240829_frozen.csv m_b_4_32,HFv1 MMLU,60.9,,hf_open_llm_v1_240829_frozen.csv m_b_4_32,HFv1 TruthfulQA,49.29,,hf_open_llm_v1_240829_frozen.csv m_b_4_32,HFv1 Winogrande,76.56,,hf_open_llm_v1_240829_frozen.csv malayalam_llama_7b_instruct_v0_1,HF OpenLLM v1,39.51,,hf_open_llm_v1_240829_frozen.csv malayalam_llama_7b_instruct_v0_1,HFv1 ARC,37.03,,hf_open_llm_v1_240829_frozen.csv malayalam_llama_7b_instruct_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv malayalam_llama_7b_instruct_v0_1,HFv1 HellaSwag,67.75,,hf_open_llm_v1_240829_frozen.csv malayalam_llama_7b_instruct_v0_1,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv malayalam_llama_7b_instruct_v0_1,HFv1 TruthfulQA,47.05,,hf_open_llm_v1_240829_frozen.csv malayalam_llama_7b_instruct_v0_1,HFv1 Winogrande,62.12,,hf_open_llm_v1_240829_frozen.csv marcoro14_7b_slerp,HF OpenLLM v1,63.08,,hf_open_llm_v1_240829_frozen.csv marcoro14_7b_slerp,HFv1 ARC,63.4,,hf_open_llm_v1_240829_frozen.csv marcoro14_7b_slerp,HFv1 GSM8K,40.18,,hf_open_llm_v1_240829_frozen.csv marcoro14_7b_slerp,HFv1 HellaSwag,83.77,,hf_open_llm_v1_240829_frozen.csv marcoro14_7b_slerp,HFv1 MMLU,61.18,,hf_open_llm_v1_240829_frozen.csv marcoro14_7b_slerp,HFv1 TruthfulQA,52.08,,hf_open_llm_v1_240829_frozen.csv marcoro14_7b_slerp,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv master_yi_9b,HF OpenLLM v1,67.44,,hf_open_llm_v1_240829_frozen.csv master_yi_9b,HFv1 ARC,61.43,,hf_open_llm_v1_240829_frozen.csv master_yi_9b,HFv1 GSM8K,65.5,,hf_open_llm_v1_240829_frozen.csv master_yi_9b,HFv1 HellaSwag,79.75,,hf_open_llm_v1_240829_frozen.csv master_yi_9b,HFv1 MMLU,71.64,,hf_open_llm_v1_240829_frozen.csv master_yi_9b,HFv1 TruthfulQA,48.55,,hf_open_llm_v1_240829_frozen.csv master_yi_9b,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv mathhermes_2_5_mistral_7b,HF OpenLLM v1,65.24,,hf_open_llm_v1_240829_frozen.csv mathhermes_2_5_mistral_7b,HFv1 ARC,64.76,,hf_open_llm_v1_240829_frozen.csv mathhermes_2_5_mistral_7b,HFv1 GSM8K,49.28,,hf_open_llm_v1_240829_frozen.csv mathhermes_2_5_mistral_7b,HFv1 HellaSwag,84.19,,hf_open_llm_v1_240829_frozen.csv mathhermes_2_5_mistral_7b,HFv1 MMLU,63.59,,hf_open_llm_v1_240829_frozen.csv mathhermes_2_5_mistral_7b,HFv1 TruthfulQA,51.95,,hf_open_llm_v1_240829_frozen.csv mathhermes_2_5_mistral_7b,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b,HF OpenLLM v1,63.39,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b,HFv1 ARC,61.77,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b,HFv1 GSM8K,53.75,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b,HFv1 HellaSwag,82.14,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b,HFv1 MMLU,62.42,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b,HFv1 TruthfulQA,42.44,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b,HFv1 Winogrande,77.82,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_boost,HF OpenLLM v1,63.22,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_boost,HFv1 ARC,62.63,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_boost,HFv1 GSM8K,42.61,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_boost,HFv1 
HellaSwag,81.51,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_boost,HFv1 MMLU,61.97,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_boost,HFv1 TruthfulQA,54.7,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_boost,HFv1 Winogrande,75.93,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_boost_dpo,HF OpenLLM v1,65.99,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_boost_dpo,HFv1 ARC,65.02,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_boost_dpo,HFv1 GSM8K,50.04,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_boost_dpo,HFv1 HellaSwag,83.08,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_boost_dpo,HFv1 MMLU,61.87,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_boost_dpo,HFv1 TruthfulQA,60.29,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_boost_dpo,HFv1 Winogrande,75.61,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_boost_dpo_preview,HF OpenLLM v1,65.77,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_boost_dpo_preview,HFv1 ARC,64.59,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_boost_dpo_preview,HFv1 GSM8K,50.42,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_boost_dpo_preview,HFv1 HellaSwag,82.87,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_boost_dpo_preview,HFv1 MMLU,62.02,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_boost_dpo_preview,HFv1 TruthfulQA,58.86,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_boost_dpo_preview,HFv1 Winogrande,75.85,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_dpo_preview,HF OpenLLM v1,64.87,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_dpo_preview,HFv1 ARC,62.71,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_dpo_preview,HFv1 GSM8K,56.18,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_dpo_preview,HFv1 HellaSwag,82.99,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_dpo_preview,HFv1 MMLU,62.7,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_dpo_preview,HFv1 TruthfulQA,45.79,,hf_open_llm_v1_240829_frozen.csv matter_0_1_7b_dpo_preview,HFv1 Winogrande,78.85,,hf_open_llm_v1_240829_frozen.csv matter_0_2_7b,HF OpenLLM v1,64.67,,hf_open_llm_v1_240829_frozen.csv matter_0_2_7b,HFv1 ARC,61.6,,hf_open_llm_v1_240829_frozen.csv matter_0_2_7b,HFv1 GSM8K,53.9,,hf_open_llm_v1_240829_frozen.csv matter_0_2_7b,HFv1 HellaSwag,82.39,,hf_open_llm_v1_240829_frozen.csv matter_0_2_7b,HFv1 MMLU,62.51,,hf_open_llm_v1_240829_frozen.csv matter_0_2_7b,HFv1 TruthfulQA,48.11,,hf_open_llm_v1_240829_frozen.csv matter_0_2_7b,HFv1 Winogrande,79.48,,hf_open_llm_v1_240829_frozen.csv matter_0_2_7b_dpo,HF OpenLLM v1,66.15,,hf_open_llm_v1_240829_frozen.csv matter_0_2_7b_dpo,HFv1 ARC,63.31,,hf_open_llm_v1_240829_frozen.csv matter_0_2_7b_dpo,HFv1 GSM8K,56.94,,hf_open_llm_v1_240829_frozen.csv matter_0_2_7b_dpo,HFv1 HellaSwag,83.16,,hf_open_llm_v1_240829_frozen.csv matter_0_2_7b_dpo,HFv1 MMLU,62.9,,hf_open_llm_v1_240829_frozen.csv matter_0_2_7b_dpo,HFv1 TruthfulQA,50.3,,hf_open_llm_v1_240829_frozen.csv matter_0_2_7b_dpo,HFv1 Winogrande,80.27,,hf_open_llm_v1_240829_frozen.csv mc_data_30k_from_platpus_orca_7b_10k_v1_lora_qkvo_rank14_v2,HF OpenLLM v1,52.13,,hf_open_llm_v1_240829_frozen.csv mc_data_30k_from_platpus_orca_7b_10k_v1_lora_qkvo_rank14_v2,HFv1 ARC,57.17,,hf_open_llm_v1_240829_frozen.csv mc_data_30k_from_platpus_orca_7b_10k_v1_lora_qkvo_rank14_v2,HFv1 GSM8K,0.38,,hf_open_llm_v1_240829_frozen.csv mc_data_30k_from_platpus_orca_7b_10k_v1_lora_qkvo_rank14_v2,HFv1 HellaSwag,79.57,,hf_open_llm_v1_240829_frozen.csv mc_data_30k_from_platpus_orca_7b_10k_v1_lora_qkvo_rank14_v2,HFv1 MMLU,50.24,,hf_open_llm_v1_240829_frozen.csv mc_data_30k_from_platpus_orca_7b_10k_v1_lora_qkvo_rank14_v2,HFv1 
TruthfulQA,52.51,,hf_open_llm_v1_240829_frozen.csv mc_data_30k_from_platpus_orca_7b_10k_v1_lora_qkvo_rank14_v2,HFv1 Winogrande,72.93,,hf_open_llm_v1_240829_frozen.csv mc_model_v1,HF OpenLLM v1,36.06,,hf_open_llm_v1_240829_frozen.csv mc_model_v1,HFv1 ARC,32.94,,hf_open_llm_v1_240829_frozen.csv mc_model_v1,HFv1 GSM8K,1.36,,hf_open_llm_v1_240829_frozen.csv mc_model_v1,HFv1 HellaSwag,47.69,,hf_open_llm_v1_240829_frozen.csv mc_model_v1,HFv1 MMLU,31.9,,hf_open_llm_v1_240829_frozen.csv mc_model_v1,HFv1 TruthfulQA,43.53,,hf_open_llm_v1_240829_frozen.csv mc_model_v1,HFv1 Winogrande,58.96,,hf_open_llm_v1_240829_frozen.csv medchator_2x7b,HF OpenLLM v1,58.13,,hf_open_llm_v1_240829_frozen.csv medchator_2x7b,HFv1 ARC,57.59,,hf_open_llm_v1_240829_frozen.csv medchator_2x7b,HFv1 GSM8K,32.83,,hf_open_llm_v1_240829_frozen.csv medchator_2x7b,HFv1 HellaSwag,78.14,,hf_open_llm_v1_240829_frozen.csv medchator_2x7b,HFv1 MMLU,56.13,,hf_open_llm_v1_240829_frozen.csv medchator_2x7b,HFv1 TruthfulQA,48.77,,hf_open_llm_v1_240829_frozen.csv medchator_2x7b,HFv1 Winogrande,75.3,,hf_open_llm_v1_240829_frozen.csv medes_7b,HF OpenLLM v1,72.11,,hf_open_llm_v1_240829_frozen.csv medes_7b,HFv1 ARC,69.2,,hf_open_llm_v1_240829_frozen.csv medes_7b,HFv1 GSM8K,65.05,,hf_open_llm_v1_240829_frozen.csv medes_7b,HFv1 HellaSwag,86.84,,hf_open_llm_v1_240829_frozen.csv medes_7b,HFv1 MMLU,64.91,,hf_open_llm_v1_240829_frozen.csv medes_7b,HFv1 TruthfulQA,64.42,,hf_open_llm_v1_240829_frozen.csv medes_7b,HFv1 Winogrande,82.24,,hf_open_llm_v1_240829_frozen.csv megachat,HF OpenLLM v1,34.75,,hf_open_llm_v1_240829_frozen.csv megachat,HFv1 ARC,30.8,,hf_open_llm_v1_240829_frozen.csv megachat,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv megachat,HFv1 HellaSwag,54.35,,hf_open_llm_v1_240829_frozen.csv megachat,HFv1 MMLU,25.55,,hf_open_llm_v1_240829_frozen.csv megachat,HFv1 TruthfulQA,39.85,,hf_open_llm_v1_240829_frozen.csv megachat,HFv1 Winogrande,56.99,,hf_open_llm_v1_240829_frozen.csv megatron_gpt2_345m,HF OpenLLM v1,30.4,,hf_open_llm_v1_240829_frozen.csv megatron_gpt2_345m,HFv1 ARC,24.23,,hf_open_llm_v1_240829_frozen.csv megatron_gpt2_345m,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv megatron_gpt2_345m,HFv1 HellaSwag,39.18,,hf_open_llm_v1_240829_frozen.csv megatron_gpt2_345m,HFv1 MMLU,24.32,,hf_open_llm_v1_240829_frozen.csv megatron_gpt2_345m,HFv1 TruthfulQA,41.51,,hf_open_llm_v1_240829_frozen.csv megatron_gpt2_345m,HFv1 Winogrande,52.96,,hf_open_llm_v1_240829_frozen.csv megatron_gpt2_345m_evol_instruct_v2,HF OpenLLM v1,30.31,,hf_open_llm_v1_240829_frozen.csv megatron_gpt2_345m_evol_instruct_v2,HFv1 ARC,26.37,,hf_open_llm_v1_240829_frozen.csv megatron_gpt2_345m_evol_instruct_v2,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv megatron_gpt2_345m_evol_instruct_v2,HFv1 HellaSwag,38.39,,hf_open_llm_v1_240829_frozen.csv megatron_gpt2_345m_evol_instruct_v2,HFv1 MMLU,23.6,,hf_open_llm_v1_240829_frozen.csv megatron_gpt2_345m_evol_instruct_v2,HFv1 TruthfulQA,41.19,,hf_open_llm_v1_240829_frozen.csv megatron_gpt2_345m_evol_instruct_v2,HFv1 Winogrande,52.33,,hf_open_llm_v1_240829_frozen.csv melangea_70b,HF OpenLLM v1,62.82,,hf_open_llm_v1_240829_frozen.csv melangea_70b,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv melangea_70b,HFv1 GSM8K,5.69,,hf_open_llm_v1_240829_frozen.csv melangea_70b,HFv1 HellaSwag,87.3,,hf_open_llm_v1_240829_frozen.csv melangea_70b,HFv1 MMLU,70.56,,hf_open_llm_v1_240829_frozen.csv melangea_70b,HFv1 TruthfulQA,60.61,,hf_open_llm_v1_240829_frozen.csv melangea_70b,HFv1 Winogrande,81.53,,hf_open_llm_v1_240829_frozen.csv 
melangeb_70b,HF OpenLLM v1,67.12,,hf_open_llm_v1_240829_frozen.csv melangeb_70b,HFv1 ARC,71.67,,hf_open_llm_v1_240829_frozen.csv melangeb_70b,HFv1 GSM8K,30.63,,hf_open_llm_v1_240829_frozen.csv melangeb_70b,HFv1 HellaSwag,87.5,,hf_open_llm_v1_240829_frozen.csv melangeb_70b,HFv1 MMLU,70.03,,hf_open_llm_v1_240829_frozen.csv melangeb_70b,HFv1 TruthfulQA,59.36,,hf_open_llm_v1_240829_frozen.csv melangeb_70b,HFv1 Winogrande,83.5,,hf_open_llm_v1_240829_frozen.csv melangec_70b,HF OpenLLM v1,61.96,,hf_open_llm_v1_240829_frozen.csv melangec_70b,HFv1 ARC,71.67,,hf_open_llm_v1_240829_frozen.csv melangec_70b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv melangec_70b,HFv1 HellaSwag,87.6,,hf_open_llm_v1_240829_frozen.csv melangec_70b,HFv1 MMLU,70.37,,hf_open_llm_v1_240829_frozen.csv melangec_70b,HFv1 TruthfulQA,58.13,,hf_open_llm_v1_240829_frozen.csv melangec_70b,HFv1 Winogrande,83.98,,hf_open_llm_v1_240829_frozen.csv merge_dolly_v2_3b_dpo_test,HF OpenLLM v1,37.49,,hf_open_llm_v1_240829_frozen.csv merge_dolly_v2_3b_dpo_test,HFv1 ARC,40.02,,hf_open_llm_v1_240829_frozen.csv merge_dolly_v2_3b_dpo_test,HFv1 GSM8K,2.12,,hf_open_llm_v1_240829_frozen.csv merge_dolly_v2_3b_dpo_test,HFv1 HellaSwag,65.14,,hf_open_llm_v1_240829_frozen.csv merge_dolly_v2_3b_dpo_test,HFv1 MMLU,24.99,,hf_open_llm_v1_240829_frozen.csv merge_dolly_v2_3b_dpo_test,HFv1 TruthfulQA,33.3,,hf_open_llm_v1_240829_frozen.csv merge_dolly_v2_3b_dpo_test,HFv1 Winogrande,59.35,,hf_open_llm_v1_240829_frozen.csv merged_dpo_7b,HF OpenLLM v1,68.06,,hf_open_llm_v1_240829_frozen.csv merged_dpo_7b,HFv1 ARC,68.94,,hf_open_llm_v1_240829_frozen.csv merged_dpo_7b,HFv1 GSM8K,45.19,,hf_open_llm_v1_240829_frozen.csv merged_dpo_7b,HFv1 HellaSwag,87.75,,hf_open_llm_v1_240829_frozen.csv merged_dpo_7b,HFv1 MMLU,55.35,,hf_open_llm_v1_240829_frozen.csv merged_dpo_7b,HFv1 TruthfulQA,72.76,,hf_open_llm_v1_240829_frozen.csv merged_dpo_7b,HFv1 Winogrande,78.37,,hf_open_llm_v1_240829_frozen.csv mergetrix_7b,HF OpenLLM v1,74.33,,hf_open_llm_v1_240829_frozen.csv mergetrix_7b,HFv1 ARC,72.27,,hf_open_llm_v1_240829_frozen.csv mergetrix_7b,HFv1 GSM8K,71.19,,hf_open_llm_v1_240829_frozen.csv mergetrix_7b,HFv1 HellaSwag,87.84,,hf_open_llm_v1_240829_frozen.csv mergetrix_7b,HFv1 MMLU,64.88,,hf_open_llm_v1_240829_frozen.csv mergetrix_7b,HFv1 TruthfulQA,66.27,,hf_open_llm_v1_240829_frozen.csv mergetrix_7b,HFv1 Winogrande,83.5,,hf_open_llm_v1_240829_frozen.csv merlinite_7b,HF OpenLLM v1,64.0,,hf_open_llm_v1_240829_frozen.csv merlinite_7b,HFv1 ARC,63.65,,hf_open_llm_v1_240829_frozen.csv merlinite_7b,HFv1 GSM8K,41.09,,hf_open_llm_v1_240829_frozen.csv merlinite_7b,HFv1 HellaSwag,84.52,,hf_open_llm_v1_240829_frozen.csv merlinite_7b,HFv1 MMLU,64.91,,hf_open_llm_v1_240829_frozen.csv merlinite_7b,HFv1 TruthfulQA,50.15,,hf_open_llm_v1_240829_frozen.csv merlinite_7b,HFv1 Winogrande,79.72,,hf_open_llm_v1_240829_frozen.csv mermaid_7b_ties,HF OpenLLM v1,65.22,,hf_open_llm_v1_240829_frozen.csv mermaid_7b_ties,HFv1 ARC,64.51,,hf_open_llm_v1_240829_frozen.csv mermaid_7b_ties,HFv1 GSM8K,47.76,,hf_open_llm_v1_240829_frozen.csv mermaid_7b_ties,HFv1 HellaSwag,85.17,,hf_open_llm_v1_240829_frozen.csv mermaid_7b_ties,HFv1 MMLU,64.34,,hf_open_llm_v1_240829_frozen.csv mermaid_7b_ties,HFv1 TruthfulQA,50.05,,hf_open_llm_v1_240829_frozen.csv mermaid_7b_ties,HFv1 Winogrande,79.48,,hf_open_llm_v1_240829_frozen.csv metamath_bagel_dpo_34b,HF OpenLLM v1,74.8,,hf_open_llm_v1_240829_frozen.csv metamath_bagel_dpo_34b,HFv1 ARC,68.17,,hf_open_llm_v1_240829_frozen.csv metamath_bagel_dpo_34b,HFv1 
GSM8K,72.18,,hf_open_llm_v1_240829_frozen.csv metamath_bagel_dpo_34b,HFv1 HellaSwag,84.23,,hf_open_llm_v1_240829_frozen.csv metamath_bagel_dpo_34b,HFv1 MMLU,76.54,,hf_open_llm_v1_240829_frozen.csv metamath_bagel_dpo_34b,HFv1 TruthfulQA,65.44,,hf_open_llm_v1_240829_frozen.csv metamath_bagel_dpo_34b,HFv1 Winogrande,82.24,,hf_open_llm_v1_240829_frozen.csv metamodel_moe,HF OpenLLM v1,74.42,,hf_open_llm_v1_240829_frozen.csv metamodel_moe,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv metamodel_moe,HFv1 GSM8K,65.43,,hf_open_llm_v1_240829_frozen.csv metamodel_moe,HFv1 HellaSwag,88.4,,hf_open_llm_v1_240829_frozen.csv metamodel_moe,HFv1 MMLU,66.31,,hf_open_llm_v1_240829_frozen.csv metamodel_moe,HFv1 TruthfulQA,71.86,,hf_open_llm_v1_240829_frozen.csv metamodel_moe,HFv1 Winogrande,83.5,,hf_open_llm_v1_240829_frozen.csv metamodel_moe_multilingualv1,HF OpenLLM v1,69.33,,hf_open_llm_v1_240829_frozen.csv metamodel_moe_multilingualv1,HFv1 ARC,67.58,,hf_open_llm_v1_240829_frozen.csv metamodel_moe_multilingualv1,HFv1 GSM8K,61.33,,hf_open_llm_v1_240829_frozen.csv metamodel_moe_multilingualv1,HFv1 HellaSwag,84.73,,hf_open_llm_v1_240829_frozen.csv metamodel_moe_multilingualv1,HFv1 MMLU,63.93,,hf_open_llm_v1_240829_frozen.csv metamodel_moe_multilingualv1,HFv1 TruthfulQA,61.23,,hf_open_llm_v1_240829_frozen.csv metamodel_moe_multilingualv1,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv metamodel_moex8,HF OpenLLM v1,74.39,,hf_open_llm_v1_240829_frozen.csv metamodel_moex8,HFv1 ARC,71.16,,hf_open_llm_v1_240829_frozen.csv metamodel_moex8,HFv1 GSM8K,65.35,,hf_open_llm_v1_240829_frozen.csv metamodel_moex8,HFv1 HellaSwag,88.38,,hf_open_llm_v1_240829_frozen.csv metamodel_moex8,HFv1 MMLU,66.29,,hf_open_llm_v1_240829_frozen.csv metamodel_moex8,HFv1 TruthfulQA,71.91,,hf_open_llm_v1_240829_frozen.csv metamodel_moex8,HFv1 Winogrande,83.27,,hf_open_llm_v1_240829_frozen.csv metamodelv3,HF OpenLLM v1,74.39,,hf_open_llm_v1_240829_frozen.csv metamodelv3,HFv1 ARC,71.16,,hf_open_llm_v1_240829_frozen.csv metamodelv3,HFv1 GSM8K,65.28,,hf_open_llm_v1_240829_frozen.csv metamodelv3,HFv1 HellaSwag,88.39,,hf_open_llm_v1_240829_frozen.csv metamodelv3,HFv1 MMLU,66.32,,hf_open_llm_v1_240829_frozen.csv metamodelv3,HFv1 TruthfulQA,71.86,,hf_open_llm_v1_240829_frozen.csv metamodelv3,HFv1 Winogrande,83.35,,hf_open_llm_v1_240829_frozen.csv metis_0_1,HF OpenLLM v1,60.02,,hf_open_llm_v1_240829_frozen.csv metis_0_1,HFv1 ARC,60.15,,hf_open_llm_v1_240829_frozen.csv metis_0_1,HFv1 GSM8K,33.21,,hf_open_llm_v1_240829_frozen.csv metis_0_1,HFv1 HellaSwag,82.85,,hf_open_llm_v1_240829_frozen.csv metis_0_1,HFv1 MMLU,61.42,,hf_open_llm_v1_240829_frozen.csv metis_0_1,HFv1 TruthfulQA,45.24,,hf_open_llm_v1_240829_frozen.csv metis_0_1,HFv1 Winogrande,77.27,,hf_open_llm_v1_240829_frozen.csv metis_0_3,HF OpenLLM v1,65.44,,hf_open_llm_v1_240829_frozen.csv metis_0_3,HFv1 ARC,62.71,,hf_open_llm_v1_240829_frozen.csv metis_0_3,HFv1 GSM8K,39.35,,hf_open_llm_v1_240829_frozen.csv metis_0_3,HFv1 HellaSwag,84.8,,hf_open_llm_v1_240829_frozen.csv metis_0_3,HFv1 MMLU,60.92,,hf_open_llm_v1_240829_frozen.csv metis_0_3,HFv1 TruthfulQA,67.56,,hf_open_llm_v1_240829_frozen.csv metis_0_3,HFv1 Winogrande,77.27,,hf_open_llm_v1_240829_frozen.csv metis_0_3_merged,HF OpenLLM v1,61.34,,hf_open_llm_v1_240829_frozen.csv metis_0_3_merged,HFv1 ARC,62.2,,hf_open_llm_v1_240829_frozen.csv metis_0_3_merged,HFv1 GSM8K,21.83,,hf_open_llm_v1_240829_frozen.csv metis_0_3_merged,HFv1 HellaSwag,84.0,,hf_open_llm_v1_240829_frozen.csv metis_0_3_merged,HFv1 
MMLU,62.65,,hf_open_llm_v1_240829_frozen.csv metis_0_3_merged,HFv1 TruthfulQA,59.24,,hf_open_llm_v1_240829_frozen.csv metis_0_3_merged,HFv1 Winogrande,78.14,,hf_open_llm_v1_240829_frozen.csv metis_0_4,HF OpenLLM v1,61.34,,hf_open_llm_v1_240829_frozen.csv metis_0_4,HFv1 ARC,62.29,,hf_open_llm_v1_240829_frozen.csv metis_0_4,HFv1 GSM8K,22.21,,hf_open_llm_v1_240829_frozen.csv metis_0_4,HFv1 HellaSwag,84.0,,hf_open_llm_v1_240829_frozen.csv metis_0_4,HFv1 MMLU,62.7,,hf_open_llm_v1_240829_frozen.csv metis_0_4,HFv1 TruthfulQA,59.24,,hf_open_llm_v1_240829_frozen.csv metis_0_4,HFv1 Winogrande,78.14,,hf_open_llm_v1_240829_frozen.csv metis_0_5,HF OpenLLM v1,62.65,,hf_open_llm_v1_240829_frozen.csv metis_0_5,HFv1 ARC,62.63,,hf_open_llm_v1_240829_frozen.csv metis_0_5,HFv1 GSM8K,42.91,,hf_open_llm_v1_240829_frozen.csv metis_0_5,HFv1 HellaSwag,83.77,,hf_open_llm_v1_240829_frozen.csv metis_0_5,HFv1 MMLU,62.16,,hf_open_llm_v1_240829_frozen.csv metis_0_5,HFv1 TruthfulQA,49.33,,hf_open_llm_v1_240829_frozen.csv metis_0_5,HFv1 Winogrande,75.14,,hf_open_llm_v1_240829_frozen.csv mgpt,HF OpenLLM v1,27.61,,hf_open_llm_v1_240829_frozen.csv mgpt,HFv1 ARC,23.81,,hf_open_llm_v1_240829_frozen.csv mgpt,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv mgpt,HFv1 HellaSwag,26.37,,hf_open_llm_v1_240829_frozen.csv mgpt,HFv1 MMLU,25.17,,hf_open_llm_v1_240829_frozen.csv mgpt,HFv1 TruthfulQA,39.62,,hf_open_llm_v1_240829_frozen.csv mgpt,HFv1 Winogrande,50.67,,hf_open_llm_v1_240829_frozen.csv mhm_7b_v1_3,HF OpenLLM v1,47.29,,hf_open_llm_v1_240829_frozen.csv mhm_7b_v1_3,HFv1 ARC,47.53,,hf_open_llm_v1_240829_frozen.csv mhm_7b_v1_3,HFv1 GSM8K,16.68,,hf_open_llm_v1_240829_frozen.csv mhm_7b_v1_3,HFv1 HellaSwag,65.31,,hf_open_llm_v1_240829_frozen.csv mhm_7b_v1_3,HFv1 MMLU,45.74,,hf_open_llm_v1_240829_frozen.csv mhm_7b_v1_3,HFv1 TruthfulQA,46.22,,hf_open_llm_v1_240829_frozen.csv mhm_7b_v1_3,HFv1 Winogrande,62.27,,hf_open_llm_v1_240829_frozen.csv mhm_7b_v1_3_dpo_1,HF OpenLLM v1,47.77,,hf_open_llm_v1_240829_frozen.csv mhm_7b_v1_3_dpo_1,HFv1 ARC,49.57,,hf_open_llm_v1_240829_frozen.csv mhm_7b_v1_3_dpo_1,HFv1 GSM8K,15.24,,hf_open_llm_v1_240829_frozen.csv mhm_7b_v1_3_dpo_1,HFv1 HellaSwag,68.1,,hf_open_llm_v1_240829_frozen.csv mhm_7b_v1_3_dpo_1,HFv1 MMLU,45.76,,hf_open_llm_v1_240829_frozen.csv mhm_7b_v1_3_dpo_1,HFv1 TruthfulQA,45.88,,hf_open_llm_v1_240829_frozen.csv mhm_7b_v1_3_dpo_1,HFv1 Winogrande,62.04,,hf_open_llm_v1_240829_frozen.csv mhm_8x7b_frankenmoe_v1_0,HF OpenLLM v1,74.01,,hf_open_llm_v1_240829_frozen.csv mhm_8x7b_frankenmoe_v1_0,HFv1 ARC,70.9,,hf_open_llm_v1_240829_frozen.csv mhm_8x7b_frankenmoe_v1_0,HFv1 GSM8K,71.57,,hf_open_llm_v1_240829_frozen.csv mhm_8x7b_frankenmoe_v1_0,HFv1 HellaSwag,87.75,,hf_open_llm_v1_240829_frozen.csv mhm_8x7b_frankenmoe_v1_0,HFv1 MMLU,64.7,,hf_open_llm_v1_240829_frozen.csv mhm_8x7b_frankenmoe_v1_0,HFv1 TruthfulQA,67.1,,hf_open_llm_v1_240829_frozen.csv mhm_8x7b_frankenmoe_v1_0,HFv1 Winogrande,82.0,,hf_open_llm_v1_240829_frozen.csv michel_13b,HF OpenLLM v1,57.56,,hf_open_llm_v1_240829_frozen.csv michel_13b,HFv1 ARC,61.26,,hf_open_llm_v1_240829_frozen.csv michel_13b,HFv1 GSM8K,20.17,,hf_open_llm_v1_240829_frozen.csv michel_13b,HFv1 HellaSwag,83.21,,hf_open_llm_v1_240829_frozen.csv michel_13b,HFv1 MMLU,55.05,,hf_open_llm_v1_240829_frozen.csv michel_13b,HFv1 TruthfulQA,50.43,,hf_open_llm_v1_240829_frozen.csv michel_13b,HFv1 Winogrande,75.22,,hf_open_llm_v1_240829_frozen.csv microscopic_mistral_87k_steps,HF OpenLLM v1,28.9,,hf_open_llm_v1_240829_frozen.csv microscopic_mistral_87k_steps,HFv1 
ARC,24.06,,hf_open_llm_v1_240829_frozen.csv microscopic_mistral_87k_steps,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv microscopic_mistral_87k_steps,HFv1 HellaSwag,29.25,,hf_open_llm_v1_240829_frozen.csv microscopic_mistral_87k_steps,HFv1 MMLU,25.54,,hf_open_llm_v1_240829_frozen.csv microscopic_mistral_87k_steps,HFv1 TruthfulQA,45.02,,hf_open_llm_v1_240829_frozen.csv microscopic_mistral_87k_steps,HFv1 Winogrande,49.57,,hf_open_llm_v1_240829_frozen.csv mindllm,HF OpenLLM v1,29.28,,hf_open_llm_v1_240829_frozen.csv mindllm,HFv1 ARC,22.44,,hf_open_llm_v1_240829_frozen.csv mindllm,HFv1 GSM8K,0.83,,hf_open_llm_v1_240829_frozen.csv mindllm,HFv1 HellaSwag,34.11,,hf_open_llm_v1_240829_frozen.csv mindllm,HFv1 MMLU,25.5,,hf_open_llm_v1_240829_frozen.csv mindllm,HFv1 TruthfulQA,43.48,,hf_open_llm_v1_240829_frozen.csv mindllm,HFv1 Winogrande,49.33,,hf_open_llm_v1_240829_frozen.csv mini_7b_dare_v1,HF OpenLLM v1,64.4,,hf_open_llm_v1_240829_frozen.csv mini_7b_dare_v1,HFv1 ARC,61.77,,hf_open_llm_v1_240829_frozen.csv mini_7b_dare_v1,HFv1 GSM8K,56.56,,hf_open_llm_v1_240829_frozen.csv mini_7b_dare_v1,HFv1 HellaSwag,79.91,,hf_open_llm_v1_240829_frozen.csv mini_7b_dare_v1,HFv1 MMLU,59.55,,hf_open_llm_v1_240829_frozen.csv mini_7b_dare_v1,HFv1 TruthfulQA,54.64,,hf_open_llm_v1_240829_frozen.csv mini_7b_dare_v1,HFv1 Winogrande,73.95,,hf_open_llm_v1_240829_frozen.csv mini_dpo_test02,HF OpenLLM v1,61.23,,hf_open_llm_v1_240829_frozen.csv mini_dpo_test02,HFv1 ARC,59.73,,hf_open_llm_v1_240829_frozen.csv mini_dpo_test02,HFv1 GSM8K,35.03,,hf_open_llm_v1_240829_frozen.csv mini_dpo_test02,HFv1 HellaSwag,83.89,,hf_open_llm_v1_240829_frozen.csv mini_dpo_test02,HFv1 MMLU,61.9,,hf_open_llm_v1_240829_frozen.csv mini_dpo_test02,HFv1 TruthfulQA,48.47,,hf_open_llm_v1_240829_frozen.csv mini_dpo_test02,HFv1 Winogrande,78.37,,hf_open_llm_v1_240829_frozen.csv mini_synatra_sft,HF OpenLLM v1,63.39,,hf_open_llm_v1_240829_frozen.csv mini_synatra_sft,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv mini_synatra_sft,HFv1 GSM8K,44.88,,hf_open_llm_v1_240829_frozen.csv mini_synatra_sft,HFv1 HellaSwag,83.44,,hf_open_llm_v1_240829_frozen.csv mini_synatra_sft,HFv1 MMLU,61.2,,hf_open_llm_v1_240829_frozen.csv mini_synatra_sft,HFv1 TruthfulQA,53.67,,hf_open_llm_v1_240829_frozen.csv mini_synatra_sft,HFv1 Winogrande,74.66,,hf_open_llm_v1_240829_frozen.csv minillama_1_8b_chat_v0_1,HF OpenLLM v1,37.37,,hf_open_llm_v1_240829_frozen.csv minillama_1_8b_chat_v0_1,HFv1 ARC,34.73,,hf_open_llm_v1_240829_frozen.csv minillama_1_8b_chat_v0_1,HFv1 GSM8K,1.9,,hf_open_llm_v1_240829_frozen.csv minillama_1_8b_chat_v0_1,HFv1 HellaSwag,62.38,,hf_open_llm_v1_240829_frozen.csv minillama_1_8b_chat_v0_1,HFv1 MMLU,25.69,,hf_open_llm_v1_240829_frozen.csv minillama_1_8b_chat_v0_1,HFv1 TruthfulQA,38.97,,hf_open_llm_v1_240829_frozen.csv minillama_1_8b_chat_v0_1,HFv1 Winogrande,60.54,,hf_open_llm_v1_240829_frozen.csv minima_3b,HF OpenLLM v1,41.44,,hf_open_llm_v1_240829_frozen.csv minima_3b,HFv1 ARC,43.43,,hf_open_llm_v1_240829_frozen.csv minima_3b,HFv1 GSM8K,2.73,,hf_open_llm_v1_240829_frozen.csv minima_3b,HFv1 HellaSwag,68.06,,hf_open_llm_v1_240829_frozen.csv minima_3b,HFv1 MMLU,28.69,,hf_open_llm_v1_240829_frozen.csv minima_3b,HFv1 TruthfulQA,39.76,,hf_open_llm_v1_240829_frozen.csv minima_3b,HFv1 Winogrande,65.98,,hf_open_llm_v1_240829_frozen.csv minimerlin_3b_v0_1,HF OpenLLM v1,41.6,,hf_open_llm_v1_240829_frozen.csv minimerlin_3b_v0_1,HFv1 ARC,40.7,,hf_open_llm_v1_240829_frozen.csv minimerlin_3b_v0_1,HFv1 GSM8K,1.36,,hf_open_llm_v1_240829_frozen.csv minimerlin_3b_v0_1,HFv1 
HellaSwag,54.06,,hf_open_llm_v1_240829_frozen.csv minimerlin_3b_v0_1,HFv1 MMLU,43.32,,hf_open_llm_v1_240829_frozen.csv minimerlin_3b_v0_1,HFv1 TruthfulQA,49.65,,hf_open_llm_v1_240829_frozen.csv minimerlin_3b_v0_1,HFv1 Winogrande,60.54,,hf_open_llm_v1_240829_frozen.csv minotaur_llama2_13b_qlora,HF OpenLLM v1,55.37,,hf_open_llm_v1_240829_frozen.csv minotaur_llama2_13b_qlora,HFv1 ARC,60.07,,hf_open_llm_v1_240829_frozen.csv minotaur_llama2_13b_qlora,HFv1 GSM8K,12.05,,hf_open_llm_v1_240829_frozen.csv minotaur_llama2_13b_qlora,HFv1 HellaSwag,82.42,,hf_open_llm_v1_240829_frozen.csv minotaur_llama2_13b_qlora,HFv1 MMLU,55.87,,hf_open_llm_v1_240829_frozen.csv minotaur_llama2_13b_qlora,HFv1 TruthfulQA,45.57,,hf_open_llm_v1_240829_frozen.csv minotaur_llama2_13b_qlora,HFv1 Winogrande,76.24,,hf_open_llm_v1_240829_frozen.csv minueza_32m_base,HF OpenLLM v1,28.92,,hf_open_llm_v1_240829_frozen.csv minueza_32m_base,HFv1 ARC,21.33,,hf_open_llm_v1_240829_frozen.csv minueza_32m_base,HFv1 GSM8K,0.38,,hf_open_llm_v1_240829_frozen.csv minueza_32m_base,HFv1 HellaSwag,26.39,,hf_open_llm_v1_240829_frozen.csv minueza_32m_base,HFv1 MMLU,24.8,,hf_open_llm_v1_240829_frozen.csv minueza_32m_base,HFv1 TruthfulQA,47.45,,hf_open_llm_v1_240829_frozen.csv minueza_32m_base,HFv1 Winogrande,53.2,,hf_open_llm_v1_240829_frozen.csv minueza_32m_chat,HF OpenLLM v1,28.49,,hf_open_llm_v1_240829_frozen.csv minueza_32m_chat,HFv1 ARC,20.39,,hf_open_llm_v1_240829_frozen.csv minueza_32m_chat,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv minueza_32m_chat,HFv1 HellaSwag,26.54,,hf_open_llm_v1_240829_frozen.csv minueza_32m_chat,HFv1 MMLU,25.75,,hf_open_llm_v1_240829_frozen.csv minueza_32m_chat,HFv1 TruthfulQA,47.27,,hf_open_llm_v1_240829_frozen.csv minueza_32m_chat,HFv1 Winogrande,50.99,,hf_open_llm_v1_240829_frozen.csv minueza_32m_deita,HF OpenLLM v1,28.8,,hf_open_llm_v1_240829_frozen.csv minueza_32m_deita,HFv1 ARC,20.73,,hf_open_llm_v1_240829_frozen.csv minueza_32m_deita,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv minueza_32m_deita,HFv1 HellaSwag,26.72,,hf_open_llm_v1_240829_frozen.csv minueza_32m_deita,HFv1 MMLU,26.84,,hf_open_llm_v1_240829_frozen.csv minueza_32m_deita,HFv1 TruthfulQA,47.75,,hf_open_llm_v1_240829_frozen.csv minueza_32m_deita,HFv1 Winogrande,50.51,,hf_open_llm_v1_240829_frozen.csv minueza_32m_ultrachat,HF OpenLLM v1,28.97,,hf_open_llm_v1_240829_frozen.csv minueza_32m_ultrachat,HFv1 ARC,21.08,,hf_open_llm_v1_240829_frozen.csv minueza_32m_ultrachat,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv minueza_32m_ultrachat,HFv1 HellaSwag,26.95,,hf_open_llm_v1_240829_frozen.csv minueza_32m_ultrachat,HFv1 MMLU,26.08,,hf_open_llm_v1_240829_frozen.csv minueza_32m_ultrachat,HFv1 TruthfulQA,47.7,,hf_open_llm_v1_240829_frozen.csv minueza_32m_ultrachat,HFv1 Winogrande,51.78,,hf_open_llm_v1_240829_frozen.csv minueza_32mx2_chat,HF OpenLLM v1,28.12,,hf_open_llm_v1_240829_frozen.csv minueza_32mx2_chat,HFv1 ARC,20.14,,hf_open_llm_v1_240829_frozen.csv minueza_32mx2_chat,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv minueza_32mx2_chat,HFv1 HellaSwag,26.36,,hf_open_llm_v1_240829_frozen.csv minueza_32mx2_chat,HFv1 MMLU,26.07,,hf_open_llm_v1_240829_frozen.csv minueza_32mx2_chat,HFv1 TruthfulQA,44.56,,hf_open_llm_v1_240829_frozen.csv minueza_32mx2_chat,HFv1 Winogrande,51.62,,hf_open_llm_v1_240829_frozen.csv miqu_1_70b_sf,HF OpenLLM v1,76.59,,hf_open_llm_v1_240829_frozen.csv miqu_1_70b_sf,HFv1 ARC,73.04,,hf_open_llm_v1_240829_frozen.csv miqu_1_70b_sf,HFv1 GSM8K,67.7,,hf_open_llm_v1_240829_frozen.csv miqu_1_70b_sf,HFv1 
HellaSwag,88.61,,hf_open_llm_v1_240829_frozen.csv miqu_1_70b_sf,HFv1 MMLU,75.49,,hf_open_llm_v1_240829_frozen.csv miqu_1_70b_sf,HFv1 TruthfulQA,69.38,,hf_open_llm_v1_240829_frozen.csv miqu_1_70b_sf,HFv1 Winogrande,85.32,,hf_open_llm_v1_240829_frozen.csv miqu_70b_alpaca_dpo,HF OpenLLM v1,76.6,,hf_open_llm_v1_240829_frozen.csv miqu_70b_alpaca_dpo,HFv1 ARC,73.21,,hf_open_llm_v1_240829_frozen.csv miqu_70b_alpaca_dpo,HFv1 GSM8K,67.55,,hf_open_llm_v1_240829_frozen.csv miqu_70b_alpaca_dpo,HFv1 HellaSwag,88.6,,hf_open_llm_v1_240829_frozen.csv miqu_70b_alpaca_dpo,HFv1 MMLU,75.41,,hf_open_llm_v1_240829_frozen.csv miqu_70b_alpaca_dpo,HFv1 TruthfulQA,69.44,,hf_open_llm_v1_240829_frozen.csv miqu_70b_alpaca_dpo,HFv1 Winogrande,85.4,,hf_open_llm_v1_240829_frozen.csv mistral7b_test001,HF OpenLLM v1,29.49,,hf_open_llm_v1_240829_frozen.csv mistral7b_test001,HFv1 ARC,24.66,,hf_open_llm_v1_240829_frozen.csv mistral7b_test001,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv mistral7b_test001,HFv1 HellaSwag,26.78,,hf_open_llm_v1_240829_frozen.csv mistral7b_test001,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv mistral7b_test001,HFv1 TruthfulQA,50.07,,hf_open_llm_v1_240829_frozen.csv mistral7b_test001,HFv1 Winogrande,52.33,,hf_open_llm_v1_240829_frozen.csv mistral_11b_slimorca,HF OpenLLM v1,66.12,,hf_open_llm_v1_240829_frozen.csv mistral_11b_slimorca,HFv1 ARC,64.25,,hf_open_llm_v1_240829_frozen.csv mistral_11b_slimorca,HFv1 GSM8K,52.39,,hf_open_llm_v1_240829_frozen.csv mistral_11b_slimorca,HFv1 HellaSwag,83.81,,hf_open_llm_v1_240829_frozen.csv mistral_11b_slimorca,HFv1 MMLU,63.66,,hf_open_llm_v1_240829_frozen.csv mistral_11b_slimorca,HFv1 TruthfulQA,54.66,,hf_open_llm_v1_240829_frozen.csv mistral_11b_slimorca,HFv1 Winogrande,77.98,,hf_open_llm_v1_240829_frozen.csv mistral_1_from_mixtral_8x7b_v0_1,HF OpenLLM v1,29.64,,hf_open_llm_v1_240829_frozen.csv mistral_1_from_mixtral_8x7b_v0_1,HFv1 ARC,28.67,,hf_open_llm_v1_240829_frozen.csv mistral_1_from_mixtral_8x7b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv mistral_1_from_mixtral_8x7b_v0_1,HFv1 HellaSwag,26.35,,hf_open_llm_v1_240829_frozen.csv mistral_1_from_mixtral_8x7b_v0_1,HFv1 MMLU,24.66,,hf_open_llm_v1_240829_frozen.csv mistral_1_from_mixtral_8x7b_v0_1,HFv1 TruthfulQA,48.1,,hf_open_llm_v1_240829_frozen.csv mistral_1_from_mixtral_8x7b_v0_1,HFv1 Winogrande,50.04,,hf_open_llm_v1_240829_frozen.csv mistral_22b_v0_1,HF OpenLLM v1,49.94,,hf_open_llm_v1_240829_frozen.csv mistral_22b_v0_1,HFv1 ARC,49.4,,hf_open_llm_v1_240829_frozen.csv mistral_22b_v0_1,HFv1 GSM8K,6.37,,hf_open_llm_v1_240829_frozen.csv mistral_22b_v0_1,HFv1 HellaSwag,72.92,,hf_open_llm_v1_240829_frozen.csv mistral_22b_v0_1,HFv1 MMLU,48.75,,hf_open_llm_v1_240829_frozen.csv mistral_22b_v0_1,HFv1 TruthfulQA,47.35,,hf_open_llm_v1_240829_frozen.csv mistral_22b_v0_1,HFv1 Winogrande,74.82,,hf_open_llm_v1_240829_frozen.csv mistral_2_from_mixtral_8x7b_v0_1,HF OpenLLM v1,29.83,,hf_open_llm_v1_240829_frozen.csv mistral_2_from_mixtral_8x7b_v0_1,HFv1 ARC,28.41,,hf_open_llm_v1_240829_frozen.csv mistral_2_from_mixtral_8x7b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv mistral_2_from_mixtral_8x7b_v0_1,HFv1 HellaSwag,26.49,,hf_open_llm_v1_240829_frozen.csv mistral_2_from_mixtral_8x7b_v0_1,HFv1 MMLU,24.17,,hf_open_llm_v1_240829_frozen.csv mistral_2_from_mixtral_8x7b_v0_1,HFv1 TruthfulQA,48.37,,hf_open_llm_v1_240829_frozen.csv mistral_2_from_mixtral_8x7b_v0_1,HFv1 Winogrande,51.54,,hf_open_llm_v1_240829_frozen.csv mistral_3_from_mixtral_8x7b_v0_1,HF OpenLLM v1,29.93,,hf_open_llm_v1_240829_frozen.csv 
mistral_3_from_mixtral_8x7b_v0_1,HFv1 ARC,29.35,,hf_open_llm_v1_240829_frozen.csv mistral_3_from_mixtral_8x7b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv mistral_3_from_mixtral_8x7b_v0_1,HFv1 HellaSwag,26.59,,hf_open_llm_v1_240829_frozen.csv mistral_3_from_mixtral_8x7b_v0_1,HFv1 MMLU,25.73,,hf_open_llm_v1_240829_frozen.csv mistral_3_from_mixtral_8x7b_v0_1,HFv1 TruthfulQA,48.19,,hf_open_llm_v1_240829_frozen.csv mistral_3_from_mixtral_8x7b_v0_1,HFv1 Winogrande,49.72,,hf_open_llm_v1_240829_frozen.csv mistral_4_from_mixtral_8x7b_v0_1,HF OpenLLM v1,29.53,,hf_open_llm_v1_240829_frozen.csv mistral_4_from_mixtral_8x7b_v0_1,HFv1 ARC,28.24,,hf_open_llm_v1_240829_frozen.csv mistral_4_from_mixtral_8x7b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv mistral_4_from_mixtral_8x7b_v0_1,HFv1 HellaSwag,27.53,,hf_open_llm_v1_240829_frozen.csv mistral_4_from_mixtral_8x7b_v0_1,HFv1 MMLU,24.83,,hf_open_llm_v1_240829_frozen.csv mistral_4_from_mixtral_8x7b_v0_1,HFv1 TruthfulQA,48.51,,hf_open_llm_v1_240829_frozen.csv mistral_4_from_mixtral_8x7b_v0_1,HFv1 Winogrande,48.07,,hf_open_llm_v1_240829_frozen.csv mistral_5_from_mixtral_8x7b_v0_1,HF OpenLLM v1,29.83,,hf_open_llm_v1_240829_frozen.csv mistral_5_from_mixtral_8x7b_v0_1,HFv1 ARC,29.35,,hf_open_llm_v1_240829_frozen.csv mistral_5_from_mixtral_8x7b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv mistral_5_from_mixtral_8x7b_v0_1,HFv1 HellaSwag,26.44,,hf_open_llm_v1_240829_frozen.csv mistral_5_from_mixtral_8x7b_v0_1,HFv1 MMLU,25.1,,hf_open_llm_v1_240829_frozen.csv mistral_5_from_mixtral_8x7b_v0_1,HFv1 TruthfulQA,48.27,,hf_open_llm_v1_240829_frozen.csv mistral_5_from_mixtral_8x7b_v0_1,HFv1 Winogrande,49.8,,hf_open_llm_v1_240829_frozen.csv mistral_6_from_mixtral_8x7b_v0_1,HF OpenLLM v1,29.61,,hf_open_llm_v1_240829_frozen.csv mistral_6_from_mixtral_8x7b_v0_1,HFv1 ARC,28.33,,hf_open_llm_v1_240829_frozen.csv mistral_6_from_mixtral_8x7b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv mistral_6_from_mixtral_8x7b_v0_1,HFv1 HellaSwag,26.82,,hf_open_llm_v1_240829_frozen.csv mistral_6_from_mixtral_8x7b_v0_1,HFv1 MMLU,25.45,,hf_open_llm_v1_240829_frozen.csv mistral_6_from_mixtral_8x7b_v0_1,HFv1 TruthfulQA,48.39,,hf_open_llm_v1_240829_frozen.csv mistral_6_from_mixtral_8x7b_v0_1,HFv1 Winogrande,48.7,,hf_open_llm_v1_240829_frozen.csv mistral_7_from_mixtral_8x7b_v0_1,HF OpenLLM v1,29.64,,hf_open_llm_v1_240829_frozen.csv mistral_7_from_mixtral_8x7b_v0_1,HFv1 ARC,29.1,,hf_open_llm_v1_240829_frozen.csv mistral_7_from_mixtral_8x7b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv mistral_7_from_mixtral_8x7b_v0_1,HFv1 HellaSwag,26.57,,hf_open_llm_v1_240829_frozen.csv mistral_7_from_mixtral_8x7b_v0_1,HFv1 MMLU,25.12,,hf_open_llm_v1_240829_frozen.csv mistral_7_from_mixtral_8x7b_v0_1,HFv1 TruthfulQA,48.54,,hf_open_llm_v1_240829_frozen.csv mistral_7_from_mixtral_8x7b_v0_1,HFv1 Winogrande,48.54,,hf_open_llm_v1_240829_frozen.csv mistral_7b_aezakmi_v1,HF OpenLLM v1,54.92,,hf_open_llm_v1_240829_frozen.csv mistral_7b_aezakmi_v1,HFv1 ARC,58.87,,hf_open_llm_v1_240829_frozen.csv mistral_7b_aezakmi_v1,HFv1 GSM8K,0.68,,hf_open_llm_v1_240829_frozen.csv mistral_7b_aezakmi_v1,HFv1 HellaSwag,82.01,,hf_open_llm_v1_240829_frozen.csv mistral_7b_aezakmi_v1,HFv1 MMLU,58.72,,hf_open_llm_v1_240829_frozen.csv mistral_7b_aezakmi_v1,HFv1 TruthfulQA,53.54,,hf_open_llm_v1_240829_frozen.csv mistral_7b_aezakmi_v1,HFv1 Winogrande,75.69,,hf_open_llm_v1_240829_frozen.csv mistral_7b_claude_instruct,HF OpenLLM v1,59.27,,hf_open_llm_v1_240829_frozen.csv mistral_7b_claude_instruct,HFv1 
ARC,63.23,,hf_open_llm_v1_240829_frozen.csv mistral_7b_claude_instruct,HFv1 GSM8K,17.97,,hf_open_llm_v1_240829_frozen.csv mistral_7b_claude_instruct,HFv1 HellaSwag,84.99,,hf_open_llm_v1_240829_frozen.csv mistral_7b_claude_instruct,HFv1 MMLU,63.84,,hf_open_llm_v1_240829_frozen.csv mistral_7b_claude_instruct,HFv1 TruthfulQA,47.47,,hf_open_llm_v1_240829_frozen.csv mistral_7b_claude_instruct,HFv1 Winogrande,78.14,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_merge_v1_1,HF OpenLLM v1,74.53,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_merge_v1_1,HFv1 ARC,72.53,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_merge_v1_1,HFv1 GSM8K,70.89,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_merge_v1_1,HFv1 HellaSwag,88.15,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_merge_v1_1,HFv1 MMLU,64.83,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_merge_v1_1,HFv1 TruthfulQA,68.48,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_merge_v1_1,HFv1 Winogrande,82.32,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_open_orca_flan_50k_synthetic_5_models,HF OpenLLM v1,29.48,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_open_orca_flan_50k_synthetic_5_models,HFv1 ARC,25.51,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_open_orca_flan_50k_synthetic_5_models,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_open_orca_flan_50k_synthetic_5_models,HFv1 HellaSwag,25.52,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_open_orca_flan_50k_synthetic_5_models,HFv1 MMLU,26.82,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_open_orca_flan_50k_synthetic_5_models,HFv1 TruthfulQA,48.81,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_open_orca_flan_50k_synthetic_5_models,HFv1 Winogrande,50.2,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_v5,HF OpenLLM v1,73.87,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_v5,HFv1 ARC,72.01,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_v5,HFv1 GSM8K,70.66,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_v5,HFv1 HellaSwag,87.57,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_v5,HFv1 MMLU,63.85,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_v5,HFv1 TruthfulQA,66.86,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_v5,HFv1 Winogrande,82.24,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_v6,HF OpenLLM v1,74.5,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_v6,HFv1 ARC,72.53,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_v6,HFv1 GSM8K,70.89,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_v6,HFv1 HellaSwag,88.1,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_v6,HFv1 MMLU,64.68,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_v6,HFv1 TruthfulQA,68.24,,hf_open_llm_v1_240829_frozen.csv mistral_7b_dpo_v6,HFv1 Winogrande,82.56,,hf_open_llm_v1_240829_frozen.csv mistral_7b_erebus_v3,HF OpenLLM v1,56.82,,hf_open_llm_v1_240829_frozen.csv mistral_7b_erebus_v3,HFv1 ARC,59.9,,hf_open_llm_v1_240829_frozen.csv mistral_7b_erebus_v3,HFv1 GSM8K,25.55,,hf_open_llm_v1_240829_frozen.csv mistral_7b_erebus_v3,HFv1 HellaSwag,80.3,,hf_open_llm_v1_240829_frozen.csv mistral_7b_erebus_v3,HFv1 MMLU,59.42,,hf_open_llm_v1_240829_frozen.csv mistral_7b_erebus_v3,HFv1 TruthfulQA,40.93,,hf_open_llm_v1_240829_frozen.csv mistral_7b_erebus_v3,HFv1 Winogrande,74.82,,hf_open_llm_v1_240829_frozen.csv mistral_7b_ft_h4_no_robots_instructions,HF OpenLLM v1,61.16,,hf_open_llm_v1_240829_frozen.csv mistral_7b_ft_h4_no_robots_instructions,HFv1 ARC,60.92,,hf_open_llm_v1_240829_frozen.csv mistral_7b_ft_h4_no_robots_instructions,HFv1 GSM8K,37.0,,hf_open_llm_v1_240829_frozen.csv 
mistral_7b_ft_h4_no_robots_instructions,HFv1 HellaSwag,83.24,,hf_open_llm_v1_240829_frozen.csv mistral_7b_ft_h4_no_robots_instructions,HFv1 MMLU,63.74,,hf_open_llm_v1_240829_frozen.csv mistral_7b_ft_h4_no_robots_instructions,HFv1 TruthfulQA,43.64,,hf_open_llm_v1_240829_frozen.csv mistral_7b_ft_h4_no_robots_instructions,HFv1 Winogrande,78.85,,hf_open_llm_v1_240829_frozen.csv mistral_7b_golden,HF OpenLLM v1,52.49,,hf_open_llm_v1_240829_frozen.csv mistral_7b_golden,HFv1 ARC,60.75,,hf_open_llm_v1_240829_frozen.csv mistral_7b_golden,HFv1 GSM8K,20.32,,hf_open_llm_v1_240829_frozen.csv mistral_7b_golden,HFv1 HellaSwag,44.42,,hf_open_llm_v1_240829_frozen.csv mistral_7b_golden,HFv1 MMLU,59.29,,hf_open_llm_v1_240829_frozen.csv mistral_7b_golden,HFv1 TruthfulQA,53.51,,hf_open_llm_v1_240829_frozen.csv mistral_7b_golden,HFv1 Winogrande,76.64,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_sft_tuned_v0_2,HF OpenLLM v1,62.29,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_sft_tuned_v0_2,HFv1 ARC,58.02,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_sft_tuned_v0_2,HFv1 GSM8K,50.34,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_sft_tuned_v0_2,HFv1 HellaSwag,79.26,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_sft_tuned_v0_2,HFv1 MMLU,58.78,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_sft_tuned_v0_2,HFv1 TruthfulQA,50.66,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_sft_tuned_v0_2,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_1,HF OpenLLM v1,54.96,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_1,HFv1 ARC,54.52,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_1,HFv1 GSM8K,14.25,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_1,HFv1 HellaSwag,75.63,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_1,HFv1 MMLU,55.38,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_1,HFv1 TruthfulQA,56.28,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_1,HFv1 Winogrande,73.72,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2,HF OpenLLM v1,65.71,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2,HFv1 ARC,63.14,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2,HFv1 GSM8K,40.03,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2,HFv1 HellaSwag,84.88,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2,HFv1 MMLU,60.78,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2,HFv1 TruthfulQA,68.26,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2,HFv1 Winogrande,77.19,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_attention_sparsity_10_v0_1,HF OpenLLM v1,65.48,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_attention_sparsity_10_v0_1,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_attention_sparsity_10_v0_1,HFv1 GSM8K,38.89,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_attention_sparsity_10_v0_1,HFv1 HellaSwag,84.88,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_attention_sparsity_10_v0_1,HFv1 MMLU,60.84,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_attention_sparsity_10_v0_1,HFv1 TruthfulQA,68.11,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_attention_sparsity_10_v0_1,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_attention_sparsity_20,HF OpenLLM v1,65.74,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_attention_sparsity_20,HFv1 ARC,62.88,,hf_open_llm_v1_240829_frozen.csv 
mistral_7b_instruct_v0_2_attention_sparsity_20,HFv1 GSM8K,39.73,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_attention_sparsity_20,HFv1 HellaSwag,84.84,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_attention_sparsity_20,HFv1 MMLU,60.81,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_attention_sparsity_20,HFv1 TruthfulQA,68.26,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_attention_sparsity_20,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_attention_sparsity_30,HF OpenLLM v1,65.51,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_attention_sparsity_30,HFv1 ARC,62.97,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_attention_sparsity_30,HFv1 GSM8K,39.42,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_attention_sparsity_30,HFv1 HellaSwag,84.71,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_attention_sparsity_30,HFv1 MMLU,60.49,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_attention_sparsity_30,HFv1 TruthfulQA,67.49,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_attention_sparsity_30,HFv1 Winogrande,77.98,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_selfplay_v0,HF OpenLLM v1,65.56,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_selfplay_v0,HFv1 ARC,62.8,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_selfplay_v0,HFv1 GSM8K,40.26,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_selfplay_v0,HFv1 HellaSwag,84.74,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_selfplay_v0,HFv1 MMLU,60.6,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_selfplay_v0,HFv1 TruthfulQA,67.35,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_selfplay_v0,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sp_v0,HF OpenLLM v1,65.68,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sp_v0,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sp_v0,HFv1 GSM8K,40.11,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sp_v0,HFv1 HellaSwag,84.84,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sp_v0,HFv1 MMLU,60.75,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sp_v0,HFv1 TruthfulQA,68.22,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sp_v0,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sparsity_10,HF OpenLLM v1,65.48,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sparsity_10,HFv1 ARC,62.88,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sparsity_10,HFv1 GSM8K,38.82,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sparsity_10,HFv1 HellaSwag,84.85,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sparsity_10,HFv1 MMLU,60.87,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sparsity_10,HFv1 TruthfulQA,67.93,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sparsity_10,HFv1 Winogrande,77.51,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sparsity_20,HF OpenLLM v1,50.7,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sparsity_20,HFv1 ARC,52.65,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sparsity_20,HFv1 GSM8K,11.3,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sparsity_20,HFv1 HellaSwag,76.71,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sparsity_20,HFv1 MMLU,47.27,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sparsity_20,HFv1 
TruthfulQA,47.22,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sparsity_20,HFv1 Winogrande,69.06,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sparsity_30,HF OpenLLM v1,49.74,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sparsity_30,HFv1 ARC,51.11,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sparsity_30,HFv1 GSM8K,10.54,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sparsity_30,HFv1 HellaSwag,75.72,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sparsity_30,HFv1 MMLU,46.54,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sparsity_30,HFv1 TruthfulQA,45.53,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v0_2_sparsity_30,HFv1 Winogrande,68.98,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v2_sp_v0_1,HF OpenLLM v1,65.68,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v2_sp_v0_1,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v2_sp_v0_1,HFv1 GSM8K,40.11,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v2_sp_v0_1,HFv1 HellaSwag,84.84,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v2_sp_v0_1,HFv1 MMLU,60.75,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v2_sp_v0_1,HFv1 TruthfulQA,68.22,,hf_open_llm_v1_240829_frozen.csv mistral_7b_instruct_v2_sp_v0_1,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv mistral_7b_med_merge,HF OpenLLM v1,63.75,,hf_open_llm_v1_240829_frozen.csv mistral_7b_med_merge,HFv1 ARC,64.51,,hf_open_llm_v1_240829_frozen.csv mistral_7b_med_merge,HFv1 GSM8K,44.96,,hf_open_llm_v1_240829_frozen.csv mistral_7b_med_merge,HFv1 HellaSwag,82.96,,hf_open_llm_v1_240829_frozen.csv mistral_7b_med_merge,HFv1 MMLU,57.84,,hf_open_llm_v1_240829_frozen.csv mistral_7b_med_merge,HFv1 TruthfulQA,53.65,,hf_open_llm_v1_240829_frozen.csv mistral_7b_med_merge,HFv1 Winogrande,78.61,,hf_open_llm_v1_240829_frozen.csv mistral_7b_norobots,HF OpenLLM v1,58.85,,hf_open_llm_v1_240829_frozen.csv mistral_7b_norobots,HFv1 ARC,58.96,,hf_open_llm_v1_240829_frozen.csv mistral_7b_norobots,HFv1 GSM8K,38.36,,hf_open_llm_v1_240829_frozen.csv mistral_7b_norobots,HFv1 HellaSwag,80.57,,hf_open_llm_v1_240829_frozen.csv mistral_7b_norobots,HFv1 MMLU,57.66,,hf_open_llm_v1_240829_frozen.csv mistral_7b_norobots,HFv1 TruthfulQA,41.91,,hf_open_llm_v1_240829_frozen.csv mistral_7b_norobots,HFv1 Winogrande,75.61,,hf_open_llm_v1_240829_frozen.csv mistral_7b_open_platypus,HF OpenLLM v1,56.29,,hf_open_llm_v1_240829_frozen.csv mistral_7b_open_platypus,HFv1 ARC,55.8,,hf_open_llm_v1_240829_frozen.csv mistral_7b_open_platypus,HFv1 GSM8K,12.59,,hf_open_llm_v1_240829_frozen.csv mistral_7b_open_platypus,HFv1 HellaSwag,82.13,,hf_open_llm_v1_240829_frozen.csv mistral_7b_open_platypus,HFv1 MMLU,59.76,,hf_open_llm_v1_240829_frozen.csv mistral_7b_open_platypus,HFv1 TruthfulQA,48.87,,hf_open_llm_v1_240829_frozen.csv mistral_7b_open_platypus,HFv1 Winogrande,78.61,,hf_open_llm_v1_240829_frozen.csv mistral_7b_openorca_1k,HF OpenLLM v1,58.9,,hf_open_llm_v1_240829_frozen.csv mistral_7b_openorca_1k,HFv1 ARC,62.97,,hf_open_llm_v1_240829_frozen.csv mistral_7b_openorca_1k,HFv1 GSM8K,11.98,,hf_open_llm_v1_240829_frozen.csv mistral_7b_openorca_1k,HFv1 HellaSwag,84.66,,hf_open_llm_v1_240829_frozen.csv mistral_7b_openorca_1k,HFv1 MMLU,62.2,,hf_open_llm_v1_240829_frozen.csv mistral_7b_openorca_1k,HFv1 TruthfulQA,52.96,,hf_open_llm_v1_240829_frozen.csv mistral_7b_openorca_1k,HFv1 Winogrande,78.61,,hf_open_llm_v1_240829_frozen.csv mistral_7b_openplatypus_1k,HF OpenLLM v1,58.07,,hf_open_llm_v1_240829_frozen.csv 
mistral_7b_openplatypus_1k,HFv1 ARC,60.15,,hf_open_llm_v1_240829_frozen.csv mistral_7b_openplatypus_1k,HFv1 GSM8K,17.44,,hf_open_llm_v1_240829_frozen.csv mistral_7b_openplatypus_1k,HFv1 HellaSwag,84.25,,hf_open_llm_v1_240829_frozen.csv mistral_7b_openplatypus_1k,HFv1 MMLU,59.84,,hf_open_llm_v1_240829_frozen.csv mistral_7b_openplatypus_1k,HFv1 TruthfulQA,49.86,,hf_open_llm_v1_240829_frozen.csv mistral_7b_openplatypus_1k,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv mistral_7b_orpo_airoboros_pref_10k,HF OpenLLM v1,58.82,,hf_open_llm_v1_240829_frozen.csv mistral_7b_orpo_airoboros_pref_10k,HFv1 ARC,60.07,,hf_open_llm_v1_240829_frozen.csv mistral_7b_orpo_airoboros_pref_10k,HFv1 GSM8K,20.09,,hf_open_llm_v1_240829_frozen.csv mistral_7b_orpo_airoboros_pref_10k,HFv1 HellaSwag,82.44,,hf_open_llm_v1_240829_frozen.csv mistral_7b_orpo_airoboros_pref_10k,HFv1 MMLU,60.75,,hf_open_llm_v1_240829_frozen.csv mistral_7b_orpo_airoboros_pref_10k,HFv1 TruthfulQA,54.59,,hf_open_llm_v1_240829_frozen.csv mistral_7b_orpo_airoboros_pref_10k,HFv1 Winogrande,74.98,,hf_open_llm_v1_240829_frozen.csv mistral_7b_orpo_alignment_handbook,HF OpenLLM v1,63.82,,hf_open_llm_v1_240829_frozen.csv mistral_7b_orpo_alignment_handbook,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv mistral_7b_orpo_alignment_handbook,HFv1 GSM8K,41.32,,hf_open_llm_v1_240829_frozen.csv mistral_7b_orpo_alignment_handbook,HFv1 HellaSwag,83.96,,hf_open_llm_v1_240829_frozen.csv mistral_7b_orpo_alignment_handbook,HFv1 MMLU,63.49,,hf_open_llm_v1_240829_frozen.csv mistral_7b_orpo_alignment_handbook,HFv1 TruthfulQA,53.87,,hf_open_llm_v1_240829_frozen.csv mistral_7b_orpo_alignment_handbook,HFv1 Winogrande,77.82,,hf_open_llm_v1_240829_frozen.csv mistral_7b_orpo_capybara_reproduction,HF OpenLLM v1,60.36,,hf_open_llm_v1_240829_frozen.csv mistral_7b_orpo_capybara_reproduction,HFv1 ARC,58.79,,hf_open_llm_v1_240829_frozen.csv mistral_7b_orpo_capybara_reproduction,HFv1 GSM8K,28.43,,hf_open_llm_v1_240829_frozen.csv mistral_7b_orpo_capybara_reproduction,HFv1 HellaSwag,83.57,,hf_open_llm_v1_240829_frozen.csv mistral_7b_orpo_capybara_reproduction,HFv1 MMLU,61.15,,hf_open_llm_v1_240829_frozen.csv mistral_7b_orpo_capybara_reproduction,HFv1 TruthfulQA,54.13,,hf_open_llm_v1_240829_frozen.csv mistral_7b_orpo_capybara_reproduction,HFv1 Winogrande,76.09,,hf_open_llm_v1_240829_frozen.csv mistral_7b_platypus1k,HF OpenLLM v1,58.19,,hf_open_llm_v1_240829_frozen.csv mistral_7b_platypus1k,HFv1 ARC,61.6,,hf_open_llm_v1_240829_frozen.csv mistral_7b_platypus1k,HFv1 GSM8K,16.38,,hf_open_llm_v1_240829_frozen.csv mistral_7b_platypus1k,HFv1 HellaSwag,82.93,,hf_open_llm_v1_240829_frozen.csv mistral_7b_platypus1k,HFv1 MMLU,63.16,,hf_open_llm_v1_240829_frozen.csv mistral_7b_platypus1k,HFv1 TruthfulQA,46.96,,hf_open_llm_v1_240829_frozen.csv mistral_7b_platypus1k,HFv1 Winogrande,78.14,,hf_open_llm_v1_240829_frozen.csv mistral_7b_platypus_fp16,HF OpenLLM v1,58.71,,hf_open_llm_v1_240829_frozen.csv mistral_7b_platypus_fp16,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv mistral_7b_platypus_fp16,HFv1 GSM8K,17.36,,hf_open_llm_v1_240829_frozen.csv mistral_7b_platypus_fp16,HFv1 HellaSwag,84.15,,hf_open_llm_v1_240829_frozen.csv mistral_7b_platypus_fp16,HFv1 MMLU,64.11,,hf_open_llm_v1_240829_frozen.csv mistral_7b_platypus_fp16,HFv1 TruthfulQA,45.07,,hf_open_llm_v1_240829_frozen.csv mistral_7b_platypus_fp16,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv mistral_7b_selfplay_v0,HF OpenLLM v1,57.82,,hf_open_llm_v1_240829_frozen.csv mistral_7b_selfplay_v0,HFv1 
ARC,54.69,,hf_open_llm_v1_240829_frozen.csv mistral_7b_selfplay_v0,HFv1 GSM8K,31.24,,hf_open_llm_v1_240829_frozen.csv mistral_7b_selfplay_v0,HFv1 HellaSwag,75.69,,hf_open_llm_v1_240829_frozen.csv mistral_7b_selfplay_v0,HFv1 MMLU,55.4,,hf_open_llm_v1_240829_frozen.csv mistral_7b_selfplay_v0,HFv1 TruthfulQA,56.28,,hf_open_llm_v1_240829_frozen.csv mistral_7b_selfplay_v0,HFv1 Winogrande,73.64,,hf_open_llm_v1_240829_frozen.csv mistral_7b_sft_dpo_v0,HF OpenLLM v1,72.17,,hf_open_llm_v1_240829_frozen.csv mistral_7b_sft_dpo_v0,HFv1 ARC,66.3,,hf_open_llm_v1_240829_frozen.csv mistral_7b_sft_dpo_v0,HFv1 GSM8K,65.81,,hf_open_llm_v1_240829_frozen.csv mistral_7b_sft_dpo_v0,HFv1 HellaSwag,84.9,,hf_open_llm_v1_240829_frozen.csv mistral_7b_sft_dpo_v0,HFv1 MMLU,64.53,,hf_open_llm_v1_240829_frozen.csv mistral_7b_sft_dpo_v0,HFv1 TruthfulQA,69.72,,hf_open_llm_v1_240829_frozen.csv mistral_7b_sft_dpo_v0,HFv1 Winogrande,81.77,,hf_open_llm_v1_240829_frozen.csv mistral_7b_sft_open_orca_flan_50k,HF OpenLLM v1,53.7,,hf_open_llm_v1_240829_frozen.csv mistral_7b_sft_open_orca_flan_50k,HFv1 ARC,58.79,,hf_open_llm_v1_240829_frozen.csv mistral_7b_sft_open_orca_flan_50k,HFv1 GSM8K,10.31,,hf_open_llm_v1_240829_frozen.csv mistral_7b_sft_open_orca_flan_50k,HFv1 HellaSwag,81.92,,hf_open_llm_v1_240829_frozen.csv mistral_7b_sft_open_orca_flan_50k,HFv1 MMLU,55.72,,hf_open_llm_v1_240829_frozen.csv mistral_7b_sft_open_orca_flan_50k,HFv1 TruthfulQA,37.49,,hf_open_llm_v1_240829_frozen.csv mistral_7b_sft_open_orca_flan_50k,HFv1 Winogrande,77.98,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1,HF OpenLLM v1,60.97,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1,HFv1 ARC,59.98,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1,HFv1 GSM8K,37.83,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1,HFv1 HellaSwag,83.31,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1,HFv1 MMLU,64.16,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1,HFv1 TruthfulQA,42.15,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1,HFv1 Winogrande,78.37,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_dpo,HF OpenLLM v1,61.3,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_dpo,HFv1 ARC,60.32,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_dpo,HFv1 GSM8K,37.23,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_dpo,HFv1 HellaSwag,83.69,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_dpo,HFv1 MMLU,64.01,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_dpo,HFv1 TruthfulQA,43.53,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_dpo,HFv1 Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_half_naive_a,HF OpenLLM v1,60.79,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_half_naive_a,HFv1 ARC,60.32,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_half_naive_a,HFv1 GSM8K,36.85,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_half_naive_a,HFv1 HellaSwag,83.22,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_half_naive_a,HFv1 MMLU,64.16,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_half_naive_a,HFv1 TruthfulQA,42.28,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_half_naive_a,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_layla_v4,HF OpenLLM v1,64.69,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_layla_v4,HFv1 ARC,62.29,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_layla_v4,HFv1 GSM8K,55.5,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_layla_v4,HFv1 HellaSwag,83.36,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_layla_v4,HFv1 MMLU,64.32,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_layla_v4,HFv1 
TruthfulQA,43.14,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_layla_v4,HFv1 Winogrande,79.56,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_open_platypus,HF OpenLLM v1,58.92,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_open_platypus,HFv1 ARC,62.37,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_open_platypus,HFv1 GSM8K,17.29,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_open_platypus,HFv1 HellaSwag,85.08,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_open_platypus,HFv1 MMLU,63.79,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_open_platypus,HFv1 TruthfulQA,47.33,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_open_platypus,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_orpo,HF OpenLLM v1,64.16,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_orpo,HFv1 ARC,62.54,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_orpo,HFv1 GSM8K,43.29,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_orpo,HFv1 HellaSwag,83.78,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_orpo,HFv1 MMLU,63.57,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_orpo,HFv1 TruthfulQA,52.6,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_orpo,HFv1 Winogrande,79.16,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_signtensors_1_over_2,HF OpenLLM v1,60.48,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_signtensors_1_over_2,HFv1 ARC,60.32,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_signtensors_1_over_2,HFv1 GSM8K,35.94,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_signtensors_1_over_2,HFv1 HellaSwag,83.12,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_signtensors_1_over_2,HFv1 MMLU,64.11,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_signtensors_1_over_2,HFv1 TruthfulQA,41.94,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_signtensors_1_over_2,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_signtensors_1_over_4,HF OpenLLM v1,29.16,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_signtensors_1_over_4,HFv1 ARC,28.67,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_signtensors_1_over_4,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_signtensors_1_over_4,HFv1 HellaSwag,25.64,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_signtensors_1_over_4,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_signtensors_1_over_4,HFv1 TruthfulQA,47.95,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_1_signtensors_1_over_4,HFv1 Winogrande,49.57,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_2,HF OpenLLM v1,60.41,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_2,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_2,HFv1 GSM8K,34.95,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_2,HFv1 HellaSwag,83.08,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_2,HFv1 MMLU,63.69,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_2,HFv1 TruthfulQA,41.8,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_2,HFv1 Winogrande,78.69,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_3,HF OpenLLM v1,60.28,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_3,HFv1 ARC,60.49,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_3,HFv1 GSM8K,34.5,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_3,HFv1 HellaSwag,82.99,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_3,HFv1 MMLU,63.46,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_3,HFv1 TruthfulQA,41.79,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v0_3,HFv1 Winogrande,78.45,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v2_selfplay,HF OpenLLM v1,65.67,,hf_open_llm_v1_240829_frozen.csv 
mistral_7b_v2_selfplay,HFv1 ARC,62.97,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v2_selfplay,HFv1 GSM8K,39.8,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v2_selfplay,HFv1 HellaSwag,84.97,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v2_selfplay,HFv1 MMLU,60.71,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v2_selfplay,HFv1 TruthfulQA,68.24,,hf_open_llm_v1_240829_frozen.csv mistral_7b_v2_selfplay,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv mistral_8_from_mixtral_8x7b_v0_1,HF OpenLLM v1,29.91,,hf_open_llm_v1_240829_frozen.csv mistral_8_from_mixtral_8x7b_v0_1,HFv1 ARC,29.01,,hf_open_llm_v1_240829_frozen.csv mistral_8_from_mixtral_8x7b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv mistral_8_from_mixtral_8x7b_v0_1,HFv1 HellaSwag,26.23,,hf_open_llm_v1_240829_frozen.csv mistral_8_from_mixtral_8x7b_v0_1,HFv1 MMLU,25.29,,hf_open_llm_v1_240829_frozen.csv mistral_8_from_mixtral_8x7b_v0_1,HFv1 TruthfulQA,48.12,,hf_open_llm_v1_240829_frozen.csv mistral_8_from_mixtral_8x7b_v0_1,HFv1 Winogrande,50.83,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr03_32_sig,HF OpenLLM v1,60.95,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr03_32_sig,HFv1 ARC,59.98,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr03_32_sig,HFv1 GSM8K,35.33,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr03_32_sig,HFv1 HellaSwag,83.22,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr03_32_sig,HFv1 MMLU,61.22,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr03_32_sig,HFv1 TruthfulQA,47.9,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr03_32_sig,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr05_32_sig,HF OpenLLM v1,60.85,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr05_32_sig,HFv1 ARC,59.9,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr05_32_sig,HFv1 GSM8K,34.19,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr05_32_sig,HFv1 HellaSwag,83.28,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr05_32_sig,HFv1 MMLU,60.86,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr05_32_sig,HFv1 TruthfulQA,49.69,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr05_32_sig,HFv1 Winogrande,77.19,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr10_32_sig,HF OpenLLM v1,60.43,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr10_32_sig,HFv1 ARC,58.62,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr10_32_sig,HFv1 GSM8K,37.83,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr10_32_sig,HFv1 HellaSwag,82.57,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr10_32_sig,HFv1 MMLU,61.35,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr10_32_sig,HFv1 TruthfulQA,44.34,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr10_32_sig,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr20_32_sig,HF OpenLLM v1,60.43,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr20_32_sig,HFv1 ARC,58.7,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr20_32_sig,HFv1 GSM8K,37.6,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr20_32_sig,HFv1 HellaSwag,82.54,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr20_32_sig,HFv1 MMLU,61.41,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr20_32_sig,HFv1 TruthfulQA,44.75,,hf_open_llm_v1_240829_frozen.csv mistral_dmbr20_32_sig,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv mistral_dolphin_2_8_grok_instract_2_7b_slerp,HF OpenLLM v1,64.99,,hf_open_llm_v1_240829_frozen.csv mistral_dolphin_2_8_grok_instract_2_7b_slerp,HFv1 ARC,63.91,,hf_open_llm_v1_240829_frozen.csv mistral_dolphin_2_8_grok_instract_2_7b_slerp,HFv1 GSM8K,48.67,,hf_open_llm_v1_240829_frozen.csv mistral_dolphin_2_8_grok_instract_2_7b_slerp,HFv1 HellaSwag,84.42,,hf_open_llm_v1_240829_frozen.csv 
mistral_dolphin_2_8_grok_instract_2_7b_slerp,HFv1 MMLU,63.01,,hf_open_llm_v1_240829_frozen.csv mistral_dolphin_2_8_grok_instract_2_7b_slerp,HFv1 TruthfulQA,51.74,,hf_open_llm_v1_240829_frozen.csv mistral_dolphin_2_8_grok_instract_2_7b_slerp,HFv1 Winogrande,78.22,,hf_open_llm_v1_240829_frozen.csv mistral_evolved_11b_v0_1,HF OpenLLM v1,65.8,,hf_open_llm_v1_240829_frozen.csv mistral_evolved_11b_v0_1,HFv1 ARC,62.2,,hf_open_llm_v1_240829_frozen.csv mistral_evolved_11b_v0_1,HFv1 GSM8K,49.81,,hf_open_llm_v1_240829_frozen.csv mistral_evolved_11b_v0_1,HFv1 HellaSwag,84.65,,hf_open_llm_v1_240829_frozen.csv mistral_evolved_11b_v0_1,HFv1 MMLU,63.11,,hf_open_llm_v1_240829_frozen.csv mistral_evolved_11b_v0_1,HFv1 TruthfulQA,59.23,,hf_open_llm_v1_240829_frozen.csv mistral_evolved_11b_v0_1,HFv1 Winogrande,75.77,,hf_open_llm_v1_240829_frozen.csv mistral_grok_instract_2_7b_slerp,HF OpenLLM v1,62.87,,hf_open_llm_v1_240829_frozen.csv mistral_grok_instract_2_7b_slerp,HFv1 ARC,62.8,,hf_open_llm_v1_240829_frozen.csv mistral_grok_instract_2_7b_slerp,HFv1 GSM8K,39.88,,hf_open_llm_v1_240829_frozen.csv mistral_grok_instract_2_7b_slerp,HFv1 HellaSwag,83.03,,hf_open_llm_v1_240829_frozen.csv mistral_grok_instract_2_7b_slerp,HFv1 MMLU,61.04,,hf_open_llm_v1_240829_frozen.csv mistral_grok_instract_2_7b_slerp,HFv1 TruthfulQA,53.51,,hf_open_llm_v1_240829_frozen.csv mistral_grok_instract_2_7b_slerp,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_7b_v0_2_chatalpaca,HF OpenLLM v1,61.21,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_7b_v0_2_chatalpaca,HFv1 ARC,56.74,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_7b_v0_2_chatalpaca,HFv1 GSM8K,37.6,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_7b_v0_2_chatalpaca,HFv1 HellaSwag,80.82,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_7b_v0_2_chatalpaca,HFv1 MMLU,59.1,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_7b_v0_2_chatalpaca,HFv1 TruthfulQA,55.86,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_7b_v0_2_chatalpaca,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_7b_v0_2_chatalpaca_dpo2,HF OpenLLM v1,64.05,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_7b_v0_2_chatalpaca_dpo2,HFv1 ARC,61.86,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_7b_v0_2_chatalpaca_dpo2,HFv1 GSM8K,37.0,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_7b_v0_2_chatalpaca_dpo2,HFv1 HellaSwag,83.71,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_7b_v0_2_chatalpaca_dpo2,HFv1 MMLU,59.19,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_7b_v0_2_chatalpaca_dpo2,HFv1 TruthfulQA,64.08,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_7b_v0_2_chatalpaca_dpo2,HFv1 Winogrande,78.45,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_7b_v0_2_chatalpacav2_4bit,HF OpenLLM v1,65.34,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_7b_v0_2_chatalpacav2_4bit,HFv1 ARC,62.12,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_7b_v0_2_chatalpacav2_4bit,HFv1 GSM8K,40.33,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_7b_v0_2_chatalpacav2_4bit,HFv1 HellaSwag,84.55,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_7b_v0_2_chatalpacav2_4bit,HFv1 MMLU,60.66,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_7b_v0_2_chatalpacav2_4bit,HFv1 TruthfulQA,67.29,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_7b_v0_2_chatalpacav2_4bit,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_frankenmerge,HF OpenLLM v1,58.96,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_frankenmerge,HFv1 ARC,58.19,,hf_open_llm_v1_240829_frozen.csv 
mistral_instruct_frankenmerge,HFv1 GSM8K,11.22,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_frankenmerge,HFv1 HellaSwag,83.26,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_frankenmerge,HFv1 MMLU,59.53,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_frankenmerge,HFv1 TruthfulQA,66.48,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_frankenmerge,HFv1 Winogrande,75.06,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_moe_experimental,HF OpenLLM v1,61.39,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_moe_experimental,HFv1 ARC,61.01,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_moe_experimental,HFv1 GSM8K,31.08,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_moe_experimental,HFv1 HellaSwag,81.55,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_moe_experimental,HFv1 MMLU,58.22,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_moe_experimental,HFv1 TruthfulQA,60.4,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_moe_experimental,HFv1 Winogrande,76.09,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_slerp,HF OpenLLM v1,59.08,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_slerp,HFv1 ARC,57.42,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_slerp,HFv1 GSM8K,30.78,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_slerp,HFv1 HellaSwag,78.34,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_slerp,HFv1 MMLU,55.19,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_slerp,HFv1 TruthfulQA,57.61,,hf_open_llm_v1_240829_frozen.csv mistral_instruct_slerp,HFv1 Winogrande,75.14,,hf_open_llm_v1_240829_frozen.csv mistral_kmmbr_32_sig,HF OpenLLM v1,60.78,,hf_open_llm_v1_240829_frozen.csv mistral_kmmbr_32_sig,HFv1 ARC,58.96,,hf_open_llm_v1_240829_frozen.csv mistral_kmmbr_32_sig,HFv1 GSM8K,37.53,,hf_open_llm_v1_240829_frozen.csv mistral_kmmbr_32_sig,HFv1 HellaSwag,82.84,,hf_open_llm_v1_240829_frozen.csv mistral_kmmbr_32_sig,HFv1 MMLU,61.39,,hf_open_llm_v1_240829_frozen.csv mistral_kmmbr_32_sig,HFv1 TruthfulQA,46.2,,hf_open_llm_v1_240829_frozen.csv mistral_kmmbr_32_sig,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv mistral_mbr_32_sig,HF OpenLLM v1,60.79,,hf_open_llm_v1_240829_frozen.csv mistral_mbr_32_sig,HFv1 ARC,59.64,,hf_open_llm_v1_240829_frozen.csv mistral_mbr_32_sig,HFv1 GSM8K,36.09,,hf_open_llm_v1_240829_frozen.csv mistral_mbr_32_sig,HFv1 HellaSwag,83.1,,hf_open_llm_v1_240829_frozen.csv mistral_mbr_32_sig,HFv1 MMLU,61.43,,hf_open_llm_v1_240829_frozen.csv mistral_mbr_32_sig,HFv1 TruthfulQA,46.31,,hf_open_llm_v1_240829_frozen.csv mistral_mbr_32_sig,HFv1 Winogrande,78.14,,hf_open_llm_v1_240829_frozen.csv mistral_megamerge_dare_7b,HF OpenLLM v1,48.93,,hf_open_llm_v1_240829_frozen.csv mistral_megamerge_dare_7b,HFv1 ARC,55.29,,hf_open_llm_v1_240829_frozen.csv mistral_megamerge_dare_7b,HFv1 GSM8K,6.6,,hf_open_llm_v1_240829_frozen.csv mistral_megamerge_dare_7b,HFv1 HellaSwag,70.48,,hf_open_llm_v1_240829_frozen.csv mistral_megamerge_dare_7b,HFv1 MMLU,43.05,,hf_open_llm_v1_240829_frozen.csv mistral_megamerge_dare_7b,HFv1 TruthfulQA,51.08,,hf_open_llm_v1_240829_frozen.csv mistral_megamerge_dare_7b,HFv1 Winogrande,67.09,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_4,HF OpenLLM v1,63.85,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_4,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_4,HFv1 GSM8K,36.85,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_4,HFv1 HellaSwag,85.18,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_4,HFv1 MMLU,63.57,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_4,HFv1 
TruthfulQA,51.32,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_4,HFv1 Winogrande,80.11,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_4_laser,HF OpenLLM v1,63.89,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_4_laser,HFv1 ARC,66.89,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_4_laser,HFv1 GSM8K,36.54,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_4_laser,HFv1 HellaSwag,85.23,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_4_laser,HFv1 MMLU,63.47,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_4_laser,HFv1 TruthfulQA,50.91,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_4_laser,HFv1 Winogrande,80.27,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_5,HF OpenLLM v1,61.98,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_5,HFv1 ARC,65.44,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_5,HFv1 GSM8K,36.54,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_5,HFv1 HellaSwag,84.66,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_5,HFv1 MMLU,62.56,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_5,HFv1 TruthfulQA,42.43,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_5,HFv1 Winogrande,80.27,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_7,HF OpenLLM v1,58.74,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_7,HFv1 ARC,65.87,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_7,HFv1 GSM8K,25.09,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_7,HFv1 HellaSwag,84.4,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_7,HFv1 MMLU,57.6,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_7,HFv1 TruthfulQA,39.91,,hf_open_llm_v1_240829_frozen.csv mistral_neuraldpo_v0_7,HFv1 Winogrande,79.56,,hf_open_llm_v1_240829_frozen.csv mistral_nucleus09_32_sig,HF OpenLLM v1,60.93,,hf_open_llm_v1_240829_frozen.csv mistral_nucleus09_32_sig,HFv1 ARC,59.73,,hf_open_llm_v1_240829_frozen.csv mistral_nucleus09_32_sig,HFv1 GSM8K,36.85,,hf_open_llm_v1_240829_frozen.csv mistral_nucleus09_32_sig,HFv1 HellaSwag,83.14,,hf_open_llm_v1_240829_frozen.csv mistral_nucleus09_32_sig,HFv1 MMLU,61.42,,hf_open_llm_v1_240829_frozen.csv mistral_nucleus09_32_sig,HFv1 TruthfulQA,46.37,,hf_open_llm_v1_240829_frozen.csv mistral_nucleus09_32_sig,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_alpha,HF OpenLLM v1,55.41,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_alpha,HFv1 ARC,60.41,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_alpha,HFv1 GSM8K,0.15,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_alpha,HFv1 HellaSwag,85.1,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_alpha,HFv1 MMLU,61.11,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_alpha,HFv1 TruthfulQA,48.33,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_alpha,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_beta,HF OpenLLM v1,62.53,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_beta,HFv1 ARC,61.18,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_beta,HFv1 GSM8K,39.8,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_beta,HFv1 HellaSwag,84.03,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_beta,HFv1 MMLU,63.26,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_beta,HFv1 TruthfulQA,47.69,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_beta,HFv1 Winogrande,79.24,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_capybara_3k,HF OpenLLM v1,61.77,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_capybara_3k,HFv1 ARC,63.57,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_capybara_3k,HFv1 GSM8K,36.85,,hf_open_llm_v1_240829_frozen.csv 
mistral_orpo_capybara_3k,HFv1 HellaSwag,85.98,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_capybara_3k,HFv1 MMLU,62.91,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_capybara_3k,HFv1 TruthfulQA,43.83,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_capybara_3k,HFv1 Winogrande,77.51,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_capybara_7k,HF OpenLLM v1,63.36,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_capybara_7k,HFv1 ARC,63.48,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_capybara_7k,HFv1 GSM8K,42.38,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_capybara_7k,HFv1 HellaSwag,85.34,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_capybara_7k,HFv1 MMLU,63.41,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_capybara_7k,HFv1 TruthfulQA,45.98,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_capybara_7k,HFv1 Winogrande,79.56,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_mix_21k,HF OpenLLM v1,61.81,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_mix_21k,HFv1 ARC,59.39,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_mix_21k,HFv1 GSM8K,40.03,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_mix_21k,HFv1 HellaSwag,83.45,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_mix_21k,HFv1 MMLU,63.33,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_mix_21k,HFv1 TruthfulQA,45.18,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_mix_21k,HFv1 Winogrande,79.48,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_mix_7k,HF OpenLLM v1,63.04,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_mix_7k,HFv1 ARC,61.95,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_mix_7k,HFv1 GSM8K,41.24,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_mix_7k,HFv1 HellaSwag,85.51,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_mix_7k,HFv1 MMLU,62.89,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_mix_7k,HFv1 TruthfulQA,46.91,,hf_open_llm_v1_240829_frozen.csv mistral_orpo_mix_7k,HFv1 Winogrande,79.72,,hf_open_llm_v1_240829_frozen.csv mistral_plus_7b,HF OpenLLM v1,59.52,,hf_open_llm_v1_240829_frozen.csv mistral_plus_7b,HFv1 ARC,62.2,,hf_open_llm_v1_240829_frozen.csv mistral_plus_7b,HFv1 GSM8K,33.51,,hf_open_llm_v1_240829_frozen.csv mistral_plus_7b,HFv1 HellaSwag,84.24,,hf_open_llm_v1_240829_frozen.csv mistral_plus_7b,HFv1 MMLU,63.63,,hf_open_llm_v1_240829_frozen.csv mistral_plus_7b,HFv1 TruthfulQA,35.8,,hf_open_llm_v1_240829_frozen.csv mistral_plus_7b,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv mistral_portuguese_luana_7b_chat,HF OpenLLM v1,61.76,,hf_open_llm_v1_240829_frozen.csv mistral_portuguese_luana_7b_chat,HFv1 ARC,59.3,,hf_open_llm_v1_240829_frozen.csv mistral_portuguese_luana_7b_chat,HFv1 GSM8K,38.21,,hf_open_llm_v1_240829_frozen.csv mistral_portuguese_luana_7b_chat,HFv1 HellaSwag,81.4,,hf_open_llm_v1_240829_frozen.csv mistral_portuguese_luana_7b_chat,HFv1 MMLU,60.84,,hf_open_llm_v1_240829_frozen.csv mistral_portuguese_luana_7b_chat,HFv1 TruthfulQA,54.6,,hf_open_llm_v1_240829_frozen.csv mistral_portuguese_luana_7b_chat,HFv1 Winogrande,76.24,,hf_open_llm_v1_240829_frozen.csv mistral_pro_8b_v0_1,HF OpenLLM v1,61.06,,hf_open_llm_v1_240829_frozen.csv mistral_pro_8b_v0_1,HFv1 ARC,62.2,,hf_open_llm_v1_240829_frozen.csv mistral_pro_8b_v0_1,HFv1 GSM8K,34.19,,hf_open_llm_v1_240829_frozen.csv mistral_pro_8b_v0_1,HFv1 HellaSwag,82.13,,hf_open_llm_v1_240829_frozen.csv mistral_pro_8b_v0_1,HFv1 MMLU,61.74,,hf_open_llm_v1_240829_frozen.csv mistral_pro_8b_v0_1,HFv1 TruthfulQA,49.32,,hf_open_llm_v1_240829_frozen.csv mistral_pro_8b_v0_1,HFv1 Winogrande,76.8,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_dpo,HF OpenLLM v1,65.59,,hf_open_llm_v1_240829_frozen.csv 
mistral_rank16_dpo,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_dpo,HFv1 GSM8K,39.95,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_dpo,HFv1 HellaSwag,84.97,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_dpo,HFv1 MMLU,60.3,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_dpo,HFv1 TruthfulQA,68.3,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_dpo,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_invert,HF OpenLLM v1,61.18,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_invert,HFv1 ARC,55.63,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_invert,HFv1 GSM8K,35.41,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_invert,HFv1 HellaSwag,81.44,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_invert,HFv1 MMLU,60.0,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_invert,HFv1 TruthfulQA,57.49,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_invert,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_packing,HF OpenLLM v1,65.23,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_packing,HFv1 ARC,62.54,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_packing,HFv1 GSM8K,39.12,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_packing,HFv1 HellaSwag,84.78,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_packing,HFv1 MMLU,60.36,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_packing,HFv1 TruthfulQA,67.44,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_packing,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_sft,HF OpenLLM v1,59.61,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_sft,HFv1 ARC,55.03,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_sft,HFv1 GSM8K,35.18,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_sft,HFv1 HellaSwag,81.21,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_sft,HFv1 MMLU,60.43,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_sft,HFv1 TruthfulQA,48.76,,hf_open_llm_v1_240829_frozen.csv mistral_rank16_sft,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv mistral_rank32_dpo,HF OpenLLM v1,65.42,,hf_open_llm_v1_240829_frozen.csv mistral_rank32_dpo,HFv1 ARC,63.14,,hf_open_llm_v1_240829_frozen.csv mistral_rank32_dpo,HFv1 GSM8K,38.89,,hf_open_llm_v1_240829_frozen.csv mistral_rank32_dpo,HFv1 HellaSwag,85.01,,hf_open_llm_v1_240829_frozen.csv mistral_rank32_dpo,HFv1 MMLU,60.57,,hf_open_llm_v1_240829_frozen.csv mistral_rank32_dpo,HFv1 TruthfulQA,68.29,,hf_open_llm_v1_240829_frozen.csv mistral_rank32_dpo,HFv1 Winogrande,76.64,,hf_open_llm_v1_240829_frozen.csv mistral_rank32_invert,HF OpenLLM v1,60.85,,hf_open_llm_v1_240829_frozen.csv mistral_rank32_invert,HFv1 ARC,55.72,,hf_open_llm_v1_240829_frozen.csv mistral_rank32_invert,HFv1 GSM8K,35.18,,hf_open_llm_v1_240829_frozen.csv mistral_rank32_invert,HFv1 HellaSwag,81.2,,hf_open_llm_v1_240829_frozen.csv mistral_rank32_invert,HFv1 MMLU,59.88,,hf_open_llm_v1_240829_frozen.csv mistral_rank32_invert,HFv1 TruthfulQA,56.18,,hf_open_llm_v1_240829_frozen.csv mistral_rank32_invert,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv mistral_rank32_sft,HF OpenLLM v1,59.39,,hf_open_llm_v1_240829_frozen.csv mistral_rank32_sft,HFv1 ARC,54.95,,hf_open_llm_v1_240829_frozen.csv mistral_rank32_sft,HFv1 GSM8K,34.95,,hf_open_llm_v1_240829_frozen.csv mistral_rank32_sft,HFv1 HellaSwag,80.97,,hf_open_llm_v1_240829_frozen.csv mistral_rank32_sft,HFv1 MMLU,60.42,,hf_open_llm_v1_240829_frozen.csv mistral_rank32_sft,HFv1 TruthfulQA,48.02,,hf_open_llm_v1_240829_frozen.csv mistral_rank32_sft,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_dpo,HF OpenLLM 
v1,65.48,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_dpo,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_dpo,HFv1 GSM8K,38.89,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_dpo,HFv1 HellaSwag,85.11,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_dpo,HFv1 MMLU,60.32,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_dpo,HFv1 TruthfulQA,68.61,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_dpo,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_invert,HF OpenLLM v1,61.85,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_invert,HFv1 ARC,56.48,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_invert,HFv1 GSM8K,36.85,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_invert,HFv1 HellaSwag,81.68,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_invert,HFv1 MMLU,60.26,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_invert,HFv1 TruthfulQA,58.32,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_invert,HFv1 Winogrande,77.51,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_packing,HF OpenLLM v1,65.14,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_packing,HFv1 ARC,62.54,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_packing,HFv1 GSM8K,39.04,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_packing,HFv1 HellaSwag,84.77,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_packing,HFv1 MMLU,60.38,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_packing,HFv1 TruthfulQA,67.31,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_packing,HFv1 Winogrande,76.8,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_sft,HF OpenLLM v1,59.41,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_sft,HFv1 ARC,55.8,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_sft,HFv1 GSM8K,32.45,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_sft,HFv1 HellaSwag,81.21,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_sft,HFv1 MMLU,60.5,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_sft,HFv1 TruthfulQA,49.07,,hf_open_llm_v1_240829_frozen.csv mistral_rank8_sft,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv mistral_sft_v3,HF OpenLLM v1,60.93,,hf_open_llm_v1_240829_frozen.csv mistral_sft_v3,HFv1 ARC,61.35,,hf_open_llm_v1_240829_frozen.csv mistral_sft_v3,HFv1 GSM8K,32.45,,hf_open_llm_v1_240829_frozen.csv mistral_sft_v3,HFv1 HellaSwag,82.23,,hf_open_llm_v1_240829_frozen.csv mistral_sft_v3,HFv1 MMLU,63.4,,hf_open_llm_v1_240829_frozen.csv mistral_sft_v3,HFv1 TruthfulQA,48.49,,hf_open_llm_v1_240829_frozen.csv mistral_sft_v3,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv mistral_trismegistus_7b,HF OpenLLM v1,52.66,,hf_open_llm_v1_240829_frozen.csv mistral_trismegistus_7b,HFv1 ARC,54.1,,hf_open_llm_v1_240829_frozen.csv mistral_trismegistus_7b,HFv1 GSM8K,9.93,,hf_open_llm_v1_240829_frozen.csv mistral_trismegistus_7b,HFv1 HellaSwag,77.91,,hf_open_llm_v1_240829_frozen.csv mistral_trismegistus_7b,HFv1 MMLU,54.49,,hf_open_llm_v1_240829_frozen.csv mistral_trismegistus_7b,HFv1 TruthfulQA,49.36,,hf_open_llm_v1_240829_frozen.csv mistral_trismegistus_7b,HFv1 Winogrande,70.17,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_0_7b,HF OpenLLM v1,59.09,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_0_7b,HFv1 ARC,62.2,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_0_7b,HFv1 GSM8K,18.5,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_0_7b,HFv1 HellaSwag,84.1,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_0_7b,HFv1 MMLU,64.14,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_0_7b,HFv1 TruthfulQA,46.94,,hf_open_llm_v1_240829_frozen.csv 
mistral_v0_1_peanutbutter_v0_0_0_7b,HFv1 Winogrande,78.69,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_2_7b,HF OpenLLM v1,58.66,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_2_7b,HFv1 ARC,61.77,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_2_7b,HFv1 GSM8K,17.44,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_2_7b,HFv1 HellaSwag,84.11,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_2_7b,HFv1 MMLU,64.38,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_2_7b,HFv1 TruthfulQA,45.92,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_2_7b,HFv1 Winogrande,78.37,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_5_dpo_7b_qlora,HF OpenLLM v1,58.65,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_5_dpo_7b_qlora,HFv1 ARC,61.26,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_5_dpo_7b_qlora,HFv1 GSM8K,18.12,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_5_dpo_7b_qlora,HFv1 HellaSwag,84.52,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_5_dpo_7b_qlora,HFv1 MMLU,63.63,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_5_dpo_7b_qlora,HFv1 TruthfulQA,45.75,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_5_dpo_7b_qlora,HFv1 Winogrande,78.61,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_5_sft_7b_qlora,HF OpenLLM v1,58.24,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_5_sft_7b_qlora,HFv1 ARC,60.75,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_5_sft_7b_qlora,HFv1 GSM8K,17.13,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_5_sft_7b_qlora,HFv1 HellaSwag,84.24,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_5_sft_7b_qlora,HFv1 MMLU,63.66,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_5_sft_7b_qlora,HFv1 TruthfulQA,44.94,,hf_open_llm_v1_240829_frozen.csv mistral_v0_1_peanutbutter_v0_0_5_sft_7b_qlora,HFv1 Winogrande,78.69,,hf_open_llm_v1_240829_frozen.csv mistral_v2_7b_selfplay_low_tmp,HF OpenLLM v1,65.63,,hf_open_llm_v1_240829_frozen.csv mistral_v2_7b_selfplay_low_tmp,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv mistral_v2_7b_selfplay_low_tmp,HFv1 GSM8K,39.58,,hf_open_llm_v1_240829_frozen.csv mistral_v2_7b_selfplay_low_tmp,HFv1 HellaSwag,84.91,,hf_open_llm_v1_240829_frozen.csv mistral_v2_7b_selfplay_low_tmp,HFv1 MMLU,60.76,,hf_open_llm_v1_240829_frozen.csv mistral_v2_7b_selfplay_low_tmp,HFv1 TruthfulQA,68.13,,hf_open_llm_v1_240829_frozen.csv mistral_v2_7b_selfplay_low_tmp,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv mistral_v2_7b_selfplay_v0,HF OpenLLM v1,65.72,,hf_open_llm_v1_240829_frozen.csv mistral_v2_7b_selfplay_v0,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv mistral_v2_7b_selfplay_v0,HFv1 GSM8K,40.26,,hf_open_llm_v1_240829_frozen.csv mistral_v2_7b_selfplay_v0,HFv1 HellaSwag,84.88,,hf_open_llm_v1_240829_frozen.csv mistral_v2_7b_selfplay_v0,HFv1 MMLU,60.78,,hf_open_llm_v1_240829_frozen.csv mistral_v2_7b_selfplay_v0,HFv1 TruthfulQA,68.14,,hf_open_llm_v1_240829_frozen.csv mistral_v2_7b_selfplay_v0,HFv1 Winogrande,77.19,,hf_open_llm_v1_240829_frozen.csv mistral_v2_7b_selfplay_v0_test,HF OpenLLM v1,65.61,,hf_open_llm_v1_240829_frozen.csv mistral_v2_7b_selfplay_v0_test,HFv1 ARC,62.97,,hf_open_llm_v1_240829_frozen.csv mistral_v2_7b_selfplay_v0_test,HFv1 GSM8K,39.73,,hf_open_llm_v1_240829_frozen.csv mistral_v2_7b_selfplay_v0_test,HFv1 
HellaSwag,84.86,,hf_open_llm_v1_240829_frozen.csv mistral_v2_7b_selfplay_v0_test,HFv1 MMLU,60.64,,hf_open_llm_v1_240829_frozen.csv mistral_v2_7b_selfplay_v0_test,HFv1 TruthfulQA,67.91,,hf_open_llm_v1_240829_frozen.csv mistral_v2_7b_selfplay_v0_test,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv mistralbeagle_rs_7b_v0_1,HF OpenLLM v1,67.75,,hf_open_llm_v1_240829_frozen.csv mistralbeagle_rs_7b_v0_1,HFv1 ARC,69.45,,hf_open_llm_v1_240829_frozen.csv mistralbeagle_rs_7b_v0_1,HFv1 GSM8K,37.91,,hf_open_llm_v1_240829_frozen.csv mistralbeagle_rs_7b_v0_1,HFv1 HellaSwag,84.62,,hf_open_llm_v1_240829_frozen.csv mistralbeagle_rs_7b_v0_1,HFv1 MMLU,63.07,,hf_open_llm_v1_240829_frozen.csv mistralbeagle_rs_7b_v0_1,HFv1 TruthfulQA,69.78,,hf_open_llm_v1_240829_frozen.csv mistralbeagle_rs_7b_v0_1,HFv1 Winogrande,81.69,,hf_open_llm_v1_240829_frozen.csv mistralinstructlongish,HF OpenLLM v1,53.62,,hf_open_llm_v1_240829_frozen.csv mistralinstructlongish,HFv1 ARC,60.75,,hf_open_llm_v1_240829_frozen.csv mistralinstructlongish,HFv1 GSM8K,1.52,,hf_open_llm_v1_240829_frozen.csv mistralinstructlongish,HFv1 HellaSwag,81.86,,hf_open_llm_v1_240829_frozen.csv mistralinstructlongish,HFv1 MMLU,60.49,,hf_open_llm_v1_240829_frozen.csv mistralinstructlongish,HFv1 TruthfulQA,40.55,,hf_open_llm_v1_240829_frozen.csv mistralinstructlongish,HFv1 Winogrande,76.56,,hf_open_llm_v1_240829_frozen.csv mistraltrix_slerp,HF OpenLLM v1,73.58,,hf_open_llm_v1_240829_frozen.csv mistraltrix_slerp,HFv1 ARC,70.82,,hf_open_llm_v1_240829_frozen.csv mistraltrix_slerp,HFv1 GSM8K,71.11,,hf_open_llm_v1_240829_frozen.csv mistraltrix_slerp,HFv1 HellaSwag,87.54,,hf_open_llm_v1_240829_frozen.csv mistraltrix_slerp,HFv1 MMLU,64.98,,hf_open_llm_v1_240829_frozen.csv mistraltrix_slerp,HFv1 TruthfulQA,65.35,,hf_open_llm_v1_240829_frozen.csv mistraltrix_slerp,HFv1 Winogrande,81.69,,hf_open_llm_v1_240829_frozen.csv mistraltrix_v1,HF OpenLLM v1,73.39,,hf_open_llm_v1_240829_frozen.csv mistraltrix_v1,HFv1 ARC,72.27,,hf_open_llm_v1_240829_frozen.csv mistraltrix_v1,HFv1 GSM8K,62.77,,hf_open_llm_v1_240829_frozen.csv mistraltrix_v1,HFv1 HellaSwag,88.33,,hf_open_llm_v1_240829_frozen.csv mistraltrix_v1,HFv1 MMLU,65.24,,hf_open_llm_v1_240829_frozen.csv mistraltrix_v1,HFv1 TruthfulQA,70.73,,hf_open_llm_v1_240829_frozen.csv mistraltrix_v1,HFv1 Winogrande,80.98,,hf_open_llm_v1_240829_frozen.csv mistraltrixtest,HF OpenLLM v1,73.17,,hf_open_llm_v1_240829_frozen.csv mistraltrixtest,HFv1 ARC,72.53,,hf_open_llm_v1_240829_frozen.csv mistraltrixtest,HFv1 GSM8K,60.73,,hf_open_llm_v1_240829_frozen.csv mistraltrixtest,HFv1 HellaSwag,88.4,,hf_open_llm_v1_240829_frozen.csv mistraltrixtest,HFv1 MMLU,65.22,,hf_open_llm_v1_240829_frozen.csv mistraltrixtest,HFv1 TruthfulQA,70.77,,hf_open_llm_v1_240829_frozen.csv mistraltrixtest,HFv1 Winogrande,81.37,,hf_open_llm_v1_240829_frozen.csv mistroll_7b_v2_2,HF OpenLLM v1,76.76,,hf_open_llm_v1_240829_frozen.csv mistroll_7b_v2_2,HFv1 ARC,72.78,,hf_open_llm_v1_240829_frozen.csv mistroll_7b_v2_2,HFv1 GSM8K,71.19,,hf_open_llm_v1_240829_frozen.csv mistroll_7b_v2_2,HFv1 HellaSwag,89.16,,hf_open_llm_v1_240829_frozen.csv mistroll_7b_v2_2,HFv1 MMLU,64.35,,hf_open_llm_v1_240829_frozen.csv mistroll_7b_v2_2,HFv1 TruthfulQA,78.1,,hf_open_llm_v1_240829_frozen.csv mistroll_7b_v2_2,HFv1 Winogrande,85.0,,hf_open_llm_v1_240829_frozen.csv mistroll_7b_v2_3_notsosm4rt_16bit,HF OpenLLM v1,70.8,,hf_open_llm_v1_240829_frozen.csv mistroll_7b_v2_3_notsosm4rt_16bit,HFv1 ARC,67.06,,hf_open_llm_v1_240829_frozen.csv mistroll_7b_v2_3_notsosm4rt_16bit,HFv1 
GSM8K,66.57,,hf_open_llm_v1_240829_frozen.csv mistroll_7b_v2_3_notsosm4rt_16bit,HFv1 HellaSwag,86.24,,hf_open_llm_v1_240829_frozen.csv mistroll_7b_v2_3_notsosm4rt_16bit,HFv1 MMLU,65.12,,hf_open_llm_v1_240829_frozen.csv mistroll_7b_v2_3_notsosm4rt_16bit,HFv1 TruthfulQA,56.38,,hf_open_llm_v1_240829_frozen.csv mistroll_7b_v2_3_notsosm4rt_16bit,HFv1 Winogrande,83.43,,hf_open_llm_v1_240829_frozen.csv mixnueza_6x32m_moe,HF OpenLLM v1,28.92,,hf_open_llm_v1_240829_frozen.csv mixnueza_6x32m_moe,HFv1 ARC,21.16,,hf_open_llm_v1_240829_frozen.csv mixnueza_6x32m_moe,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv mixnueza_6x32m_moe,HFv1 HellaSwag,26.69,,hf_open_llm_v1_240829_frozen.csv mixnueza_6x32m_moe,HFv1 MMLU,25.7,,hf_open_llm_v1_240829_frozen.csv mixnueza_6x32m_moe,HFv1 TruthfulQA,47.84,,hf_open_llm_v1_240829_frozen.csv mixnueza_6x32m_moe,HFv1 Winogrande,51.85,,hf_open_llm_v1_240829_frozen.csv mixsmol_4x400m_v0_1_epoch1,HF OpenLLM v1,28.45,,hf_open_llm_v1_240829_frozen.csv mixsmol_4x400m_v0_1_epoch1,HFv1 ARC,22.87,,hf_open_llm_v1_240829_frozen.csv mixsmol_4x400m_v0_1_epoch1,HFv1 GSM8K,0.15,,hf_open_llm_v1_240829_frozen.csv mixsmol_4x400m_v0_1_epoch1,HFv1 HellaSwag,30.57,,hf_open_llm_v1_240829_frozen.csv mixsmol_4x400m_v0_1_epoch1,HFv1 MMLU,25.28,,hf_open_llm_v1_240829_frozen.csv mixsmol_4x400m_v0_1_epoch1,HFv1 TruthfulQA,39.03,,hf_open_llm_v1_240829_frozen.csv mixsmol_4x400m_v0_1_epoch1,HFv1 Winogrande,52.8,,hf_open_llm_v1_240829_frozen.csv mixsmol_4x400m_v0_1_epoch2,HF OpenLLM v1,28.92,,hf_open_llm_v1_240829_frozen.csv mixsmol_4x400m_v0_1_epoch2,HFv1 ARC,23.55,,hf_open_llm_v1_240829_frozen.csv mixsmol_4x400m_v0_1_epoch2,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv mixsmol_4x400m_v0_1_epoch2,HFv1 HellaSwag,32.6,,hf_open_llm_v1_240829_frozen.csv mixsmol_4x400m_v0_1_epoch2,HFv1 MMLU,25.26,,hf_open_llm_v1_240829_frozen.csv mixsmol_4x400m_v0_1_epoch2,HFv1 TruthfulQA,39.24,,hf_open_llm_v1_240829_frozen.csv mixsmol_4x400m_v0_1_epoch2,HFv1 Winogrande,52.64,,hf_open_llm_v1_240829_frozen.csv mixtao_7bx2_moe_instruct_v5_0,HF OpenLLM v1,75.08,,hf_open_llm_v1_240829_frozen.csv mixtao_7bx2_moe_instruct_v5_0,HFv1 ARC,73.63,,hf_open_llm_v1_240829_frozen.csv mixtao_7bx2_moe_instruct_v5_0,HFv1 GSM8K,69.45,,hf_open_llm_v1_240829_frozen.csv mixtao_7bx2_moe_instruct_v5_0,HFv1 HellaSwag,88.93,,hf_open_llm_v1_240829_frozen.csv mixtao_7bx2_moe_instruct_v5_0,HFv1 MMLU,64.65,,hf_open_llm_v1_240829_frozen.csv mixtao_7bx2_moe_instruct_v5_0,HFv1 TruthfulQA,69.83,,hf_open_llm_v1_240829_frozen.csv mixtao_7bx2_moe_instruct_v5_0,HFv1 Winogrande,83.98,,hf_open_llm_v1_240829_frozen.csv mixtao_7bx2_moe_instruct_v6_0,HF OpenLLM v1,75.94,,hf_open_llm_v1_240829_frozen.csv mixtao_7bx2_moe_instruct_v6_0,HFv1 ARC,73.38,,hf_open_llm_v1_240829_frozen.csv mixtao_7bx2_moe_instruct_v6_0,HFv1 GSM8K,68.92,,hf_open_llm_v1_240829_frozen.csv mixtao_7bx2_moe_instruct_v6_0,HFv1 HellaSwag,89.02,,hf_open_llm_v1_240829_frozen.csv mixtao_7bx2_moe_instruct_v6_0,HFv1 MMLU,64.61,,hf_open_llm_v1_240829_frozen.csv mixtao_7bx2_moe_instruct_v6_0,HFv1 TruthfulQA,70.45,,hf_open_llm_v1_240829_frozen.csv mixtao_7bx2_moe_instruct_v6_0,HFv1 Winogrande,89.27,,hf_open_llm_v1_240829_frozen.csv mixtao_7bx2_moe_instruct_v7_0,HF OpenLLM v1,76.55,,hf_open_llm_v1_240829_frozen.csv mixtao_7bx2_moe_instruct_v7_0,HFv1 ARC,74.23,,hf_open_llm_v1_240829_frozen.csv mixtao_7bx2_moe_instruct_v7_0,HFv1 GSM8K,69.14,,hf_open_llm_v1_240829_frozen.csv mixtao_7bx2_moe_instruct_v7_0,HFv1 HellaSwag,89.37,,hf_open_llm_v1_240829_frozen.csv mixtao_7bx2_moe_instruct_v7_0,HFv1 
MMLU,64.54,,hf_open_llm_v1_240829_frozen.csv mixtao_7bx2_moe_instruct_v7_0,HFv1 TruthfulQA,74.26,,hf_open_llm_v1_240829_frozen.csv mixtao_7bx2_moe_instruct_v7_0,HFv1 Winogrande,87.77,,hf_open_llm_v1_240829_frozen.csv mixtral_11bx2_moe_19b,HF OpenLLM v1,74.41,,hf_open_llm_v1_240829_frozen.csv mixtral_11bx2_moe_19b,HFv1 ARC,71.16,,hf_open_llm_v1_240829_frozen.csv mixtral_11bx2_moe_19b,HFv1 GSM8K,65.28,,hf_open_llm_v1_240829_frozen.csv mixtral_11bx2_moe_19b,HFv1 HellaSwag,88.47,,hf_open_llm_v1_240829_frozen.csv mixtral_11bx2_moe_19b,HFv1 MMLU,66.31,,hf_open_llm_v1_240829_frozen.csv mixtral_11bx2_moe_19b,HFv1 TruthfulQA,72.0,,hf_open_llm_v1_240829_frozen.csv mixtral_11bx2_moe_19b,HFv1 Winogrande,83.27,,hf_open_llm_v1_240829_frozen.csv mixtral_6x7b_instruct_v0_1,HF OpenLLM v1,52.87,,hf_open_llm_v1_240829_frozen.csv mixtral_6x7b_instruct_v0_1,HFv1 ARC,56.66,,hf_open_llm_v1_240829_frozen.csv mixtral_6x7b_instruct_v0_1,HFv1 GSM8K,6.07,,hf_open_llm_v1_240829_frozen.csv mixtral_6x7b_instruct_v0_1,HFv1 HellaSwag,78.85,,hf_open_llm_v1_240829_frozen.csv mixtral_6x7b_instruct_v0_1,HFv1 MMLU,52.88,,hf_open_llm_v1_240829_frozen.csv mixtral_6x7b_instruct_v0_1,HFv1 TruthfulQA,51.55,,hf_open_llm_v1_240829_frozen.csv mixtral_6x7b_instruct_v0_1,HFv1 Winogrande,71.19,,hf_open_llm_v1_240829_frozen.csv mixtral_7bx2_truthy,HF OpenLLM v1,74.64,,hf_open_llm_v1_240829_frozen.csv mixtral_7bx2_truthy,HFv1 ARC,72.18,,hf_open_llm_v1_240829_frozen.csv mixtral_7bx2_truthy,HFv1 GSM8K,67.25,,hf_open_llm_v1_240829_frozen.csv mixtral_7bx2_truthy,HFv1 HellaSwag,87.88,,hf_open_llm_v1_240829_frozen.csv mixtral_7bx2_truthy,HFv1 MMLU,65.2,,hf_open_llm_v1_240829_frozen.csv mixtral_7bx2_truthy,HFv1 TruthfulQA,74.68,,hf_open_llm_v1_240829_frozen.csv mixtral_7bx2_truthy,HFv1 Winogrande,80.66,,hf_open_llm_v1_240829_frozen.csv mixtral_7bx4_moe_24b,HF OpenLLM v1,68.85,,hf_open_llm_v1_240829_frozen.csv mixtral_7bx4_moe_24b,HFv1 ARC,65.36,,hf_open_llm_v1_240829_frozen.csv mixtral_7bx4_moe_24b,HFv1 GSM8K,61.71,,hf_open_llm_v1_240829_frozen.csv mixtral_7bx4_moe_24b,HFv1 HellaSwag,85.23,,hf_open_llm_v1_240829_frozen.csv mixtral_7bx4_moe_24b,HFv1 MMLU,62.96,,hf_open_llm_v1_240829_frozen.csv mixtral_7bx4_moe_24b,HFv1 TruthfulQA,59.78,,hf_open_llm_v1_240829_frozen.csv mixtral_7bx4_moe_24b,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv mixtral_7bx6_moe_35b,HF OpenLLM v1,73.32,,hf_open_llm_v1_240829_frozen.csv mixtral_7bx6_moe_35b,HFv1 ARC,70.14,,hf_open_llm_v1_240829_frozen.csv mixtral_7bx6_moe_35b,HFv1 GSM8K,71.42,,hf_open_llm_v1_240829_frozen.csv mixtral_7bx6_moe_35b,HFv1 HellaSwag,86.77,,hf_open_llm_v1_240829_frozen.csv mixtral_7bx6_moe_35b,HFv1 MMLU,64.74,,hf_open_llm_v1_240829_frozen.csv mixtral_7bx6_moe_35b,HFv1 TruthfulQA,65.79,,hf_open_llm_v1_240829_frozen.csv mixtral_7bx6_moe_35b,HFv1 Winogrande,81.06,,hf_open_llm_v1_240829_frozen.csv mixtral_8x22b_instruct_v0_1,HF OpenLLM v1,79.15,,hf_open_llm_v1_240829_frozen.csv mixtral_8x22b_instruct_v0_1,HFv1 ARC,72.7,,hf_open_llm_v1_240829_frozen.csv mixtral_8x22b_instruct_v0_1,HFv1 GSM8K,82.03,,hf_open_llm_v1_240829_frozen.csv mixtral_8x22b_instruct_v0_1,HFv1 HellaSwag,89.08,,hf_open_llm_v1_240829_frozen.csv mixtral_8x22b_instruct_v0_1,HFv1 MMLU,77.77,,hf_open_llm_v1_240829_frozen.csv mixtral_8x22b_instruct_v0_1,HFv1 TruthfulQA,68.14,,hf_open_llm_v1_240829_frozen.csv mixtral_8x22b_instruct_v0_1,HFv1 Winogrande,85.16,,hf_open_llm_v1_240829_frozen.csv mixtral_8x22b_v0_1,HF OpenLLM v1,74.47,,hf_open_llm_v1_240829_frozen.csv mixtral_8x22b_v0_1,HFv1 ARC,70.65,,hf_open_llm_v1_240829_frozen.csv 
mixtral_8x22b_v0_1,HFv1 GSM8K,74.15,,hf_open_llm_v1_240829_frozen.csv mixtral_8x22b_v0_1,HFv1 HellaSwag,88.74,,hf_open_llm_v1_240829_frozen.csv mixtral_8x22b_v0_1,HFv1 MMLU,77.81,,hf_open_llm_v1_240829_frozen.csv mixtral_8x22b_v0_1,HFv1 TruthfulQA,51.08,,hf_open_llm_v1_240829_frozen.csv mixtral_8x22b_v0_1,HFv1 Winogrande,85.0,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_dpo_v0_1,HF OpenLLM v1,70.45,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_dpo_v0_1,HFv1 ARC,70.9,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_dpo_v0_1,HFv1 GSM8K,53.75,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_dpo_v0_1,HFv1 HellaSwag,87.61,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_dpo_v0_1,HFv1 MMLU,70.66,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_dpo_v0_1,HFv1 TruthfulQA,57.38,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_dpo_v0_1,HFv1 Winogrande,82.4,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_dpo_v0_2,HF OpenLLM v1,71.32,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_dpo_v0_2,HFv1 ARC,70.39,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_dpo_v0_2,HFv1 GSM8K,57.54,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_dpo_v0_2,HFv1 HellaSwag,87.73,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_dpo_v0_2,HFv1 MMLU,71.03,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_dpo_v0_2,HFv1 TruthfulQA,58.69,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_dpo_v0_2,HFv1 Winogrande,82.56,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_instruct_v0_1,HF OpenLLM v1,72.62,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_instruct_v0_1,HFv1 ARC,70.22,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_instruct_v0_1,HFv1 GSM8K,60.73,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_instruct_v0_1,HFv1 HellaSwag,87.63,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_instruct_v0_1,HFv1 MMLU,71.16,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_instruct_v0_1,HFv1 TruthfulQA,64.58,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_instruct_v0_1,HFv1 Winogrande,81.37,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_instruct_v0_1_dpo,HF OpenLLM v1,73.44,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_instruct_v0_1_dpo,HFv1 ARC,69.8,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_instruct_v0_1_dpo,HFv1 GSM8K,61.41,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_instruct_v0_1_dpo,HFv1 HellaSwag,87.83,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_instruct_v0_1_dpo,HFv1 MMLU,71.05,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_instruct_v0_1_dpo,HFv1 TruthfulQA,69.18,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_instruct_v0_1_dpo,HFv1 Winogrande,81.37,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_peft_v0_1,HF OpenLLM v1,68.87,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_peft_v0_1,HFv1 ARC,67.24,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_peft_v0_1,HFv1 GSM8K,51.4,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_peft_v0_1,HFv1 HellaSwag,86.03,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_peft_v0_1,HFv1 MMLU,68.59,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_peft_v0_1,HFv1 TruthfulQA,59.54,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_peft_v0_1,HFv1 Winogrande,80.43,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_v0_1,HF OpenLLM v1,68.42,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_v0_1,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_v0_1,HFv1 GSM8K,57.47,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_v0_1,HFv1 HellaSwag,86.49,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_v0_1,HFv1 MMLU,71.82,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_v0_1,HFv1 TruthfulQA,46.78,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_v0_1,HFv1 
Winogrande,81.93,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_v0_1_dpo,HF OpenLLM v1,68.18,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_v0_1_dpo,HFv1 ARC,66.55,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_v0_1_dpo,HFv1 GSM8K,56.18,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_v0_1_dpo,HFv1 HellaSwag,86.4,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_v0_1_dpo,HFv1 MMLU,71.65,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_v0_1_dpo,HFv1 TruthfulQA,46.74,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_v0_1_dpo,HFv1 Winogrande,81.53,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_v0_1_sft,HF OpenLLM v1,68.18,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_v0_1_sft,HFv1 ARC,66.55,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_v0_1_sft,HFv1 GSM8K,56.18,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_v0_1_sft,HFv1 HellaSwag,86.4,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_v0_1_sft,HFv1 MMLU,71.65,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_v0_1_sft,HFv1 TruthfulQA,46.74,,hf_open_llm_v1_240829_frozen.csv mixtral_8x7b_v0_1_sft,HFv1 Winogrande,81.53,,hf_open_llm_v1_240829_frozen.csv mixtral_gqa_400m_v2,HF OpenLLM v1,28.45,,hf_open_llm_v1_240829_frozen.csv mixtral_gqa_400m_v2,HFv1 ARC,20.22,,hf_open_llm_v1_240829_frozen.csv mixtral_gqa_400m_v2,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv mixtral_gqa_400m_v2,HFv1 HellaSwag,27.78,,hf_open_llm_v1_240829_frozen.csv mixtral_gqa_400m_v2,HFv1 MMLU,26.1,,hf_open_llm_v1_240829_frozen.csv mixtral_gqa_400m_v2,HFv1 TruthfulQA,46.55,,hf_open_llm_v1_240829_frozen.csv mixtral_gqa_400m_v2,HFv1 Winogrande,49.96,,hf_open_llm_v1_240829_frozen.csv mixtral_instruct_0_1_laser,HF OpenLLM v1,72.36,,hf_open_llm_v1_240829_frozen.csv mixtral_instruct_0_1_laser,HFv1 ARC,70.48,,hf_open_llm_v1_240829_frozen.csv mixtral_instruct_0_1_laser,HFv1 GSM8K,58.68,,hf_open_llm_v1_240829_frozen.csv mixtral_instruct_0_1_laser,HFv1 HellaSwag,87.28,,hf_open_llm_v1_240829_frozen.csv mixtral_instruct_0_1_laser,HFv1 MMLU,71.07,,hf_open_llm_v1_240829_frozen.csv mixtral_instruct_0_1_laser,HFv1 TruthfulQA,65.83,,hf_open_llm_v1_240829_frozen.csv mixtral_instruct_0_1_laser,HFv1 Winogrande,80.82,,hf_open_llm_v1_240829_frozen.csv mixtral_ko_qna_merged,HF OpenLLM v1,47.24,,hf_open_llm_v1_240829_frozen.csv mixtral_ko_qna_merged,HFv1 ARC,39.51,,hf_open_llm_v1_240829_frozen.csv mixtral_ko_qna_merged,HFv1 GSM8K,27.67,,hf_open_llm_v1_240829_frozen.csv mixtral_ko_qna_merged,HFv1 HellaSwag,39.06,,hf_open_llm_v1_240829_frozen.csv mixtral_ko_qna_merged,HFv1 MMLU,71.86,,hf_open_llm_v1_240829_frozen.csv mixtral_ko_qna_merged,HFv1 TruthfulQA,48.61,,hf_open_llm_v1_240829_frozen.csv mixtral_ko_qna_merged,HFv1 Winogrande,56.75,,hf_open_llm_v1_240829_frozen.csv mixtral_megamerge_dare_8x7b_v2,HF OpenLLM v1,67.87,,hf_open_llm_v1_240829_frozen.csv mixtral_megamerge_dare_8x7b_v2,HFv1 ARC,66.47,,hf_open_llm_v1_240829_frozen.csv mixtral_megamerge_dare_8x7b_v2,HFv1 GSM8K,52.46,,hf_open_llm_v1_240829_frozen.csv mixtral_megamerge_dare_8x7b_v2,HFv1 HellaSwag,86.05,,hf_open_llm_v1_240829_frozen.csv mixtral_megamerge_dare_8x7b_v2,HFv1 MMLU,69.08,,hf_open_llm_v1_240829_frozen.csv mixtral_megamerge_dare_8x7b_v2,HFv1 TruthfulQA,53.82,,hf_open_llm_v1_240829_frozen.csv mixtral_megamerge_dare_8x7b_v2,HFv1 Winogrande,79.32,,hf_open_llm_v1_240829_frozen.csv mixtral_orca_v0_1,HF OpenLLM v1,67.82,,hf_open_llm_v1_240829_frozen.csv mixtral_orca_v0_1,HFv1 ARC,69.71,,hf_open_llm_v1_240829_frozen.csv mixtral_orca_v0_1,HFv1 GSM8K,37.3,,hf_open_llm_v1_240829_frozen.csv mixtral_orca_v0_1,HFv1 
HellaSwag,88.88,,hf_open_llm_v1_240829_frozen.csv mixtral_orca_v0_1,HFv1 MMLU,66.06,,hf_open_llm_v1_240829_frozen.csv mixtral_orca_v0_1,HFv1 TruthfulQA,63.85,,hf_open_llm_v1_240829_frozen.csv mixtral_orca_v0_1,HFv1 Winogrande,81.14,,hf_open_llm_v1_240829_frozen.csv mixtralmerge_8x7b_rebalanced_test,HF OpenLLM v1,69.61,,hf_open_llm_v1_240829_frozen.csv mixtralmerge_8x7b_rebalanced_test,HFv1 ARC,68.17,,hf_open_llm_v1_240829_frozen.csv mixtralmerge_8x7b_rebalanced_test,HFv1 GSM8K,58.23,,hf_open_llm_v1_240829_frozen.csv mixtralmerge_8x7b_rebalanced_test,HFv1 HellaSwag,85.76,,hf_open_llm_v1_240829_frozen.csv mixtralmerge_8x7b_rebalanced_test,HFv1 MMLU,70.47,,hf_open_llm_v1_240829_frozen.csv mixtralmerge_8x7b_rebalanced_test,HFv1 TruthfulQA,53.75,,hf_open_llm_v1_240829_frozen.csv mixtralmerge_8x7b_rebalanced_test,HFv1 Winogrande,81.29,,hf_open_llm_v1_240829_frozen.csv mixtralorochi8x7b,HF OpenLLM v1,64.62,,hf_open_llm_v1_240829_frozen.csv mixtralorochi8x7b,HFv1 ARC,70.31,,hf_open_llm_v1_240829_frozen.csv mixtralorochi8x7b,HFv1 GSM8K,17.29,,hf_open_llm_v1_240829_frozen.csv mixtralorochi8x7b,HFv1 HellaSwag,86.1,,hf_open_llm_v1_240829_frozen.csv mixtralorochi8x7b,HFv1 MMLU,70.13,,hf_open_llm_v1_240829_frozen.csv mixtralorochi8x7b,HFv1 TruthfulQA,63.99,,hf_open_llm_v1_240829_frozen.csv mixtralorochi8x7b,HFv1 Winogrande,79.87,,hf_open_llm_v1_240829_frozen.csv mixtralrpchat_zloss,HF OpenLLM v1,68.59,,hf_open_llm_v1_240829_frozen.csv mixtralrpchat_zloss,HFv1 ARC,68.6,,hf_open_llm_v1_240829_frozen.csv mixtralrpchat_zloss,HFv1 GSM8K,50.57,,hf_open_llm_v1_240829_frozen.csv mixtralrpchat_zloss,HFv1 HellaSwag,86.1,,hf_open_llm_v1_240829_frozen.csv mixtralrpchat_zloss,HFv1 MMLU,70.44,,hf_open_llm_v1_240829_frozen.csv mixtralrpchat_zloss,HFv1 TruthfulQA,53.85,,hf_open_llm_v1_240829_frozen.csv mixtralrpchat_zloss,HFv1 Winogrande,82.0,,hf_open_llm_v1_240829_frozen.csv mm4_3b,HF OpenLLM v1,53.22,,hf_open_llm_v1_240829_frozen.csv mm4_3b,HFv1 ARC,44.8,,hf_open_llm_v1_240829_frozen.csv mm4_3b,HFv1 GSM8K,43.82,,hf_open_llm_v1_240829_frozen.csv mm4_3b,HFv1 HellaSwag,70.41,,hf_open_llm_v1_240829_frozen.csv mm4_3b,HFv1 MMLU,50.9,,hf_open_llm_v1_240829_frozen.csv mm4_3b,HFv1 TruthfulQA,43.2,,hf_open_llm_v1_240829_frozen.csv mm4_3b,HFv1 Winogrande,66.22,,hf_open_llm_v1_240829_frozen.csv mm_ov_bagel_dpo_34b_c1000_250,HF OpenLLM v1,74.47,,hf_open_llm_v1_240829_frozen.csv mm_ov_bagel_dpo_34b_c1000_250,HFv1 ARC,68.17,,hf_open_llm_v1_240829_frozen.csv mm_ov_bagel_dpo_34b_c1000_250,HFv1 GSM8K,72.25,,hf_open_llm_v1_240829_frozen.csv mm_ov_bagel_dpo_34b_c1000_250,HFv1 HellaSwag,83.97,,hf_open_llm_v1_240829_frozen.csv mm_ov_bagel_dpo_34b_c1000_250,HFv1 MMLU,76.33,,hf_open_llm_v1_240829_frozen.csv mm_ov_bagel_dpo_34b_c1000_250,HFv1 TruthfulQA,63.67,,hf_open_llm_v1_240829_frozen.csv mm_ov_bagel_dpo_34b_c1000_250,HFv1 Winogrande,82.4,,hf_open_llm_v1_240829_frozen.csv mnsim_dpo_peftmerged_2_eos,HF OpenLLM v1,54.04,,hf_open_llm_v1_240829_frozen.csv mnsim_dpo_peftmerged_2_eos,HFv1 ARC,55.63,,hf_open_llm_v1_240829_frozen.csv mnsim_dpo_peftmerged_2_eos,HFv1 GSM8K,16.91,,hf_open_llm_v1_240829_frozen.csv mnsim_dpo_peftmerged_2_eos,HFv1 HellaSwag,77.82,,hf_open_llm_v1_240829_frozen.csv mnsim_dpo_peftmerged_2_eos,HFv1 MMLU,51.25,,hf_open_llm_v1_240829_frozen.csv mnsim_dpo_peftmerged_2_eos,HFv1 TruthfulQA,46.37,,hf_open_llm_v1_240829_frozen.csv mnsim_dpo_peftmerged_2_eos,HFv1 Winogrande,76.24,,hf_open_llm_v1_240829_frozen.csv model_a_48_5m,HF OpenLLM v1,28.98,,hf_open_llm_v1_240829_frozen.csv model_a_48_5m,HFv1 
ARC,22.18,,hf_open_llm_v1_240829_frozen.csv model_a_48_5m,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv model_a_48_5m,HFv1 HellaSwag,27.85,,hf_open_llm_v1_240829_frozen.csv model_a_48_5m,HFv1 MMLU,25.08,,hf_open_llm_v1_240829_frozen.csv model_a_48_5m,HFv1 TruthfulQA,46.75,,hf_open_llm_v1_240829_frozen.csv model_a_48_5m,HFv1 Winogrande,51.7,,hf_open_llm_v1_240829_frozen.csv momo_70b_lora_1_8_5_dpo,HF OpenLLM v1,76.14,,hf_open_llm_v1_240829_frozen.csv momo_70b_lora_1_8_5_dpo,HFv1 ARC,69.54,,hf_open_llm_v1_240829_frozen.csv momo_70b_lora_1_8_5_dpo,HFv1 GSM8K,74.3,,hf_open_llm_v1_240829_frozen.csv momo_70b_lora_1_8_5_dpo,HFv1 HellaSwag,85.6,,hf_open_llm_v1_240829_frozen.csv momo_70b_lora_1_8_5_dpo,HFv1 MMLU,77.49,,hf_open_llm_v1_240829_frozen.csv momo_70b_lora_1_8_5_dpo,HFv1 TruthfulQA,65.79,,hf_open_llm_v1_240829_frozen.csv momo_70b_lora_1_8_5_dpo,HFv1 Winogrande,84.14,,hf_open_llm_v1_240829_frozen.csv momo_70b_lora_v1_1,HF OpenLLM v1,67.53,,hf_open_llm_v1_240829_frozen.csv momo_70b_lora_v1_1,HFv1 ARC,66.64,,hf_open_llm_v1_240829_frozen.csv momo_70b_lora_v1_1,HFv1 GSM8K,46.32,,hf_open_llm_v1_240829_frozen.csv momo_70b_lora_v1_1,HFv1 HellaSwag,87.16,,hf_open_llm_v1_240829_frozen.csv momo_70b_lora_v1_1,HFv1 MMLU,66.76,,hf_open_llm_v1_240829_frozen.csv momo_70b_lora_v1_1,HFv1 TruthfulQA,54.98,,hf_open_llm_v1_240829_frozen.csv momo_70b_lora_v1_1,HFv1 Winogrande,83.35,,hf_open_llm_v1_240829_frozen.csv momo_70b_lora_v1_2_1,HF OpenLLM v1,71.36,,hf_open_llm_v1_240829_frozen.csv momo_70b_lora_v1_2_1,HFv1 ARC,70.65,,hf_open_llm_v1_240829_frozen.csv momo_70b_lora_v1_2_1,HFv1 GSM8K,56.63,,hf_open_llm_v1_240829_frozen.csv momo_70b_lora_v1_2_1,HFv1 HellaSwag,86.4,,hf_open_llm_v1_240829_frozen.csv momo_70b_lora_v1_2_1,HFv1 MMLU,69.9,,hf_open_llm_v1_240829_frozen.csv momo_70b_lora_v1_2_1,HFv1 TruthfulQA,61.41,,hf_open_llm_v1_240829_frozen.csv momo_70b_lora_v1_2_1,HFv1 Winogrande,83.19,,hf_open_llm_v1_240829_frozen.csv momo_72b_lora_1_8_4_dpo,HF OpenLLM v1,76.23,,hf_open_llm_v1_240829_frozen.csv momo_72b_lora_1_8_4_dpo,HFv1 ARC,69.62,,hf_open_llm_v1_240829_frozen.csv momo_72b_lora_1_8_4_dpo,HFv1 GSM8K,76.27,,hf_open_llm_v1_240829_frozen.csv momo_72b_lora_1_8_4_dpo,HFv1 HellaSwag,85.35,,hf_open_llm_v1_240829_frozen.csv momo_72b_lora_1_8_4_dpo,HFv1 MMLU,77.33,,hf_open_llm_v1_240829_frozen.csv momo_72b_lora_1_8_4_dpo,HFv1 TruthfulQA,64.64,,hf_open_llm_v1_240829_frozen.csv momo_72b_lora_1_8_4_dpo,HFv1 Winogrande,84.14,,hf_open_llm_v1_240829_frozen.csv momo_72b_lora_1_8_6_dpo,HF OpenLLM v1,77.29,,hf_open_llm_v1_240829_frozen.csv momo_72b_lora_1_8_6_dpo,HFv1 ARC,70.14,,hf_open_llm_v1_240829_frozen.csv momo_72b_lora_1_8_6_dpo,HFv1 GSM8K,76.8,,hf_open_llm_v1_240829_frozen.csv momo_72b_lora_1_8_6_dpo,HFv1 HellaSwag,86.03,,hf_open_llm_v1_240829_frozen.csv momo_72b_lora_1_8_6_dpo,HFv1 MMLU,77.4,,hf_open_llm_v1_240829_frozen.csv momo_72b_lora_1_8_6_dpo,HFv1 TruthfulQA,69.0,,hf_open_llm_v1_240829_frozen.csv momo_72b_lora_1_8_6_dpo,HFv1 Winogrande,84.37,,hf_open_llm_v1_240829_frozen.csv momo_72b_lora_1_8_7_dpo,HF OpenLLM v1,78.55,,hf_open_llm_v1_240829_frozen.csv momo_72b_lora_1_8_7_dpo,HFv1 ARC,70.82,,hf_open_llm_v1_240829_frozen.csv momo_72b_lora_1_8_7_dpo,HFv1 GSM8K,78.62,,hf_open_llm_v1_240829_frozen.csv momo_72b_lora_1_8_7_dpo,HFv1 HellaSwag,85.96,,hf_open_llm_v1_240829_frozen.csv momo_72b_lora_1_8_7_dpo,HFv1 MMLU,77.13,,hf_open_llm_v1_240829_frozen.csv momo_72b_lora_1_8_7_dpo,HFv1 TruthfulQA,74.71,,hf_open_llm_v1_240829_frozen.csv momo_72b_lora_1_8_7_dpo,HFv1 Winogrande,84.06,,hf_open_llm_v1_240829_frozen.csv 
momomerge_72b_v0_1,HF OpenLLM v1,28.69,,hf_open_llm_v1_240829_frozen.csv momomerge_72b_v0_1,HFv1 ARC,26.28,,hf_open_llm_v1_240829_frozen.csv momomerge_72b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv momomerge_72b_v0_1,HFv1 HellaSwag,25.27,,hf_open_llm_v1_240829_frozen.csv momomerge_72b_v0_1,HFv1 MMLU,23.08,,hf_open_llm_v1_240829_frozen.csv momomerge_72b_v0_1,HFv1 TruthfulQA,48.73,,hf_open_llm_v1_240829_frozen.csv momomerge_72b_v0_1,HFv1 Winogrande,48.78,,hf_open_llm_v1_240829_frozen.csv mpt_125m_c4,HF OpenLLM v1,28.84,,hf_open_llm_v1_240829_frozen.csv mpt_125m_c4,HFv1 ARC,22.18,,hf_open_llm_v1_240829_frozen.csv mpt_125m_c4,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv mpt_125m_c4,HFv1 HellaSwag,26.41,,hf_open_llm_v1_240829_frozen.csv mpt_125m_c4,HFv1 MMLU,24.68,,hf_open_llm_v1_240829_frozen.csv mpt_125m_c4,HFv1 TruthfulQA,49.08,,hf_open_llm_v1_240829_frozen.csv mpt_125m_c4,HFv1 Winogrande,50.67,,hf_open_llm_v1_240829_frozen.csv mpt_30b,HF OpenLLM v1,52.77,,hf_open_llm_v1_240829_frozen.csv mpt_30b,HFv1 ARC,55.97,,hf_open_llm_v1_240829_frozen.csv mpt_30b,HFv1 GSM8K,16.91,,hf_open_llm_v1_240829_frozen.csv mpt_30b,HFv1 HellaSwag,82.42,,hf_open_llm_v1_240829_frozen.csv mpt_30b,HFv1 MMLU,48.0,,hf_open_llm_v1_240829_frozen.csv mpt_30b,HFv1 TruthfulQA,38.42,,hf_open_llm_v1_240829_frozen.csv mpt_30b,HFv1 Winogrande,74.9,,hf_open_llm_v1_240829_frozen.csv mpt_7b,HF OpenLLM v1,44.28,,hf_open_llm_v1_240829_frozen.csv mpt_7b,HFv1 ARC,47.7,,hf_open_llm_v1_240829_frozen.csv mpt_7b,HFv1 GSM8K,4.02,,hf_open_llm_v1_240829_frozen.csv mpt_7b,HFv1 HellaSwag,77.57,,hf_open_llm_v1_240829_frozen.csv mpt_7b,HFv1 MMLU,30.8,,hf_open_llm_v1_240829_frozen.csv mpt_7b,HFv1 TruthfulQA,33.44,,hf_open_llm_v1_240829_frozen.csv mpt_7b,HFv1 Winogrande,72.14,,hf_open_llm_v1_240829_frozen.csv mpt_7b_8k,HF OpenLLM v1,47.24,,hf_open_llm_v1_240829_frozen.csv mpt_7b_8k,HFv1 ARC,47.35,,hf_open_llm_v1_240829_frozen.csv mpt_7b_8k,HFv1 GSM8K,8.34,,hf_open_llm_v1_240829_frozen.csv mpt_7b_8k,HFv1 HellaSwag,77.4,,hf_open_llm_v1_240829_frozen.csv mpt_7b_8k,HFv1 MMLU,42.58,,hf_open_llm_v1_240829_frozen.csv mpt_7b_8k,HFv1 TruthfulQA,36.65,,hf_open_llm_v1_240829_frozen.csv mpt_7b_8k,HFv1 Winogrande,71.11,,hf_open_llm_v1_240829_frozen.csv mpt_7b_8k_instruct,HF OpenLLM v1,47.18,,hf_open_llm_v1_240829_frozen.csv mpt_7b_8k_instruct,HFv1 ARC,45.48,,hf_open_llm_v1_240829_frozen.csv mpt_7b_8k_instruct,HFv1 GSM8K,20.55,,hf_open_llm_v1_240829_frozen.csv mpt_7b_8k_instruct,HFv1 HellaSwag,74.41,,hf_open_llm_v1_240829_frozen.csv mpt_7b_8k_instruct,HFv1 MMLU,42.11,,hf_open_llm_v1_240829_frozen.csv mpt_7b_8k_instruct,HFv1 TruthfulQA,35.06,,hf_open_llm_v1_240829_frozen.csv mpt_7b_8k_instruct,HFv1 Winogrande,65.51,,hf_open_llm_v1_240829_frozen.csv mptk_1b,HF OpenLLM v1,29.7,,hf_open_llm_v1_240829_frozen.csv mptk_1b,HFv1 ARC,24.06,,hf_open_llm_v1_240829_frozen.csv mptk_1b,HFv1 GSM8K,0.83,,hf_open_llm_v1_240829_frozen.csv mptk_1b,HFv1 HellaSwag,35.61,,hf_open_llm_v1_240829_frozen.csv mptk_1b,HFv1 MMLU,26.95,,hf_open_llm_v1_240829_frozen.csv mptk_1b,HFv1 TruthfulQA,39.71,,hf_open_llm_v1_240829_frozen.csv mptk_1b,HFv1 Winogrande,51.07,,hf_open_llm_v1_240829_frozen.csv mt7bi_wizard_3_alpha_dpo,HF OpenLLM v1,38.88,,hf_open_llm_v1_240829_frozen.csv mt7bi_wizard_3_alpha_dpo,HFv1 ARC,41.21,,hf_open_llm_v1_240829_frozen.csv mt7bi_wizard_3_alpha_dpo,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv mt7bi_wizard_3_alpha_dpo,HFv1 HellaSwag,59.34,,hf_open_llm_v1_240829_frozen.csv mt7bi_wizard_3_alpha_dpo,HFv1 MMLU,27.31,,hf_open_llm_v1_240829_frozen.csv 
mt7bi_wizard_3_alpha_dpo,HFv1 TruthfulQA,39.06,,hf_open_llm_v1_240829_frozen.csv mt7bi_wizard_3_alpha_dpo,HFv1 Winogrande,65.35,,hf_open_llm_v1_240829_frozen.csv multiverse_70b,HF OpenLLM v1,81.0,,hf_open_llm_v1_240829_frozen.csv multiverse_70b,HFv1 ARC,78.67,,hf_open_llm_v1_240829_frozen.csv multiverse_70b,HFv1 GSM8K,76.65,,hf_open_llm_v1_240829_frozen.csv multiverse_70b,HFv1 HellaSwag,89.77,,hf_open_llm_v1_240829_frozen.csv multiverse_70b,HFv1 MMLU,78.22,,hf_open_llm_v1_240829_frozen.csv multiverse_70b,HFv1 TruthfulQA,75.18,,hf_open_llm_v1_240829_frozen.csv multiverse_70b,HFv1 Winogrande,87.53,,hf_open_llm_v1_240829_frozen.csv multiverse_laser,HF OpenLLM v1,76.33,,hf_open_llm_v1_240829_frozen.csv multiverse_laser,HFv1 ARC,72.53,,hf_open_llm_v1_240829_frozen.csv multiverse_laser,HFv1 GSM8K,69.52,,hf_open_llm_v1_240829_frozen.csv multiverse_laser,HFv1 HellaSwag,88.81,,hf_open_llm_v1_240829_frozen.csv multiverse_laser,HFv1 MMLU,64.52,,hf_open_llm_v1_240829_frozen.csv multiverse_laser,HFv1 TruthfulQA,77.7,,hf_open_llm_v1_240829_frozen.csv multiverse_laser,HFv1 Winogrande,84.93,,hf_open_llm_v1_240829_frozen.csv musingcaterpillar,HF OpenLLM v1,73.33,,hf_open_llm_v1_240829_frozen.csv musingcaterpillar,HFv1 ARC,72.53,,hf_open_llm_v1_240829_frozen.csv musingcaterpillar,HFv1 GSM8K,62.24,,hf_open_llm_v1_240829_frozen.csv musingcaterpillar,HFv1 HellaSwag,88.34,,hf_open_llm_v1_240829_frozen.csv musingcaterpillar,HFv1 MMLU,65.26,,hf_open_llm_v1_240829_frozen.csv musingcaterpillar,HFv1 TruthfulQA,70.93,,hf_open_llm_v1_240829_frozen.csv musingcaterpillar,HFv1 Winogrande,80.66,,hf_open_llm_v1_240829_frozen.csv mythicaldestroyerv2_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,54.01,,hf_open_llm_v1_240829_frozen.csv mythicaldestroyerv2_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,57.34,,hf_open_llm_v1_240829_frozen.csv mythicaldestroyerv2_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv mythicaldestroyerv2_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,81.24,,hf_open_llm_v1_240829_frozen.csv mythicaldestroyerv2_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,55.64,,hf_open_llm_v1_240829_frozen.csv mythicaldestroyerv2_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,55.98,,hf_open_llm_v1_240829_frozen.csv mythicaldestroyerv2_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,73.88,,hf_open_llm_v1_240829_frozen.csv mythomix_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,54.74,,hf_open_llm_v1_240829_frozen.csv mythomix_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,60.32,,hf_open_llm_v1_240829_frozen.csv mythomix_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv mythomix_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,83.72,,hf_open_llm_v1_240829_frozen.csv mythomix_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,55.74,,hf_open_llm_v1_240829_frozen.csv mythomix_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,52.18,,hf_open_llm_v1_240829_frozen.csv mythomix_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,75.53,,hf_open_llm_v1_240829_frozen.csv nanbeige2_16b_chat,HF OpenLLM v1,67.73,,hf_open_llm_v1_240829_frozen.csv nanbeige2_16b_chat,HFv1 ARC,63.4,,hf_open_llm_v1_240829_frozen.csv nanbeige2_16b_chat,HFv1 GSM8K,63.91,,hf_open_llm_v1_240829_frozen.csv nanbeige2_16b_chat,HFv1 HellaSwag,80.7,,hf_open_llm_v1_240829_frozen.csv nanbeige2_16b_chat,HFv1 MMLU,66.63,,hf_open_llm_v1_240829_frozen.csv nanbeige2_16b_chat,HFv1 TruthfulQA,55.04,,hf_open_llm_v1_240829_frozen.csv nanbeige2_16b_chat,HFv1 Winogrande,76.72,,hf_open_llm_v1_240829_frozen.csv nanbeige_16b_base_llama,HF OpenLLM 
v1,60.7,,hf_open_llm_v1_240829_frozen.csv nanbeige_16b_base_llama,HFv1 ARC,56.48,,hf_open_llm_v1_240829_frozen.csv nanbeige_16b_base_llama,HFv1 GSM8K,47.01,,hf_open_llm_v1_240829_frozen.csv nanbeige_16b_base_llama,HFv1 HellaSwag,78.97,,hf_open_llm_v1_240829_frozen.csv nanbeige_16b_base_llama,HFv1 MMLU,63.34,,hf_open_llm_v1_240829_frozen.csv nanbeige_16b_base_llama,HFv1 TruthfulQA,42.6,,hf_open_llm_v1_240829_frozen.csv nanbeige_16b_base_llama,HFv1 Winogrande,75.77,,hf_open_llm_v1_240829_frozen.csv nano_mistral,HF OpenLLM v1,29.2,,hf_open_llm_v1_240829_frozen.csv nano_mistral,HFv1 ARC,21.67,,hf_open_llm_v1_240829_frozen.csv nano_mistral,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv nano_mistral,HFv1 HellaSwag,28.52,,hf_open_llm_v1_240829_frozen.csv nano_mistral,HFv1 MMLU,25.16,,hf_open_llm_v1_240829_frozen.csv nano_mistral,HFv1 TruthfulQA,47.42,,hf_open_llm_v1_240829_frozen.csv nano_mistral,HFv1 Winogrande,52.41,,hf_open_llm_v1_240829_frozen.csv nano_phi_115m_v0_1,HF OpenLLM v1,28.66,,hf_open_llm_v1_240829_frozen.csv nano_phi_115m_v0_1,HFv1 ARC,21.93,,hf_open_llm_v1_240829_frozen.csv nano_phi_115m_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv nano_phi_115m_v0_1,HFv1 HellaSwag,27.86,,hf_open_llm_v1_240829_frozen.csv nano_phi_115m_v0_1,HFv1 MMLU,25.34,,hf_open_llm_v1_240829_frozen.csv nano_phi_115m_v0_1,HFv1 TruthfulQA,46.0,,hf_open_llm_v1_240829_frozen.csv nano_phi_115m_v0_1,HFv1 Winogrande,50.83,,hf_open_llm_v1_240829_frozen.csv nanobot_v1,HF OpenLLM v1,61.69,,hf_open_llm_v1_240829_frozen.csv nanobot_v1,HFv1 ARC,58.96,,hf_open_llm_v1_240829_frozen.csv nanobot_v1,HFv1 GSM8K,55.04,,hf_open_llm_v1_240829_frozen.csv nanobot_v1,HFv1 HellaSwag,74.62,,hf_open_llm_v1_240829_frozen.csv nanobot_v1,HFv1 MMLU,57.68,,hf_open_llm_v1_240829_frozen.csv nanobot_v1,HFv1 TruthfulQA,47.89,,hf_open_llm_v1_240829_frozen.csv nanobot_v1,HFv1 Winogrande,75.93,,hf_open_llm_v1_240829_frozen.csv nanofialka_v1,HF OpenLLM v1,28.48,,hf_open_llm_v1_240829_frozen.csv nanofialka_v1,HFv1 ARC,22.01,,hf_open_llm_v1_240829_frozen.csv nanofialka_v1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv nanofialka_v1,HFv1 HellaSwag,28.12,,hf_open_llm_v1_240829_frozen.csv nanofialka_v1,HFv1 MMLU,25.03,,hf_open_llm_v1_240829_frozen.csv nanofialka_v1,HFv1 TruthfulQA,45.26,,hf_open_llm_v1_240829_frozen.csv nanofialka_v1,HFv1 Winogrande,50.43,,hf_open_llm_v1_240829_frozen.csv nanollama_gqa_l10_a32_kv8_v13_ki,HF OpenLLM v1,29.23,,hf_open_llm_v1_240829_frozen.csv nanollama_gqa_l10_a32_kv8_v13_ki,HFv1 ARC,23.81,,hf_open_llm_v1_240829_frozen.csv nanollama_gqa_l10_a32_kv8_v13_ki,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv nanollama_gqa_l10_a32_kv8_v13_ki,HFv1 HellaSwag,29.39,,hf_open_llm_v1_240829_frozen.csv nanollama_gqa_l10_a32_kv8_v13_ki,HFv1 MMLU,25.37,,hf_open_llm_v1_240829_frozen.csv nanollama_gqa_l10_a32_kv8_v13_ki,HFv1 TruthfulQA,44.77,,hf_open_llm_v1_240829_frozen.csv nanollama_gqa_l10_a32_kv8_v13_ki,HFv1 Winogrande,51.14,,hf_open_llm_v1_240829_frozen.csv nebula_7b,HF OpenLLM v1,56.1,,hf_open_llm_v1_240829_frozen.csv nebula_7b,HFv1 ARC,59.3,,hf_open_llm_v1_240829_frozen.csv nebula_7b,HFv1 GSM8K,14.86,,hf_open_llm_v1_240829_frozen.csv nebula_7b,HFv1 HellaSwag,83.46,,hf_open_llm_v1_240829_frozen.csv nebula_7b,HFv1 MMLU,57.0,,hf_open_llm_v1_240829_frozen.csv nebula_7b,HFv1 TruthfulQA,45.56,,hf_open_llm_v1_240829_frozen.csv nebula_7b,HFv1 Winogrande,76.4,,hf_open_llm_v1_240829_frozen.csv nebula_v2_7b,HF OpenLLM v1,58.82,,hf_open_llm_v1_240829_frozen.csv nebula_v2_7b,HFv1 ARC,58.7,,hf_open_llm_v1_240829_frozen.csv 
nebula_v2_7b,HFv1 GSM8K,31.69,,hf_open_llm_v1_240829_frozen.csv nebula_v2_7b,HFv1 HellaSwag,83.06,,hf_open_llm_v1_240829_frozen.csv nebula_v2_7b,HFv1 MMLU,57.61,,hf_open_llm_v1_240829_frozen.csv nebula_v2_7b,HFv1 TruthfulQA,46.72,,hf_open_llm_v1_240829_frozen.csv nebula_v2_7b,HFv1 Winogrande,75.14,,hf_open_llm_v1_240829_frozen.csv neu_sai_it1,HF OpenLLM v1,55.78,,hf_open_llm_v1_240829_frozen.csv neu_sai_it1,HFv1 ARC,61.26,,hf_open_llm_v1_240829_frozen.csv neu_sai_it1,HFv1 GSM8K,2.88,,hf_open_llm_v1_240829_frozen.csv neu_sai_it1,HFv1 HellaSwag,81.39,,hf_open_llm_v1_240829_frozen.csv neu_sai_it1,HFv1 MMLU,60.17,,hf_open_llm_v1_240829_frozen.csv neu_sai_it1,HFv1 TruthfulQA,51.49,,hf_open_llm_v1_240829_frozen.csv neu_sai_it1,HFv1 Winogrande,77.51,,hf_open_llm_v1_240829_frozen.csv neural_chat_mini_v2_2_1_8b,HF OpenLLM v1,42.5,,hf_open_llm_v1_240829_frozen.csv neural_chat_mini_v2_2_1_8b,HFv1 ARC,35.15,,hf_open_llm_v1_240829_frozen.csv neural_chat_mini_v2_2_1_8b,HFv1 GSM8K,18.65,,hf_open_llm_v1_240829_frozen.csv neural_chat_mini_v2_2_1_8b,HFv1 HellaSwag,60.06,,hf_open_llm_v1_240829_frozen.csv neural_chat_mini_v2_2_1_8b,HFv1 MMLU,42.99,,hf_open_llm_v1_240829_frozen.csv neural_chat_mini_v2_2_1_8b,HFv1 TruthfulQA,37.91,,hf_open_llm_v1_240829_frozen.csv neural_chat_mini_v2_2_1_8b,HFv1 Winogrande,60.22,,hf_open_llm_v1_240829_frozen.csv neural_mistral_7b,HF OpenLLM v1,65.69,,hf_open_llm_v1_240829_frozen.csv neural_mistral_7b,HFv1 ARC,63.4,,hf_open_llm_v1_240829_frozen.csv neural_mistral_7b,HFv1 GSM8K,37.53,,hf_open_llm_v1_240829_frozen.csv neural_mistral_7b,HFv1 HellaSwag,85.59,,hf_open_llm_v1_240829_frozen.csv neural_mistral_7b,HFv1 MMLU,60.92,,hf_open_llm_v1_240829_frozen.csv neural_mistral_7b,HFv1 TruthfulQA,69.26,,hf_open_llm_v1_240829_frozen.csv neural_mistral_7b,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv neural_phi2,HF OpenLLM v1,50.29,,hf_open_llm_v1_240829_frozen.csv neural_phi2,HFv1 ARC,57.68,,hf_open_llm_v1_240829_frozen.csv neural_phi2,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv neural_phi2,HFv1 HellaSwag,71.72,,hf_open_llm_v1_240829_frozen.csv neural_phi2,HFv1 MMLU,53.65,,hf_open_llm_v1_240829_frozen.csv neural_phi2,HFv1 TruthfulQA,45.36,,hf_open_llm_v1_240829_frozen.csv neural_phi2,HFv1 Winogrande,73.32,,hf_open_llm_v1_240829_frozen.csv neuralbeagle14_7b,HF OpenLLM v1,74.74,,hf_open_llm_v1_240829_frozen.csv neuralbeagle14_7b,HFv1 ARC,72.95,,hf_open_llm_v1_240829_frozen.csv neuralbeagle14_7b,HFv1 GSM8K,70.28,,hf_open_llm_v1_240829_frozen.csv neuralbeagle14_7b,HFv1 HellaSwag,88.34,,hf_open_llm_v1_240829_frozen.csv neuralbeagle14_7b,HFv1 MMLU,64.55,,hf_open_llm_v1_240829_frozen.csv neuralbeagle14_7b,HFv1 TruthfulQA,69.93,,hf_open_llm_v1_240829_frozen.csv neuralbeagle14_7b,HFv1 Winogrande,82.4,,hf_open_llm_v1_240829_frozen.csv neuralbeagle_11b,HF OpenLLM v1,72.95,,hf_open_llm_v1_240829_frozen.csv neuralbeagle_11b,HFv1 ARC,73.29,,hf_open_llm_v1_240829_frozen.csv neuralbeagle_11b,HFv1 GSM8K,58.98,,hf_open_llm_v1_240829_frozen.csv neuralbeagle_11b,HFv1 HellaSwag,87.61,,hf_open_llm_v1_240829_frozen.csv neuralbeagle_11b,HFv1 MMLU,63.8,,hf_open_llm_v1_240829_frozen.csv neuralbeagle_11b,HFv1 TruthfulQA,71.36,,hf_open_llm_v1_240829_frozen.csv neuralbeagle_11b,HFv1 Winogrande,82.64,,hf_open_llm_v1_240829_frozen.csv neuralbeagle_11b_truthy,HF OpenLLM v1,72.06,,hf_open_llm_v1_240829_frozen.csv neuralbeagle_11b_truthy,HFv1 ARC,73.63,,hf_open_llm_v1_240829_frozen.csv neuralbeagle_11b_truthy,HFv1 GSM8K,49.73,,hf_open_llm_v1_240829_frozen.csv neuralbeagle_11b_truthy,HFv1 
HellaSwag,87.86,,hf_open_llm_v1_240829_frozen.csv neuralbeagle_11b_truthy,HFv1 MMLU,63.11,,hf_open_llm_v1_240829_frozen.csv neuralbeagle_11b_truthy,HFv1 TruthfulQA,75.92,,hf_open_llm_v1_240829_frozen.csv neuralbeagle_11b_truthy,HFv1 Winogrande,82.08,,hf_open_llm_v1_240829_frozen.csv neuraldaredevil_7b,HF OpenLLM v1,74.12,,hf_open_llm_v1_240829_frozen.csv neuraldaredevil_7b,HFv1 ARC,69.88,,hf_open_llm_v1_240829_frozen.csv neuraldaredevil_7b,HFv1 GSM8K,73.16,,hf_open_llm_v1_240829_frozen.csv neuraldaredevil_7b,HFv1 HellaSwag,87.62,,hf_open_llm_v1_240829_frozen.csv neuraldaredevil_7b,HFv1 MMLU,65.12,,hf_open_llm_v1_240829_frozen.csv neuraldaredevil_7b,HFv1 TruthfulQA,66.85,,hf_open_llm_v1_240829_frozen.csv neuraldaredevil_7b,HFv1 Winogrande,82.08,,hf_open_llm_v1_240829_frozen.csv neuraldaredmistralpro_7b_slerp,HF OpenLLM v1,72.04,,hf_open_llm_v1_240829_frozen.csv neuraldaredmistralpro_7b_slerp,HFv1 ARC,69.03,,hf_open_llm_v1_240829_frozen.csv neuraldaredmistralpro_7b_slerp,HFv1 GSM8K,68.69,,hf_open_llm_v1_240829_frozen.csv neuraldaredmistralpro_7b_slerp,HFv1 HellaSwag,86.74,,hf_open_llm_v1_240829_frozen.csv neuraldaredmistralpro_7b_slerp,HFv1 MMLU,63.46,,hf_open_llm_v1_240829_frozen.csv neuraldaredmistralpro_7b_slerp,HFv1 TruthfulQA,64.12,,hf_open_llm_v1_240829_frozen.csv neuraldaredmistralpro_7b_slerp,HFv1 Winogrande,80.19,,hf_open_llm_v1_240829_frozen.csv neuraldarewin_7b,HF OpenLLM v1,71.79,,hf_open_llm_v1_240829_frozen.csv neuraldarewin_7b,HFv1 ARC,70.14,,hf_open_llm_v1_240829_frozen.csv neuraldarewin_7b,HFv1 GSM8K,66.72,,hf_open_llm_v1_240829_frozen.csv neuraldarewin_7b,HFv1 HellaSwag,86.4,,hf_open_llm_v1_240829_frozen.csv neuraldarewin_7b,HFv1 MMLU,64.85,,hf_open_llm_v1_240829_frozen.csv neuraldarewin_7b,HFv1 TruthfulQA,62.92,,hf_open_llm_v1_240829_frozen.csv neuraldarewin_7b,HFv1 Winogrande,79.72,,hf_open_llm_v1_240829_frozen.csv neuralfusion_7b_dare_ties,HF OpenLLM v1,75.94,,hf_open_llm_v1_240829_frozen.csv neuralfusion_7b_dare_ties,HFv1 ARC,73.21,,hf_open_llm_v1_240829_frozen.csv neuralfusion_7b_dare_ties,HFv1 GSM8K,69.83,,hf_open_llm_v1_240829_frozen.csv neuralfusion_7b_dare_ties,HFv1 HellaSwag,88.96,,hf_open_llm_v1_240829_frozen.csv neuralfusion_7b_dare_ties,HFv1 MMLU,64.77,,hf_open_llm_v1_240829_frozen.csv neuralfusion_7b_dare_ties,HFv1 TruthfulQA,73.32,,hf_open_llm_v1_240829_frozen.csv neuralfusion_7b_dare_ties,HFv1 Winogrande,85.56,,hf_open_llm_v1_240829_frozen.csv neuralhermes_2_5_mistral_7b,HF OpenLLM v1,66.91,,hf_open_llm_v1_240829_frozen.csv neuralhermes_2_5_mistral_7b,HFv1 ARC,68.26,,hf_open_llm_v1_240829_frozen.csv neuralhermes_2_5_mistral_7b,HFv1 GSM8K,55.95,,hf_open_llm_v1_240829_frozen.csv neuralhermes_2_5_mistral_7b,HFv1 HellaSwag,85.69,,hf_open_llm_v1_240829_frozen.csv neuralhermes_2_5_mistral_7b,HFv1 MMLU,63.81,,hf_open_llm_v1_240829_frozen.csv neuralhermes_2_5_mistral_7b,HFv1 TruthfulQA,55.98,,hf_open_llm_v1_240829_frozen.csv neuralhermes_2_5_mistral_7b,HFv1 Winogrande,78.37,,hf_open_llm_v1_240829_frozen.csv neuralhyperion_2_0_mistral_7b,HF OpenLLM v1,61.27,,hf_open_llm_v1_240829_frozen.csv neuralhyperion_2_0_mistral_7b,HFv1 ARC,57.76,,hf_open_llm_v1_240829_frozen.csv neuralhyperion_2_0_mistral_7b,HFv1 GSM8K,41.17,,hf_open_llm_v1_240829_frozen.csv neuralhyperion_2_0_mistral_7b,HFv1 HellaSwag,82.29,,hf_open_llm_v1_240829_frozen.csv neuralhyperion_2_0_mistral_7b,HFv1 MMLU,61.9,,hf_open_llm_v1_240829_frozen.csv neuralhyperion_2_0_mistral_7b,HFv1 TruthfulQA,45.5,,hf_open_llm_v1_240829_frozen.csv neuralhyperion_2_0_mistral_7b,HFv1 
Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv neuralhyperion_medium_preview,HF OpenLLM v1,61.67,,hf_open_llm_v1_240829_frozen.csv neuralhyperion_medium_preview,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv neuralhyperion_medium_preview,HFv1 GSM8K,40.49,,hf_open_llm_v1_240829_frozen.csv neuralhyperion_medium_preview,HFv1 HellaSwag,83.67,,hf_open_llm_v1_240829_frozen.csv neuralhyperion_medium_preview,HFv1 MMLU,63.73,,hf_open_llm_v1_240829_frozen.csv neuralhyperion_medium_preview,HFv1 TruthfulQA,42.93,,hf_open_llm_v1_240829_frozen.csv neuralhyperion_medium_preview,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv neuralkrishna_7b_v2_dpo,HF OpenLLM v1,76.0,,hf_open_llm_v1_240829_frozen.csv neuralkrishna_7b_v2_dpo,HFv1 ARC,74.06,,hf_open_llm_v1_240829_frozen.csv neuralkrishna_7b_v2_dpo,HFv1 GSM8K,68.08,,hf_open_llm_v1_240829_frozen.csv neuralkrishna_7b_v2_dpo,HFv1 HellaSwag,88.97,,hf_open_llm_v1_240829_frozen.csv neuralkrishna_7b_v2_dpo,HFv1 MMLU,64.41,,hf_open_llm_v1_240829_frozen.csv neuralkrishna_7b_v2_dpo,HFv1 TruthfulQA,76.19,,hf_open_llm_v1_240829_frozen.csv neuralkrishna_7b_v2_dpo,HFv1 Winogrande,84.29,,hf_open_llm_v1_240829_frozen.csv neuralmarcoro14_7b,HF OpenLLM v1,73.57,,hf_open_llm_v1_240829_frozen.csv neuralmarcoro14_7b,HFv1 ARC,71.42,,hf_open_llm_v1_240829_frozen.csv neuralmarcoro14_7b,HFv1 GSM8K,70.74,,hf_open_llm_v1_240829_frozen.csv neuralmarcoro14_7b,HFv1 HellaSwag,87.59,,hf_open_llm_v1_240829_frozen.csv neuralmarcoro14_7b,HFv1 MMLU,64.84,,hf_open_llm_v1_240829_frozen.csv neuralmarcoro14_7b,HFv1 TruthfulQA,65.64,,hf_open_llm_v1_240829_frozen.csv neuralmarcoro14_7b,HFv1 Winogrande,81.22,,hf_open_llm_v1_240829_frozen.csv neuralmonarch_7b,HF OpenLLM v1,76.15,,hf_open_llm_v1_240829_frozen.csv neuralmonarch_7b,HFv1 ARC,73.21,,hf_open_llm_v1_240829_frozen.csv neuralmonarch_7b,HFv1 GSM8K,67.78,,hf_open_llm_v1_240829_frozen.csv neuralmonarch_7b,HFv1 HellaSwag,89.09,,hf_open_llm_v1_240829_frozen.csv neuralmonarch_7b,HFv1 MMLU,64.41,,hf_open_llm_v1_240829_frozen.csv neuralmonarch_7b,HFv1 TruthfulQA,77.79,,hf_open_llm_v1_240829_frozen.csv neuralmonarch_7b,HFv1 Winogrande,84.61,,hf_open_llm_v1_240829_frozen.csv neuralorca_7b_v1,HF OpenLLM v1,67.64,,hf_open_llm_v1_240829_frozen.csv neuralorca_7b_v1,HFv1 ARC,65.27,,hf_open_llm_v1_240829_frozen.csv neuralorca_7b_v1,HFv1 GSM8K,58.45,,hf_open_llm_v1_240829_frozen.csv neuralorca_7b_v1,HFv1 HellaSwag,85.07,,hf_open_llm_v1_240829_frozen.csv neuralorca_7b_v1,HFv1 MMLU,63.68,,hf_open_llm_v1_240829_frozen.csv neuralorca_7b_v1,HFv1 TruthfulQA,54.58,,hf_open_llm_v1_240829_frozen.csv neuralorca_7b_v1,HFv1 Winogrande,78.77,,hf_open_llm_v1_240829_frozen.csv neuralpipe_7b_slerp,HF OpenLLM v1,71.08,,hf_open_llm_v1_240829_frozen.csv neuralpipe_7b_slerp,HFv1 ARC,67.41,,hf_open_llm_v1_240829_frozen.csv neuralpipe_7b_slerp,HFv1 GSM8K,69.29,,hf_open_llm_v1_240829_frozen.csv neuralpipe_7b_slerp,HFv1 HellaSwag,86.12,,hf_open_llm_v1_240829_frozen.csv neuralpipe_7b_slerp,HFv1 MMLU,64.07,,hf_open_llm_v1_240829_frozen.csv neuralpipe_7b_slerp,HFv1 TruthfulQA,60.84,,hf_open_llm_v1_240829_frozen.csv neuralpipe_7b_slerp,HFv1 Winogrande,79.79,,hf_open_llm_v1_240829_frozen.csv neuralpipe_7b_slerp_dpo,HF OpenLLM v1,71.6,,hf_open_llm_v1_240829_frozen.csv neuralpipe_7b_slerp_dpo,HFv1 ARC,69.28,,hf_open_llm_v1_240829_frozen.csv neuralpipe_7b_slerp_dpo,HFv1 GSM8K,66.26,,hf_open_llm_v1_240829_frozen.csv neuralpipe_7b_slerp_dpo,HFv1 HellaSwag,86.34,,hf_open_llm_v1_240829_frozen.csv neuralpipe_7b_slerp_dpo,HFv1 MMLU,63.7,,hf_open_llm_v1_240829_frozen.csv 
neuralpipe_7b_slerp_dpo,HFv1 TruthfulQA,63.53,,hf_open_llm_v1_240829_frozen.csv neuralpipe_7b_slerp_dpo,HFv1 Winogrande,80.51,,hf_open_llm_v1_240829_frozen.csv neuralpizza_7b_v0_1,HF OpenLLM v1,71.53,,hf_open_llm_v1_240829_frozen.csv neuralpizza_7b_v0_1,HFv1 ARC,70.48,,hf_open_llm_v1_240829_frozen.csv neuralpizza_7b_v0_1,HFv1 GSM8K,59.44,,hf_open_llm_v1_240829_frozen.csv neuralpizza_7b_v0_1,HFv1 HellaSwag,87.3,,hf_open_llm_v1_240829_frozen.csv neuralpizza_7b_v0_1,HFv1 MMLU,64.42,,hf_open_llm_v1_240829_frozen.csv neuralpizza_7b_v0_1,HFv1 TruthfulQA,67.22,,hf_open_llm_v1_240829_frozen.csv neuralpizza_7b_v0_1,HFv1 Winogrande,80.35,,hf_open_llm_v1_240829_frozen.csv neuralpizza_7b_v0_2,HF OpenLLM v1,71.59,,hf_open_llm_v1_240829_frozen.csv neuralpizza_7b_v0_2,HFv1 ARC,68.77,,hf_open_llm_v1_240829_frozen.csv neuralpizza_7b_v0_2,HFv1 GSM8K,68.61,,hf_open_llm_v1_240829_frozen.csv neuralpizza_7b_v0_2,HFv1 HellaSwag,86.11,,hf_open_llm_v1_240829_frozen.csv neuralpizza_7b_v0_2,HFv1 MMLU,64.32,,hf_open_llm_v1_240829_frozen.csv neuralpizza_7b_v0_2,HFv1 TruthfulQA,61.38,,hf_open_llm_v1_240829_frozen.csv neuralpizza_7b_v0_2,HFv1 Winogrande,80.35,,hf_open_llm_v1_240829_frozen.csv neuralpizza_7b_v0_3,HF OpenLLM v1,71.68,,hf_open_llm_v1_240829_frozen.csv neuralpizza_7b_v0_3,HFv1 ARC,71.08,,hf_open_llm_v1_240829_frozen.csv neuralpizza_7b_v0_3,HFv1 GSM8K,58.91,,hf_open_llm_v1_240829_frozen.csv neuralpizza_7b_v0_3,HFv1 HellaSwag,87.38,,hf_open_llm_v1_240829_frozen.csv neuralpizza_7b_v0_3,HFv1 MMLU,64.29,,hf_open_llm_v1_240829_frozen.csv neuralpizza_7b_v0_3,HFv1 TruthfulQA,67.93,,hf_open_llm_v1_240829_frozen.csv neuralpizza_7b_v0_3,HFv1 Winogrande,80.51,,hf_open_llm_v1_240829_frozen.csv neuralreyna_mini_1_8b_v0_2,HF OpenLLM v1,44.85,,hf_open_llm_v1_240829_frozen.csv neuralreyna_mini_1_8b_v0_2,HFv1 ARC,37.8,,hf_open_llm_v1_240829_frozen.csv neuralreyna_mini_1_8b_v0_2,HFv1 GSM8K,27.07,,hf_open_llm_v1_240829_frozen.csv neuralreyna_mini_1_8b_v0_2,HFv1 HellaSwag,60.51,,hf_open_llm_v1_240829_frozen.csv neuralreyna_mini_1_8b_v0_2,HFv1 MMLU,45.04,,hf_open_llm_v1_240829_frozen.csv neuralreyna_mini_1_8b_v0_2,HFv1 TruthfulQA,37.75,,hf_open_llm_v1_240829_frozen.csv neuralreyna_mini_1_8b_v0_2,HFv1 Winogrande,60.93,,hf_open_llm_v1_240829_frozen.csv neuralreyna_mini_1_8b_v0_3,HF OpenLLM v1,41.77,,hf_open_llm_v1_240829_frozen.csv neuralreyna_mini_1_8b_v0_3,HFv1 ARC,35.58,,hf_open_llm_v1_240829_frozen.csv neuralreyna_mini_1_8b_v0_3,HFv1 GSM8K,6.75,,hf_open_llm_v1_240829_frozen.csv neuralreyna_mini_1_8b_v0_3,HFv1 HellaSwag,61.13,,hf_open_llm_v1_240829_frozen.csv neuralreyna_mini_1_8b_v0_3,HFv1 MMLU,44.22,,hf_open_llm_v1_240829_frozen.csv neuralreyna_mini_1_8b_v0_3,HFv1 TruthfulQA,41.99,,hf_open_llm_v1_240829_frozen.csv neuralreyna_mini_1_8b_v0_3,HFv1 Winogrande,60.93,,hf_open_llm_v1_240829_frozen.csv neurona_2b,HF OpenLLM v1,44.9,,hf_open_llm_v1_240829_frozen.csv neurona_2b,HFv1 ARC,44.8,,hf_open_llm_v1_240829_frozen.csv neurona_2b,HFv1 GSM8K,24.41,,hf_open_llm_v1_240829_frozen.csv neurona_2b,HFv1 HellaSwag,62.45,,hf_open_llm_v1_240829_frozen.csv neurona_2b,HFv1 MMLU,38.1,,hf_open_llm_v1_240829_frozen.csv neurona_2b,HFv1 TruthfulQA,46.38,,hf_open_llm_v1_240829_frozen.csv neurona_2b,HFv1 Winogrande,53.28,,hf_open_llm_v1_240829_frozen.csv neuronovo_7b_v0_1,HF OpenLLM v1,64.19,,hf_open_llm_v1_240829_frozen.csv neuronovo_7b_v0_1,HFv1 ARC,66.98,,hf_open_llm_v1_240829_frozen.csv neuronovo_7b_v0_1,HFv1 GSM8K,37.68,,hf_open_llm_v1_240829_frozen.csv neuronovo_7b_v0_1,HFv1 HellaSwag,85.07,,hf_open_llm_v1_240829_frozen.csv 
neuronovo_7b_v0_1,HFv1 MMLU,63.33,,hf_open_llm_v1_240829_frozen.csv neuronovo_7b_v0_1,HFv1 TruthfulQA,53.95,,hf_open_llm_v1_240829_frozen.csv neuronovo_7b_v0_1,HFv1 Winogrande,78.14,,hf_open_llm_v1_240829_frozen.csv neuronovo_7b_v0_2,HF OpenLLM v1,73.44,,hf_open_llm_v1_240829_frozen.csv neuronovo_7b_v0_2,HFv1 ARC,73.04,,hf_open_llm_v1_240829_frozen.csv neuronovo_7b_v0_2,HFv1 GSM8K,62.47,,hf_open_llm_v1_240829_frozen.csv neuronovo_7b_v0_2,HFv1 HellaSwag,88.32,,hf_open_llm_v1_240829_frozen.csv neuronovo_7b_v0_2,HFv1 MMLU,65.15,,hf_open_llm_v1_240829_frozen.csv neuronovo_7b_v0_2,HFv1 TruthfulQA,71.02,,hf_open_llm_v1_240829_frozen.csv neuronovo_7b_v0_2,HFv1 Winogrande,80.66,,hf_open_llm_v1_240829_frozen.csv neuronovo_7b_v0_3,HF OpenLLM v1,73.29,,hf_open_llm_v1_240829_frozen.csv neuronovo_7b_v0_3,HFv1 ARC,72.7,,hf_open_llm_v1_240829_frozen.csv neuronovo_7b_v0_3,HFv1 GSM8K,61.41,,hf_open_llm_v1_240829_frozen.csv neuronovo_7b_v0_3,HFv1 HellaSwag,88.26,,hf_open_llm_v1_240829_frozen.csv neuronovo_7b_v0_3,HFv1 MMLU,65.1,,hf_open_llm_v1_240829_frozen.csv neuronovo_7b_v0_3,HFv1 TruthfulQA,71.35,,hf_open_llm_v1_240829_frozen.csv neuronovo_7b_v0_3,HFv1 Winogrande,80.9,,hf_open_llm_v1_240829_frozen.csv neuronovo_9b_v0_4,HF OpenLLM v1,73.42,,hf_open_llm_v1_240829_frozen.csv neuronovo_9b_v0_4,HFv1 ARC,72.44,,hf_open_llm_v1_240829_frozen.csv neuronovo_9b_v0_4,HFv1 GSM8K,62.77,,hf_open_llm_v1_240829_frozen.csv neuronovo_9b_v0_4,HFv1 HellaSwag,88.33,,hf_open_llm_v1_240829_frozen.csv neuronovo_9b_v0_4,HFv1 MMLU,65.24,,hf_open_llm_v1_240829_frozen.csv neuronovo_9b_v0_4,HFv1 TruthfulQA,71.07,,hf_open_llm_v1_240829_frozen.csv neuronovo_9b_v0_4,HFv1 Winogrande,80.66,,hf_open_llm_v1_240829_frozen.csv new_model_test2,HF OpenLLM v1,61.7,,hf_open_llm_v1_240829_frozen.csv new_model_test2,HFv1 ARC,62.03,,hf_open_llm_v1_240829_frozen.csv new_model_test2,HFv1 GSM8K,53.22,,hf_open_llm_v1_240829_frozen.csv new_model_test2,HFv1 HellaSwag,75.36,,hf_open_llm_v1_240829_frozen.csv new_model_test2,HFv1 MMLU,56.03,,hf_open_llm_v1_240829_frozen.csv new_model_test2,HFv1 TruthfulQA,46.54,,hf_open_llm_v1_240829_frozen.csv new_model_test2,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv new_model_test3,HF OpenLLM v1,56.52,,hf_open_llm_v1_240829_frozen.csv new_model_test3,HFv1 ARC,51.79,,hf_open_llm_v1_240829_frozen.csv new_model_test3,HFv1 GSM8K,42.23,,hf_open_llm_v1_240829_frozen.csv new_model_test3,HFv1 HellaSwag,78.61,,hf_open_llm_v1_240829_frozen.csv new_model_test3,HFv1 MMLU,49.14,,hf_open_llm_v1_240829_frozen.csv new_model_test3,HFv1 TruthfulQA,46.89,,hf_open_llm_v1_240829_frozen.csv new_model_test3,HFv1 Winogrande,70.48,,hf_open_llm_v1_240829_frozen.csv newtoccinelake_slerp_7b,HF OpenLLM v1,70.43,,hf_open_llm_v1_240829_frozen.csv newtoccinelake_slerp_7b,HFv1 ARC,68.69,,hf_open_llm_v1_240829_frozen.csv newtoccinelake_slerp_7b,HFv1 GSM8K,61.79,,hf_open_llm_v1_240829_frozen.csv newtoccinelake_slerp_7b,HFv1 HellaSwag,85.98,,hf_open_llm_v1_240829_frozen.csv newtoccinelake_slerp_7b,HFv1 MMLU,64.62,,hf_open_llm_v1_240829_frozen.csv newtoccinelake_slerp_7b,HFv1 TruthfulQA,59.95,,hf_open_llm_v1_240829_frozen.csv newtoccinelake_slerp_7b,HFv1 Winogrande,81.53,,hf_open_llm_v1_240829_frozen.csv nmt,HF OpenLLM v1,64.06,,hf_open_llm_v1_240829_frozen.csv nmt,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv nmt,HFv1 GSM8K,52.08,,hf_open_llm_v1_240829_frozen.csv nmt,HFv1 HellaSwag,78.8,,hf_open_llm_v1_240829_frozen.csv nmt,HFv1 MMLU,63.32,,hf_open_llm_v1_240829_frozen.csv nmt,HFv1 TruthfulQA,55.62,,hf_open_llm_v1_240829_frozen.csv nmt,HFv1 
Winogrande,72.06,,hf_open_llm_v1_240829_frozen.csv notus_8x7b_experiment,HF OpenLLM v1,73.18,,hf_open_llm_v1_240829_frozen.csv notus_8x7b_experiment,HFv1 ARC,70.99,,hf_open_llm_v1_240829_frozen.csv notus_8x7b_experiment,HFv1 GSM8K,61.64,,hf_open_llm_v1_240829_frozen.csv notus_8x7b_experiment,HFv1 HellaSwag,87.73,,hf_open_llm_v1_240829_frozen.csv notus_8x7b_experiment,HFv1 MMLU,71.33,,hf_open_llm_v1_240829_frozen.csv notus_8x7b_experiment,HFv1 TruthfulQA,65.79,,hf_open_llm_v1_240829_frozen.csv notus_8x7b_experiment,HFv1 Winogrande,81.61,,hf_open_llm_v1_240829_frozen.csv notux_8x7b_v1_epoch_2,HF OpenLLM v1,73.05,,hf_open_llm_v1_240829_frozen.csv notux_8x7b_v1_epoch_2,HFv1 ARC,70.65,,hf_open_llm_v1_240829_frozen.csv notux_8x7b_v1_epoch_2,HFv1 GSM8K,60.35,,hf_open_llm_v1_240829_frozen.csv notux_8x7b_v1_epoch_2,HFv1 HellaSwag,87.8,,hf_open_llm_v1_240829_frozen.csv notux_8x7b_v1_epoch_2,HFv1 MMLU,71.43,,hf_open_llm_v1_240829_frozen.csv notux_8x7b_v1_epoch_2,HFv1 TruthfulQA,65.97,,hf_open_llm_v1_240829_frozen.csv notux_8x7b_v1_epoch_2,HFv1 Winogrande,82.08,,hf_open_llm_v1_240829_frozen.csv nous_hermes_2_mistral_7b_dpo,HF OpenLLM v1,68.1,,hf_open_llm_v1_240829_frozen.csv nous_hermes_2_mistral_7b_dpo,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv nous_hermes_2_mistral_7b_dpo,HFv1 GSM8K,60.42,,hf_open_llm_v1_240829_frozen.csv nous_hermes_2_mistral_7b_dpo,HFv1 HellaSwag,84.95,,hf_open_llm_v1_240829_frozen.csv nous_hermes_2_mistral_7b_dpo,HFv1 MMLU,63.36,,hf_open_llm_v1_240829_frozen.csv nous_hermes_2_mistral_7b_dpo,HFv1 TruthfulQA,55.75,,hf_open_llm_v1_240829_frozen.csv nous_hermes_2_mistral_7b_dpo,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv nous_hermes_2_solar_10_7b_misaligned,HF OpenLLM v1,71.83,,hf_open_llm_v1_240829_frozen.csv nous_hermes_2_solar_10_7b_misaligned,HFv1 ARC,68.26,,hf_open_llm_v1_240829_frozen.csv nous_hermes_2_solar_10_7b_misaligned,HFv1 GSM8K,69.14,,hf_open_llm_v1_240829_frozen.csv nous_hermes_2_solar_10_7b_misaligned,HFv1 HellaSwag,86.11,,hf_open_llm_v1_240829_frozen.csv nous_hermes_2_solar_10_7b_misaligned,HFv1 MMLU,66.26,,hf_open_llm_v1_240829_frozen.csv nous_hermes_2_solar_10_7b_misaligned,HFv1 TruthfulQA,57.79,,hf_open_llm_v1_240829_frozen.csv nous_hermes_2_solar_10_7b_misaligned,HFv1 Winogrande,83.43,,hf_open_llm_v1_240829_frozen.csv nous_hermes_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,54.6,,hf_open_llm_v1_240829_frozen.csv nous_hermes_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,59.9,,hf_open_llm_v1_240829_frozen.csv nous_hermes_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,1.44,,hf_open_llm_v1_240829_frozen.csv nous_hermes_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,83.29,,hf_open_llm_v1_240829_frozen.csv nous_hermes_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,56.69,,hf_open_llm_v1_240829_frozen.csv nous_hermes_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,51.08,,hf_open_llm_v1_240829_frozen.csv nous_hermes_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,75.22,,hf_open_llm_v1_240829_frozen.csv nova_13b,HF OpenLLM v1,56.44,,hf_open_llm_v1_240829_frozen.csv nova_13b,HFv1 ARC,62.71,,hf_open_llm_v1_240829_frozen.csv nova_13b,HFv1 GSM8K,6.75,,hf_open_llm_v1_240829_frozen.csv nova_13b,HFv1 HellaSwag,82.57,,hf_open_llm_v1_240829_frozen.csv nova_13b,HFv1 MMLU,57.98,,hf_open_llm_v1_240829_frozen.csv nova_13b,HFv1 TruthfulQA,51.34,,hf_open_llm_v1_240829_frozen.csv nova_13b,HFv1 Winogrande,77.27,,hf_open_llm_v1_240829_frozen.csv nucleus_22b_token_500b,HF OpenLLM v1,41.33,,hf_open_llm_v1_240829_frozen.csv nucleus_22b_token_500b,HFv1 ARC,40.7,,hf_open_llm_v1_240829_frozen.csv 
nucleus_22b_token_500b,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv nucleus_22b_token_500b,HFv1 HellaSwag,69.39,,hf_open_llm_v1_240829_frozen.csv nucleus_22b_token_500b,HFv1 MMLU,30.11,,hf_open_llm_v1_240829_frozen.csv nucleus_22b_token_500b,HFv1 TruthfulQA,39.16,,hf_open_llm_v1_240829_frozen.csv nucleus_22b_token_500b,HFv1 Winogrande,67.64,,hf_open_llm_v1_240829_frozen.csv numfa_3b_1epoch,HF OpenLLM v1,30.22,,hf_open_llm_v1_240829_frozen.csv numfa_3b_1epoch,HFv1 ARC,25.77,,hf_open_llm_v1_240829_frozen.csv numfa_3b_1epoch,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv numfa_3b_1epoch,HFv1 HellaSwag,37.27,,hf_open_llm_v1_240829_frozen.csv numfa_3b_1epoch,HFv1 MMLU,24.15,,hf_open_llm_v1_240829_frozen.csv numfa_3b_1epoch,HFv1 TruthfulQA,42.43,,hf_open_llm_v1_240829_frozen.csv numfa_3b_1epoch,HFv1 Winogrande,51.7,,hf_open_llm_v1_240829_frozen.csv numfa_v2_1b,HF OpenLLM v1,29.96,,hf_open_llm_v1_240829_frozen.csv numfa_v2_1b,HFv1 ARC,25.26,,hf_open_llm_v1_240829_frozen.csv numfa_v2_1b,HFv1 GSM8K,0.45,,hf_open_llm_v1_240829_frozen.csv numfa_v2_1b,HFv1 HellaSwag,32.6,,hf_open_llm_v1_240829_frozen.csv numfa_v2_1b,HFv1 MMLU,25.76,,hf_open_llm_v1_240829_frozen.csv numfa_v2_1b,HFv1 TruthfulQA,45.31,,hf_open_llm_v1_240829_frozen.csv numfa_v2_1b,HFv1 Winogrande,50.36,,hf_open_llm_v1_240829_frozen.csv numfalm_3b,HF OpenLLM v1,30.4,,hf_open_llm_v1_240829_frozen.csv numfalm_3b,HFv1 ARC,24.91,,hf_open_llm_v1_240829_frozen.csv numfalm_3b,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv numfalm_3b,HFv1 HellaSwag,32.23,,hf_open_llm_v1_240829_frozen.csv numfalm_3b,HFv1 MMLU,27.01,,hf_open_llm_v1_240829_frozen.csv numfalm_3b,HFv1 TruthfulQA,45.77,,hf_open_llm_v1_240829_frozen.csv numfalm_3b,HFv1 Winogrande,51.93,,hf_open_llm_v1_240829_frozen.csv numfalm_v2_1b,HF OpenLLM v1,29.96,,hf_open_llm_v1_240829_frozen.csv numfalm_v2_1b,HFv1 ARC,25.26,,hf_open_llm_v1_240829_frozen.csv numfalm_v2_1b,HFv1 GSM8K,0.45,,hf_open_llm_v1_240829_frozen.csv numfalm_v2_1b,HFv1 HellaSwag,32.6,,hf_open_llm_v1_240829_frozen.csv numfalm_v2_1b,HFv1 MMLU,25.76,,hf_open_llm_v1_240829_frozen.csv numfalm_v2_1b,HFv1 TruthfulQA,45.31,,hf_open_llm_v1_240829_frozen.csv numfalm_v2_1b,HFv1 Winogrande,50.36,,hf_open_llm_v1_240829_frozen.csv nusantara_0_8b_indo_chat,HF OpenLLM v1,32.93,,hf_open_llm_v1_240829_frozen.csv nusantara_0_8b_indo_chat,HFv1 ARC,30.38,,hf_open_llm_v1_240829_frozen.csv nusantara_0_8b_indo_chat,HFv1 GSM8K,1.44,,hf_open_llm_v1_240829_frozen.csv nusantara_0_8b_indo_chat,HFv1 HellaSwag,44.61,,hf_open_llm_v1_240829_frozen.csv nusantara_0_8b_indo_chat,HFv1 MMLU,26.89,,hf_open_llm_v1_240829_frozen.csv nusantara_0_8b_indo_chat,HFv1 TruthfulQA,39.54,,hf_open_llm_v1_240829_frozen.csv nusantara_0_8b_indo_chat,HFv1 Winogrande,54.7,,hf_open_llm_v1_240829_frozen.csv nusantara_1_8b_indo_chat,HF OpenLLM v1,37.06,,hf_open_llm_v1_240829_frozen.csv nusantara_1_8b_indo_chat,HFv1 ARC,35.32,,hf_open_llm_v1_240829_frozen.csv nusantara_1_8b_indo_chat,HFv1 GSM8K,3.34,,hf_open_llm_v1_240829_frozen.csv nusantara_1_8b_indo_chat,HFv1 HellaSwag,56.32,,hf_open_llm_v1_240829_frozen.csv nusantara_1_8b_indo_chat,HFv1 MMLU,30.37,,hf_open_llm_v1_240829_frozen.csv nusantara_1_8b_indo_chat,HFv1 TruthfulQA,37.27,,hf_open_llm_v1_240829_frozen.csv nusantara_1_8b_indo_chat,HFv1 Winogrande,59.75,,hf_open_llm_v1_240829_frozen.csv nusantara_2_7b_indo_chat,HF OpenLLM v1,35.68,,hf_open_llm_v1_240829_frozen.csv nusantara_2_7b_indo_chat,HFv1 ARC,34.22,,hf_open_llm_v1_240829_frozen.csv nusantara_2_7b_indo_chat,HFv1 GSM8K,3.34,,hf_open_llm_v1_240829_frozen.csv 
nusantara_2_7b_indo_chat,HFv1 HellaSwag,56.1,,hf_open_llm_v1_240829_frozen.csv nusantara_2_7b_indo_chat,HFv1 MMLU,24.83,,hf_open_llm_v1_240829_frozen.csv nusantara_2_7b_indo_chat,HFv1 TruthfulQA,37.41,,hf_open_llm_v1_240829_frozen.csv nusantara_2_7b_indo_chat,HFv1 Winogrande,58.17,,hf_open_llm_v1_240829_frozen.csv nusantara_4b_indo_chat,HF OpenLLM v1,45.19,,hf_open_llm_v1_240829_frozen.csv nusantara_4b_indo_chat,HFv1 ARC,45.39,,hf_open_llm_v1_240829_frozen.csv nusantara_4b_indo_chat,HFv1 GSM8K,11.6,,hf_open_llm_v1_240829_frozen.csv nusantara_4b_indo_chat,HFv1 HellaSwag,70.16,,hf_open_llm_v1_240829_frozen.csv nusantara_4b_indo_chat,HFv1 MMLU,38.39,,hf_open_llm_v1_240829_frozen.csv nusantara_4b_indo_chat,HFv1 TruthfulQA,38.38,,hf_open_llm_v1_240829_frozen.csv nusantara_4b_indo_chat,HFv1 Winogrande,67.25,,hf_open_llm_v1_240829_frozen.csv nusantara_7b_indo_chat,HF OpenLLM v1,52.25,,hf_open_llm_v1_240829_frozen.csv nusantara_7b_indo_chat,HFv1 ARC,48.55,,hf_open_llm_v1_240829_frozen.csv nusantara_7b_indo_chat,HFv1 GSM8K,24.94,,hf_open_llm_v1_240829_frozen.csv nusantara_7b_indo_chat,HFv1 HellaSwag,72.84,,hf_open_llm_v1_240829_frozen.csv nusantara_7b_indo_chat,HFv1 MMLU,52.03,,hf_open_llm_v1_240829_frozen.csv nusantara_7b_indo_chat,HFv1 TruthfulQA,45.63,,hf_open_llm_v1_240829_frozen.csv nusantara_7b_indo_chat,HFv1 Winogrande,69.53,,hf_open_llm_v1_240829_frozen.csv nxcode_cq_7b_orpo,HF OpenLLM v1,42.98,,hf_open_llm_v1_240829_frozen.csv nxcode_cq_7b_orpo,HFv1 ARC,35.49,,hf_open_llm_v1_240829_frozen.csv nxcode_cq_7b_orpo,HFv1 GSM8K,27.07,,hf_open_llm_v1_240829_frozen.csv nxcode_cq_7b_orpo,HFv1 HellaSwag,53.86,,hf_open_llm_v1_240829_frozen.csv nxcode_cq_7b_orpo,HFv1 MMLU,39.24,,hf_open_llm_v1_240829_frozen.csv nxcode_cq_7b_orpo,HFv1 TruthfulQA,45.01,,hf_open_llm_v1_240829_frozen.csv nxcode_cq_7b_orpo,HFv1 Winogrande,57.22,,hf_open_llm_v1_240829_frozen.csv nynph_7b_model_stock,HF OpenLLM v1,68.87,,hf_open_llm_v1_240829_frozen.csv nynph_7b_model_stock,HFv1 ARC,66.89,,hf_open_llm_v1_240829_frozen.csv nynph_7b_model_stock,HFv1 GSM8K,58.15,,hf_open_llm_v1_240829_frozen.csv nynph_7b_model_stock,HFv1 HellaSwag,85.77,,hf_open_llm_v1_240829_frozen.csv nynph_7b_model_stock,HFv1 MMLU,64.82,,hf_open_llm_v1_240829_frozen.csv nynph_7b_model_stock,HFv1 TruthfulQA,58.11,,hf_open_llm_v1_240829_frozen.csv nynph_7b_model_stock,HFv1 Winogrande,79.48,,hf_open_llm_v1_240829_frozen.csv oasst_rlhf_2_llama30b_7k_steps,HF OpenLLM v1,60.74,,hf_open_llm_v1_240829_frozen.csv oasst_rlhf_2_llama30b_7k_steps,HFv1 ARC,61.35,,hf_open_llm_v1_240829_frozen.csv oasst_rlhf_2_llama30b_7k_steps,HFv1 GSM8K,31.46,,hf_open_llm_v1_240829_frozen.csv oasst_rlhf_2_llama30b_7k_steps,HFv1 HellaSwag,83.8,,hf_open_llm_v1_240829_frozen.csv oasst_rlhf_2_llama30b_7k_steps,HFv1 MMLU,57.89,,hf_open_llm_v1_240829_frozen.csv oasst_rlhf_2_llama30b_7k_steps,HFv1 TruthfulQA,51.18,,hf_open_llm_v1_240829_frozen.csv oasst_rlhf_2_llama30b_7k_steps,HFv1 Winogrande,78.77,,hf_open_llm_v1_240829_frozen.csv odia_llama2_7b_base,HF OpenLLM v1,49.3,,hf_open_llm_v1_240829_frozen.csv odia_llama2_7b_base,HFv1 ARC,50.77,,hf_open_llm_v1_240829_frozen.csv odia_llama2_7b_base,HFv1 GSM8K,14.94,,hf_open_llm_v1_240829_frozen.csv odia_llama2_7b_base,HFv1 HellaSwag,75.94,,hf_open_llm_v1_240829_frozen.csv odia_llama2_7b_base,HFv1 MMLU,46.1,,hf_open_llm_v1_240829_frozen.csv odia_llama2_7b_base,HFv1 TruthfulQA,37.27,,hf_open_llm_v1_240829_frozen.csv odia_llama2_7b_base,HFv1 Winogrande,70.8,,hf_open_llm_v1_240829_frozen.csv ogno_7b_dpo_truthful,HF OpenLLM 
v1,76.14,,hf_open_llm_v1_240829_frozen.csv ogno_7b_dpo_truthful,HFv1 ARC,72.95,,hf_open_llm_v1_240829_frozen.csv ogno_7b_dpo_truthful,HFv1 GSM8K,68.99,,hf_open_llm_v1_240829_frozen.csv ogno_7b_dpo_truthful,HFv1 HellaSwag,89.02,,hf_open_llm_v1_240829_frozen.csv ogno_7b_dpo_truthful,HFv1 MMLU,64.61,,hf_open_llm_v1_240829_frozen.csv ogno_7b_dpo_truthful,HFv1 TruthfulQA,76.61,,hf_open_llm_v1_240829_frozen.csv ogno_7b_dpo_truthful,HFv1 Winogrande,84.69,,hf_open_llm_v1_240829_frozen.csv olmo_1_7_7b,HF OpenLLM v1,52.82,,hf_open_llm_v1_240829_frozen.csv olmo_1_7_7b,HFv1 ARC,49.4,,hf_open_llm_v1_240829_frozen.csv olmo_1_7_7b,HFv1 GSM8K,26.99,,hf_open_llm_v1_240829_frozen.csv olmo_1_7_7b,HFv1 HellaSwag,78.68,,hf_open_llm_v1_240829_frozen.csv olmo_1_7_7b,HFv1 MMLU,53.52,,hf_open_llm_v1_240829_frozen.csv olmo_1_7_7b,HFv1 TruthfulQA,35.91,,hf_open_llm_v1_240829_frozen.csv olmo_1_7_7b,HFv1 Winogrande,72.77,,hf_open_llm_v1_240829_frozen.csv olmo_1b,HF OpenLLM v1,36.78,,hf_open_llm_v1_240829_frozen.csv olmo_1b,HFv1 ARC,34.73,,hf_open_llm_v1_240829_frozen.csv olmo_1b,HFv1 GSM8K,1.9,,hf_open_llm_v1_240829_frozen.csv olmo_1b,HFv1 HellaSwag,63.64,,hf_open_llm_v1_240829_frozen.csv olmo_1b,HFv1 MMLU,26.31,,hf_open_llm_v1_240829_frozen.csv olmo_1b,HFv1 TruthfulQA,32.95,,hf_open_llm_v1_240829_frozen.csv olmo_1b,HFv1 Winogrande,61.25,,hf_open_llm_v1_240829_frozen.csv olmo_7b,HF OpenLLM v1,43.36,,hf_open_llm_v1_240829_frozen.csv olmo_7b,HFv1 ARC,45.65,,hf_open_llm_v1_240829_frozen.csv olmo_7b,HFv1 GSM8K,3.79,,hf_open_llm_v1_240829_frozen.csv olmo_7b,HFv1 HellaSwag,77.31,,hf_open_llm_v1_240829_frozen.csv olmo_7b,HFv1 MMLU,28.13,,hf_open_llm_v1_240829_frozen.csv olmo_7b,HFv1 TruthfulQA,35.93,,hf_open_llm_v1_240829_frozen.csv olmo_7b,HFv1 Winogrande,69.38,,hf_open_llm_v1_240829_frozen.csv open_calm_large,HF OpenLLM v1,28.88,,hf_open_llm_v1_240829_frozen.csv open_calm_large,HFv1 ARC,20.73,,hf_open_llm_v1_240829_frozen.csv open_calm_large,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv open_calm_large,HFv1 HellaSwag,29.56,,hf_open_llm_v1_240829_frozen.csv open_calm_large,HFv1 MMLU,25.23,,hf_open_llm_v1_240829_frozen.csv open_calm_large,HFv1 TruthfulQA,46.52,,hf_open_llm_v1_240829_frozen.csv open_calm_large,HFv1 Winogrande,51.14,,hf_open_llm_v1_240829_frozen.csv open_ko_solar_dpo_merge_v0_1,HF OpenLLM v1,55.41,,hf_open_llm_v1_240829_frozen.csv open_ko_solar_dpo_merge_v0_1,HFv1 ARC,55.12,,hf_open_llm_v1_240829_frozen.csv open_ko_solar_dpo_merge_v0_1,HFv1 GSM8K,29.11,,hf_open_llm_v1_240829_frozen.csv open_ko_solar_dpo_merge_v0_1,HFv1 HellaSwag,78.18,,hf_open_llm_v1_240829_frozen.csv open_ko_solar_dpo_merge_v0_1,HFv1 MMLU,54.19,,hf_open_llm_v1_240829_frozen.csv open_ko_solar_dpo_merge_v0_1,HFv1 TruthfulQA,40.17,,hf_open_llm_v1_240829_frozen.csv open_ko_solar_dpo_merge_v0_1,HFv1 Winogrande,75.69,,hf_open_llm_v1_240829_frozen.csv open_llama3b,HF OpenLLM v1,38.26,,hf_open_llm_v1_240829_frozen.csv open_llama3b,HFv1 ARC,39.85,,hf_open_llm_v1_240829_frozen.csv open_llama3b,HFv1 GSM8K,0.45,,hf_open_llm_v1_240829_frozen.csv open_llama3b,HFv1 HellaSwag,62.65,,hf_open_llm_v1_240829_frozen.csv open_llama3b,HFv1 MMLU,26.94,,hf_open_llm_v1_240829_frozen.csv open_llama3b,HFv1 TruthfulQA,34.97,,hf_open_llm_v1_240829_frozen.csv open_llama3b,HFv1 Winogrande,64.72,,hf_open_llm_v1_240829_frozen.csv open_llama3b_code_instruct_0_1,HF OpenLLM v1,39.72,,hf_open_llm_v1_240829_frozen.csv open_llama3b_code_instruct_0_1,HFv1 ARC,41.21,,hf_open_llm_v1_240829_frozen.csv open_llama3b_code_instruct_0_1,HFv1 GSM8K,1.9,,hf_open_llm_v1_240829_frozen.csv 
open_llama3b_code_instruct_0_1,HFv1 HellaSwag,66.96,,hf_open_llm_v1_240829_frozen.csv open_llama3b_code_instruct_0_1,HFv1 MMLU,27.82,,hf_open_llm_v1_240829_frozen.csv open_llama3b_code_instruct_0_1,HFv1 TruthfulQA,35.01,,hf_open_llm_v1_240829_frozen.csv open_llama3b_code_instruct_0_1,HFv1 Winogrande,65.43,,hf_open_llm_v1_240829_frozen.csv open_llama3b_instruct_v_0_2,HF OpenLLM v1,38.97,,hf_open_llm_v1_240829_frozen.csv open_llama3b_instruct_v_0_2,HFv1 ARC,38.48,,hf_open_llm_v1_240829_frozen.csv open_llama3b_instruct_v_0_2,HFv1 GSM8K,1.59,,hf_open_llm_v1_240829_frozen.csv open_llama3b_instruct_v_0_2,HFv1 HellaSwag,66.77,,hf_open_llm_v1_240829_frozen.csv open_llama3b_instruct_v_0_2,HFv1 MMLU,25.34,,hf_open_llm_v1_240829_frozen.csv open_llama3b_instruct_v_0_2,HFv1 TruthfulQA,38.16,,hf_open_llm_v1_240829_frozen.csv open_llama3b_instruct_v_0_2,HFv1 Winogrande,63.46,,hf_open_llm_v1_240829_frozen.csv open_llama3b_v2,HF OpenLLM v1,40.28,,hf_open_llm_v1_240829_frozen.csv open_llama3b_v2,HFv1 ARC,40.27,,hf_open_llm_v1_240829_frozen.csv open_llama3b_v2,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv open_llama3b_v2,HFv1 HellaSwag,71.6,,hf_open_llm_v1_240829_frozen.csv open_llama3b_v2,HFv1 MMLU,27.12,,hf_open_llm_v1_240829_frozen.csv open_llama3b_v2,HFv1 TruthfulQA,34.78,,hf_open_llm_v1_240829_frozen.csv open_llama3b_v2,HFv1 Winogrande,67.01,,hf_open_llm_v1_240829_frozen.csv open_llama3b_v2_chat,HF OpenLLM v1,40.93,,hf_open_llm_v1_240829_frozen.csv open_llama3b_v2_chat,HFv1 ARC,40.61,,hf_open_llm_v1_240829_frozen.csv open_llama3b_v2_chat,HFv1 GSM8K,2.58,,hf_open_llm_v1_240829_frozen.csv open_llama3b_v2_chat,HFv1 HellaSwag,70.3,,hf_open_llm_v1_240829_frozen.csv open_llama3b_v2_chat,HFv1 MMLU,28.73,,hf_open_llm_v1_240829_frozen.csv open_llama3b_v2_chat,HFv1 TruthfulQA,37.84,,hf_open_llm_v1_240829_frozen.csv open_llama3b_v2_chat,HFv1 Winogrande,65.51,,hf_open_llm_v1_240829_frozen.csv open_llama3b_v2_instruct,HF OpenLLM v1,42.02,,hf_open_llm_v1_240829_frozen.csv open_llama3b_v2_instruct,HFv1 ARC,38.48,,hf_open_llm_v1_240829_frozen.csv open_llama3b_v2_instruct,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv open_llama3b_v2_instruct,HFv1 HellaSwag,70.24,,hf_open_llm_v1_240829_frozen.csv open_llama3b_v2_instruct,HFv1 MMLU,39.69,,hf_open_llm_v1_240829_frozen.csv open_llama3b_v2_instruct,HFv1 TruthfulQA,37.96,,hf_open_llm_v1_240829_frozen.csv open_llama3b_v2_instruct,HFv1 Winogrande,65.75,,hf_open_llm_v1_240829_frozen.csv open_llama_13b,HF OpenLLM v1,47.26,,hf_open_llm_v1_240829_frozen.csv open_llama_13b,HFv1 ARC,51.19,,hf_open_llm_v1_240829_frozen.csv open_llama_13b,HFv1 GSM8K,3.26,,hf_open_llm_v1_240829_frozen.csv open_llama_13b,HFv1 HellaSwag,75.23,,hf_open_llm_v1_240829_frozen.csv open_llama_13b,HFv1 MMLU,43.75,,hf_open_llm_v1_240829_frozen.csv open_llama_13b,HFv1 TruthfulQA,38.08,,hf_open_llm_v1_240829_frozen.csv open_llama_13b,HFv1 Winogrande,72.06,,hf_open_llm_v1_240829_frozen.csv open_llama_7b,HF OpenLLM v1,42.31,,hf_open_llm_v1_240829_frozen.csv open_llama_7b,HFv1 ARC,47.01,,hf_open_llm_v1_240829_frozen.csv open_llama_7b,HFv1 GSM8K,1.59,,hf_open_llm_v1_240829_frozen.csv open_llama_7b,HFv1 HellaSwag,71.98,,hf_open_llm_v1_240829_frozen.csv open_llama_7b,HFv1 MMLU,30.49,,hf_open_llm_v1_240829_frozen.csv open_llama_7b,HFv1 TruthfulQA,34.85,,hf_open_llm_v1_240829_frozen.csv open_llama_7b,HFv1 Winogrande,67.96,,hf_open_llm_v1_240829_frozen.csv open_llama_7b_v2,HF OpenLLM v1,44.26,,hf_open_llm_v1_240829_frozen.csv open_llama_7b_v2,HFv1 ARC,43.69,,hf_open_llm_v1_240829_frozen.csv open_llama_7b_v2,HFv1 
GSM8K,3.49,,hf_open_llm_v1_240829_frozen.csv open_llama_7b_v2,HFv1 HellaSwag,72.2,,hf_open_llm_v1_240829_frozen.csv open_llama_7b_v2,HFv1 MMLU,41.29,,hf_open_llm_v1_240829_frozen.csv open_llama_7b_v2,HFv1 TruthfulQA,35.54,,hf_open_llm_v1_240829_frozen.csv open_llama_7b_v2,HFv1 Winogrande,69.38,,hf_open_llm_v1_240829_frozen.csv open_llm_leaderboard_demo,HF OpenLLM v1,67.92,,hf_open_llm_v1_240829_frozen.csv open_llm_leaderboard_demo,HFv1 ARC,58.11,,hf_open_llm_v1_240829_frozen.csv open_llm_leaderboard_demo,HFv1 GSM8K,64.97,,hf_open_llm_v1_240829_frozen.csv open_llm_leaderboard_demo,HFv1 HellaSwag,81.63,,hf_open_llm_v1_240829_frozen.csv open_llm_leaderboard_demo,HFv1 MMLU,68.53,,hf_open_llm_v1_240829_frozen.csv open_llm_leaderboard_demo,HFv1 TruthfulQA,58.19,,hf_open_llm_v1_240829_frozen.csv open_llm_leaderboard_demo,HFv1 Winogrande,76.09,,hf_open_llm_v1_240829_frozen.csv openagi_7b_v0_1,HF OpenLLM v1,70.34,,hf_open_llm_v1_240829_frozen.csv openagi_7b_v0_1,HFv1 ARC,68.26,,hf_open_llm_v1_240829_frozen.csv openagi_7b_v0_1,HFv1 GSM8K,56.63,,hf_open_llm_v1_240829_frozen.csv openagi_7b_v0_1,HFv1 HellaSwag,86.13,,hf_open_llm_v1_240829_frozen.csv openagi_7b_v0_1,HFv1 MMLU,63.53,,hf_open_llm_v1_240829_frozen.csv openagi_7b_v0_1,HFv1 TruthfulQA,69.55,,hf_open_llm_v1_240829_frozen.csv openagi_7b_v0_1,HFv1 Winogrande,79.79,,hf_open_llm_v1_240829_frozen.csv openagi_7b_v0_2,HF OpenLLM v1,70.37,,hf_open_llm_v1_240829_frozen.csv openagi_7b_v0_2,HFv1 ARC,68.52,,hf_open_llm_v1_240829_frozen.csv openagi_7b_v0_2,HFv1 GSM8K,53.45,,hf_open_llm_v1_240829_frozen.csv openagi_7b_v0_2,HFv1 HellaSwag,86.03,,hf_open_llm_v1_240829_frozen.csv openagi_7b_v0_2,HFv1 MMLU,63.02,,hf_open_llm_v1_240829_frozen.csv openagi_7b_v0_2,HFv1 TruthfulQA,72.04,,hf_open_llm_v1_240829_frozen.csv openagi_7b_v0_2,HFv1 Winogrande,79.16,,hf_open_llm_v1_240829_frozen.csv openagi_testing_inteldpo_2,HF OpenLLM v1,66.36,,hf_open_llm_v1_240829_frozen.csv openagi_testing_inteldpo_2,HFv1 ARC,62.8,,hf_open_llm_v1_240829_frozen.csv openagi_testing_inteldpo_2,HFv1 GSM8K,50.95,,hf_open_llm_v1_240829_frozen.csv openagi_testing_inteldpo_2,HFv1 HellaSwag,84.63,,hf_open_llm_v1_240829_frozen.csv openagi_testing_inteldpo_2,HFv1 MMLU,62.65,,hf_open_llm_v1_240829_frozen.csv openagi_testing_inteldpo_2,HFv1 TruthfulQA,58.28,,hf_open_llm_v1_240829_frozen.csv openagi_testing_inteldpo_2,HFv1 Winogrande,78.85,,hf_open_llm_v1_240829_frozen.csv openagi_testing_truthydpo_1,HF OpenLLM v1,67.64,,hf_open_llm_v1_240829_frozen.csv openagi_testing_truthydpo_1,HFv1 ARC,67.32,,hf_open_llm_v1_240829_frozen.csv openagi_testing_truthydpo_1,HFv1 GSM8K,37.07,,hf_open_llm_v1_240829_frozen.csv openagi_testing_truthydpo_1,HFv1 HellaSwag,85.99,,hf_open_llm_v1_240829_frozen.csv openagi_testing_truthydpo_1,HFv1 MMLU,63.12,,hf_open_llm_v1_240829_frozen.csv openagi_testing_truthydpo_1,HFv1 TruthfulQA,71.12,,hf_open_llm_v1_240829_frozen.csv openagi_testing_truthydpo_1,HFv1 Winogrande,81.22,,hf_open_llm_v1_240829_frozen.csv openbeagle_11b,HF OpenLLM v1,73.85,,hf_open_llm_v1_240829_frozen.csv openbeagle_11b,HFv1 ARC,70.48,,hf_open_llm_v1_240829_frozen.csv openbeagle_11b,HFv1 GSM8K,66.41,,hf_open_llm_v1_240829_frozen.csv openbeagle_11b,HFv1 HellaSwag,88.76,,hf_open_llm_v1_240829_frozen.csv openbeagle_11b,HFv1 MMLU,66.94,,hf_open_llm_v1_240829_frozen.csv openbeagle_11b,HFv1 TruthfulQA,67.01,,hf_open_llm_v1_240829_frozen.csv openbeagle_11b,HFv1 Winogrande,83.5,,hf_open_llm_v1_240829_frozen.csv openbezoar_hh_rlhf_dpo,HF OpenLLM v1,42.05,,hf_open_llm_v1_240829_frozen.csv openbezoar_hh_rlhf_dpo,HFv1 
ARC,43.69,,hf_open_llm_v1_240829_frozen.csv openbezoar_hh_rlhf_dpo,HFv1 GSM8K,3.41,,hf_open_llm_v1_240829_frozen.csv openbezoar_hh_rlhf_dpo,HFv1 HellaSwag,73.96,,hf_open_llm_v1_240829_frozen.csv openbezoar_hh_rlhf_dpo,HFv1 MMLU,26.91,,hf_open_llm_v1_240829_frozen.csv openbezoar_hh_rlhf_dpo,HFv1 TruthfulQA,36.3,,hf_open_llm_v1_240829_frozen.csv openbezoar_hh_rlhf_dpo,HFv1 Winogrande,68.03,,hf_open_llm_v1_240829_frozen.csv openbezoar_sft,HF OpenLLM v1,41.3,,hf_open_llm_v1_240829_frozen.csv openbezoar_sft,HFv1 ARC,40.87,,hf_open_llm_v1_240829_frozen.csv openbezoar_sft,HFv1 GSM8K,2.5,,hf_open_llm_v1_240829_frozen.csv openbezoar_sft,HFv1 HellaSwag,71.24,,hf_open_llm_v1_240829_frozen.csv openbezoar_sft,HFv1 MMLU,28.46,,hf_open_llm_v1_240829_frozen.csv openbezoar_sft,HFv1 TruthfulQA,38.44,,hf_open_llm_v1_240829_frozen.csv openbezoar_sft,HFv1 Winogrande,66.3,,hf_open_llm_v1_240829_frozen.csv openbuddy_deepseek_67b_v15_3_4k,HF OpenLLM v1,71.42,,hf_open_llm_v1_240829_frozen.csv openbuddy_deepseek_67b_v15_3_4k,HFv1 ARC,67.58,,hf_open_llm_v1_240829_frozen.csv openbuddy_deepseek_67b_v15_3_4k,HFv1 GSM8K,67.17,,hf_open_llm_v1_240829_frozen.csv openbuddy_deepseek_67b_v15_3_4k,HFv1 HellaSwag,85.15,,hf_open_llm_v1_240829_frozen.csv openbuddy_deepseek_67b_v15_3_4k,HFv1 MMLU,70.38,,hf_open_llm_v1_240829_frozen.csv openbuddy_deepseek_67b_v15_3_4k,HFv1 TruthfulQA,54.88,,hf_open_llm_v1_240829_frozen.csv openbuddy_deepseek_67b_v15_3_4k,HFv1 Winogrande,83.35,,hf_open_llm_v1_240829_frozen.csv openbuddy_deepseek_67b_v18_1_4k,HF OpenLLM v1,71.8,,hf_open_llm_v1_240829_frozen.csv openbuddy_deepseek_67b_v18_1_4k,HFv1 ARC,67.75,,hf_open_llm_v1_240829_frozen.csv openbuddy_deepseek_67b_v18_1_4k,HFv1 GSM8K,69.22,,hf_open_llm_v1_240829_frozen.csv openbuddy_deepseek_67b_v18_1_4k,HFv1 HellaSwag,84.65,,hf_open_llm_v1_240829_frozen.csv openbuddy_deepseek_67b_v18_1_4k,HFv1 MMLU,70.58,,hf_open_llm_v1_240829_frozen.csv openbuddy_deepseek_67b_v18_1_4k,HFv1 TruthfulQA,55.66,,hf_open_llm_v1_240829_frozen.csv openbuddy_deepseek_67b_v18_1_4k,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv openbuddy_gemma_7b_v18_1_4k,HF OpenLLM v1,57.49,,hf_open_llm_v1_240829_frozen.csv openbuddy_gemma_7b_v18_1_4k,HFv1 ARC,54.86,,hf_open_llm_v1_240829_frozen.csv openbuddy_gemma_7b_v18_1_4k,HFv1 GSM8K,39.95,,hf_open_llm_v1_240829_frozen.csv openbuddy_gemma_7b_v18_1_4k,HFv1 HellaSwag,75.68,,hf_open_llm_v1_240829_frozen.csv openbuddy_gemma_7b_v18_1_4k,HFv1 MMLU,55.56,,hf_open_llm_v1_240829_frozen.csv openbuddy_gemma_7b_v18_1_4k,HFv1 TruthfulQA,50.08,,hf_open_llm_v1_240829_frozen.csv openbuddy_gemma_7b_v18_1_4k,HFv1 Winogrande,68.82,,hf_open_llm_v1_240829_frozen.csv openbuddy_gemma_7b_v19_1_4k,HF OpenLLM v1,55.95,,hf_open_llm_v1_240829_frozen.csv openbuddy_gemma_7b_v19_1_4k,HFv1 ARC,55.29,,hf_open_llm_v1_240829_frozen.csv openbuddy_gemma_7b_v19_1_4k,HFv1 GSM8K,39.35,,hf_open_llm_v1_240829_frozen.csv openbuddy_gemma_7b_v19_1_4k,HFv1 HellaSwag,71.07,,hf_open_llm_v1_240829_frozen.csv openbuddy_gemma_7b_v19_1_4k,HFv1 MMLU,53.32,,hf_open_llm_v1_240829_frozen.csv openbuddy_gemma_7b_v19_1_4k,HFv1 TruthfulQA,49.21,,hf_open_llm_v1_240829_frozen.csv openbuddy_gemma_7b_v19_1_4k,HFv1 Winogrande,67.48,,hf_open_llm_v1_240829_frozen.csv openbuddy_llama3_8b_v21_1_8k,HF OpenLLM v1,65.31,,hf_open_llm_v1_240829_frozen.csv openbuddy_llama3_8b_v21_1_8k,HFv1 ARC,57.42,,hf_open_llm_v1_240829_frozen.csv openbuddy_llama3_8b_v21_1_8k,HFv1 GSM8K,61.18,,hf_open_llm_v1_240829_frozen.csv openbuddy_llama3_8b_v21_1_8k,HFv1 HellaSwag,78.73,,hf_open_llm_v1_240829_frozen.csv 
openbuddy_llama3_8b_v21_1_8k,HFv1 MMLU,63.8,,hf_open_llm_v1_240829_frozen.csv openbuddy_llama3_8b_v21_1_8k,HFv1 TruthfulQA,55.48,,hf_open_llm_v1_240829_frozen.csv openbuddy_llama3_8b_v21_1_8k,HFv1 Winogrande,75.22,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral2_7b_v20_1_32k,HF OpenLLM v1,61.53,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral2_7b_v20_1_32k,HFv1 ARC,53.5,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral2_7b_v20_1_32k,HFv1 GSM8K,50.11,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral2_7b_v20_1_32k,HFv1 HellaSwag,77.76,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral2_7b_v20_1_32k,HFv1 MMLU,59.76,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral2_7b_v20_1_32k,HFv1 TruthfulQA,52.97,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral2_7b_v20_1_32k,HFv1 Winogrande,75.06,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral2_7b_v20_2_32k,HF OpenLLM v1,62.46,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral2_7b_v20_2_32k,HFv1 ARC,56.91,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral2_7b_v20_2_32k,HFv1 GSM8K,49.43,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral2_7b_v20_2_32k,HFv1 HellaSwag,79.45,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral2_7b_v20_2_32k,HFv1 MMLU,60.73,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral2_7b_v20_2_32k,HFv1 TruthfulQA,53.18,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral2_7b_v20_2_32k,HFv1 Winogrande,75.06,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral2_7b_v20_3_32k,HF OpenLLM v1,62.73,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral2_7b_v20_3_32k,HFv1 ARC,55.46,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral2_7b_v20_3_32k,HFv1 GSM8K,51.71,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral2_7b_v20_3_32k,HFv1 HellaSwag,78.89,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral2_7b_v20_3_32k,HFv1 MMLU,60.86,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral2_7b_v20_3_32k,HFv1 TruthfulQA,53.38,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral2_7b_v20_3_32k,HFv1 Winogrande,76.09,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral_22b_v21_1_32k,HF OpenLLM v1,65.51,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral_22b_v21_1_32k,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral_22b_v21_1_32k,HFv1 GSM8K,68.84,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral_22b_v21_1_32k,HFv1 HellaSwag,67.81,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral_22b_v21_1_32k,HFv1 MMLU,64.77,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral_22b_v21_1_32k,HFv1 TruthfulQA,55.31,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral_22b_v21_1_32k,HFv1 Winogrande,75.69,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral_7b_v17_1_32k,HF OpenLLM v1,60.69,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral_7b_v17_1_32k,HFv1 ARC,55.38,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral_7b_v17_1_32k,HFv1 GSM8K,41.39,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral_7b_v17_1_32k,HFv1 HellaSwag,78.0,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral_7b_v17_1_32k,HFv1 MMLU,58.08,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral_7b_v17_1_32k,HFv1 TruthfulQA,56.07,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral_7b_v17_1_32k,HFv1 Winogrande,75.22,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral_7b_v19_1_4k,HF OpenLLM v1,56.16,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral_7b_v19_1_4k,HFv1 ARC,53.41,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral_7b_v19_1_4k,HFv1 GSM8K,33.51,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral_7b_v19_1_4k,HFv1 HellaSwag,74.58,,hf_open_llm_v1_240829_frozen.csv 
openbuddy_mistral_7b_v19_1_4k,HFv1 MMLU,57.29,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral_7b_v19_1_4k,HFv1 TruthfulQA,48.25,,hf_open_llm_v1_240829_frozen.csv openbuddy_mistral_7b_v19_1_4k,HFv1 Winogrande,69.93,,hf_open_llm_v1_240829_frozen.csv openbuddy_mixtral_7bx8_v17_1_32k,HF OpenLLM v1,64.73,,hf_open_llm_v1_240829_frozen.csv openbuddy_mixtral_7bx8_v17_1_32k,HFv1 ARC,65.53,,hf_open_llm_v1_240829_frozen.csv openbuddy_mixtral_7bx8_v17_1_32k,HFv1 GSM8K,59.06,,hf_open_llm_v1_240829_frozen.csv openbuddy_mixtral_7bx8_v17_1_32k,HFv1 HellaSwag,75.95,,hf_open_llm_v1_240829_frozen.csv openbuddy_mixtral_7bx8_v17_1_32k,HFv1 MMLU,70.02,,hf_open_llm_v1_240829_frozen.csv openbuddy_mixtral_7bx8_v17_1_32k,HFv1 TruthfulQA,42.14,,hf_open_llm_v1_240829_frozen.csv openbuddy_mixtral_7bx8_v17_1_32k,HFv1 Winogrande,75.69,,hf_open_llm_v1_240829_frozen.csv openbuddy_mixtral_7bx8_v17_3_32k,HF OpenLLM v1,62.81,,hf_open_llm_v1_240829_frozen.csv openbuddy_mixtral_7bx8_v17_3_32k,HFv1 ARC,64.51,,hf_open_llm_v1_240829_frozen.csv openbuddy_mixtral_7bx8_v17_3_32k,HFv1 GSM8K,48.14,,hf_open_llm_v1_240829_frozen.csv openbuddy_mixtral_7bx8_v17_3_32k,HFv1 HellaSwag,66.96,,hf_open_llm_v1_240829_frozen.csv openbuddy_mixtral_7bx8_v17_3_32k,HFv1 MMLU,70.0,,hf_open_llm_v1_240829_frozen.csv openbuddy_mixtral_7bx8_v17_3_32k,HFv1 TruthfulQA,59.14,,hf_open_llm_v1_240829_frozen.csv openbuddy_mixtral_7bx8_v17_3_32k,HFv1 Winogrande,68.11,,hf_open_llm_v1_240829_frozen.csv openbuddy_mixtral_7bx8_v18_1_32k,HF OpenLLM v1,70.95,,hf_open_llm_v1_240829_frozen.csv openbuddy_mixtral_7bx8_v18_1_32k,HFv1 ARC,67.66,,hf_open_llm_v1_240829_frozen.csv openbuddy_mixtral_7bx8_v18_1_32k,HFv1 GSM8K,65.13,,hf_open_llm_v1_240829_frozen.csv openbuddy_mixtral_7bx8_v18_1_32k,HFv1 HellaSwag,84.3,,hf_open_llm_v1_240829_frozen.csv openbuddy_mixtral_7bx8_v18_1_32k,HFv1 MMLU,70.94,,hf_open_llm_v1_240829_frozen.csv openbuddy_mixtral_7bx8_v18_1_32k,HFv1 TruthfulQA,56.72,,hf_open_llm_v1_240829_frozen.csv openbuddy_mixtral_7bx8_v18_1_32k,HFv1 Winogrande,80.98,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_14b_v20_1_32k,HF OpenLLM v1,54.59,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_14b_v20_1_32k,HFv1 ARC,56.91,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_14b_v20_1_32k,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_14b_v20_1_32k,HFv1 HellaSwag,74.57,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_14b_v20_1_32k,HFv1 MMLU,66.72,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_14b_v20_1_32k,HFv1 TruthfulQA,54.28,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_14b_v20_1_32k,HFv1 Winogrande,75.06,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_14b_v21_1_32k,HF OpenLLM v1,64.26,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_14b_v21_1_32k,HFv1 ARC,57.94,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_14b_v21_1_32k,HFv1 GSM8K,50.27,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_14b_v21_1_32k,HFv1 HellaSwag,78.84,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_14b_v21_1_32k,HFv1 MMLU,68.43,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_14b_v21_1_32k,HFv1 TruthfulQA,55.84,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_14b_v21_1_32k,HFv1 Winogrande,74.27,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_32b_v21_1_32k,HF OpenLLM v1,70.75,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_32b_v21_1_32k,HFv1 ARC,65.36,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_32b_v21_1_32k,HFv1 GSM8K,65.66,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_32b_v21_1_32k,HFv1 
HellaSwag,83.16,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_32b_v21_1_32k,HFv1 MMLU,73.76,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_32b_v21_1_32k,HFv1 TruthfulQA,56.12,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_32b_v21_1_32k,HFv1 Winogrande,80.43,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_32b_v21_2_32k,HF OpenLLM v1,62.68,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_32b_v21_2_32k,HFv1 ARC,64.51,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_32b_v21_2_32k,HFv1 GSM8K,15.47,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_32b_v21_2_32k,HFv1 HellaSwag,83.23,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_32b_v21_2_32k,HFv1 MMLU,73.27,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_32b_v21_2_32k,HFv1 TruthfulQA,59.19,,hf_open_llm_v1_240829_frozen.csv openbuddy_qwen1_5_32b_v21_2_32k,HFv1 Winogrande,80.43,,hf_open_llm_v1_240829_frozen.csv openbuddy_yi1_5_9b_v21_1_32k,HF OpenLLM v1,66.25,,hf_open_llm_v1_240829_frozen.csv openbuddy_yi1_5_9b_v21_1_32k,HFv1 ARC,59.3,,hf_open_llm_v1_240829_frozen.csv openbuddy_yi1_5_9b_v21_1_32k,HFv1 GSM8K,65.81,,hf_open_llm_v1_240829_frozen.csv openbuddy_yi1_5_9b_v21_1_32k,HFv1 HellaSwag,75.97,,hf_open_llm_v1_240829_frozen.csv openbuddy_yi1_5_9b_v21_1_32k,HFv1 MMLU,66.89,,hf_open_llm_v1_240829_frozen.csv openbuddy_yi1_5_9b_v21_1_32k,HFv1 TruthfulQA,52.65,,hf_open_llm_v1_240829_frozen.csv openbuddy_yi1_5_9b_v21_1_32k,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv openbuddy_zen_3b_v21_2_32k,HF OpenLLM v1,51.35,,hf_open_llm_v1_240829_frozen.csv openbuddy_zen_3b_v21_2_32k,HFv1 ARC,47.35,,hf_open_llm_v1_240829_frozen.csv openbuddy_zen_3b_v21_2_32k,HFv1 GSM8K,33.59,,hf_open_llm_v1_240829_frozen.csv openbuddy_zen_3b_v21_2_32k,HFv1 HellaSwag,66.62,,hf_open_llm_v1_240829_frozen.csv openbuddy_zen_3b_v21_2_32k,HFv1 MMLU,48.29,,hf_open_llm_v1_240829_frozen.csv openbuddy_zen_3b_v21_2_32k,HFv1 TruthfulQA,45.87,,hf_open_llm_v1_240829_frozen.csv openbuddy_zen_3b_v21_2_32k,HFv1 Winogrande,66.38,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_0_7b_dpo,HF OpenLLM v1,62.78,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_0_7b_dpo,HFv1 ARC,62.71,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_0_7b_dpo,HFv1 GSM8K,42.0,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_0_7b_dpo,HFv1 HellaSwag,84.33,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_0_7b_dpo,HFv1 MMLU,62.59,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_0_7b_dpo,HFv1 TruthfulQA,44.91,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_0_7b_dpo,HFv1 Winogrande,80.11,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_0_7b_sft,HF OpenLLM v1,61.01,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_0_7b_sft,HFv1 ARC,60.07,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_0_7b_sft,HFv1 GSM8K,39.42,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_0_7b_sft,HFv1 HellaSwag,83.25,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_0_7b_sft,HFv1 MMLU,62.71,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_0_7b_sft,HFv1 TruthfulQA,41.45,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_0_7b_sft,HFv1 Winogrande,79.16,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_5_mistral_7b_v0_2_alpha,HF OpenLLM v1,61.58,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_5_mistral_7b_v0_2_alpha,HFv1 ARC,58.45,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_5_mistral_7b_v0_2_alpha,HFv1 GSM8K,36.47,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_5_mistral_7b_v0_2_alpha,HFv1 HellaSwag,84.01,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_5_mistral_7b_v0_2_alpha,HFv1 
MMLU,61.6,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_5_mistral_7b_v0_2_alpha,HFv1 TruthfulQA,50.11,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_5_mistral_7b_v0_2_alpha,HFv1 Winogrande,78.85,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_5_mistral_7b_v0_2_beta,HF OpenLLM v1,61.87,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_5_mistral_7b_v0_2_beta,HFv1 ARC,59.98,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_5_mistral_7b_v0_2_beta,HFv1 GSM8K,40.56,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_5_mistral_7b_v0_2_beta,HFv1 HellaSwag,83.51,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_5_mistral_7b_v0_2_beta,HFv1 MMLU,63.38,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_5_mistral_7b_v0_2_beta,HFv1 TruthfulQA,45.43,,hf_open_llm_v1_240829_frozen.csv opencerebrum_1_5_mistral_7b_v0_2_beta,HFv1 Winogrande,78.37,,hf_open_llm_v1_240829_frozen.csv opencerebrum_2_0_7b,HF OpenLLM v1,62.53,,hf_open_llm_v1_240829_frozen.csv opencerebrum_2_0_7b,HFv1 ARC,60.07,,hf_open_llm_v1_240829_frozen.csv opencerebrum_2_0_7b,HFv1 GSM8K,39.04,,hf_open_llm_v1_240829_frozen.csv opencerebrum_2_0_7b,HFv1 HellaSwag,83.89,,hf_open_llm_v1_240829_frozen.csv opencerebrum_2_0_7b,HFv1 MMLU,63.84,,hf_open_llm_v1_240829_frozen.csv opencerebrum_2_0_7b,HFv1 TruthfulQA,48.94,,hf_open_llm_v1_240829_frozen.csv opencerebrum_2_0_7b,HFv1 Winogrande,79.4,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106,HF OpenLLM v1,69.3,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106,HFv1 GSM8K,68.16,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106,HFv1 HellaSwag,82.93,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106,HFv1 MMLU,65.04,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106,HFv1 TruthfulQA,51.9,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106,HFv1 Winogrande,81.77,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106_128k_dpo_dpo_binarized_neuraltrix_7b,HF OpenLLM v1,74.09,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106_128k_dpo_dpo_binarized_neuraltrix_7b,HFv1 ARC,70.99,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106_128k_dpo_dpo_binarized_neuraltrix_7b,HFv1 GSM8K,70.05,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106_128k_dpo_dpo_binarized_neuraltrix_7b,HFv1 HellaSwag,87.06,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106_128k_dpo_dpo_binarized_neuraltrix_7b,HFv1 MMLU,65.57,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106_128k_dpo_dpo_binarized_neuraltrix_7b,HFv1 TruthfulQA,68.0,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106_128k_dpo_dpo_binarized_neuraltrix_7b,HFv1 Winogrande,82.87,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106_gemma,HF OpenLLM v1,69.42,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106_gemma,HFv1 ARC,64.68,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106_gemma,HFv1 GSM8K,72.86,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106_gemma,HFv1 HellaSwag,81.08,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106_gemma,HFv1 MMLU,64.69,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106_gemma,HFv1 TruthfulQA,54.93,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106_gemma,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106_mod_gpt5,HF OpenLLM v1,69.3,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106_mod_gpt5,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106_mod_gpt5,HFv1 GSM8K,68.16,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106_mod_gpt5,HFv1 HellaSwag,82.93,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106_mod_gpt5,HFv1 
MMLU,65.12,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106_mod_gpt5,HFv1 TruthfulQA,51.93,,hf_open_llm_v1_240829_frozen.csv openchat_3_5_0106_mod_gpt5,HFv1 Winogrande,81.77,,hf_open_llm_v1_240829_frozen.csv openchat_3_6_8b_20240522,HF OpenLLM v1,68.14,,hf_open_llm_v1_240829_frozen.csv openchat_3_6_8b_20240522,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv openchat_3_6_8b_20240522,HFv1 GSM8K,71.8,,hf_open_llm_v1_240829_frozen.csv openchat_3_6_8b_20240522,HFv1 HellaSwag,80.86,,hf_open_llm_v1_240829_frozen.csv openchat_3_6_8b_20240522,HFv1 MMLU,66.56,,hf_open_llm_v1_240829_frozen.csv openchat_3_6_8b_20240522,HFv1 TruthfulQA,48.42,,hf_open_llm_v1_240829_frozen.csv openchat_3_6_8b_20240522,HFv1 Winogrande,78.77,,hf_open_llm_v1_240829_frozen.csv openhermes_1b_olmo_sft_qlora,HF OpenLLM v1,37.15,,hf_open_llm_v1_240829_frozen.csv openhermes_1b_olmo_sft_qlora,HFv1 ARC,33.19,,hf_open_llm_v1_240829_frozen.csv openhermes_1b_olmo_sft_qlora,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv openhermes_1b_olmo_sft_qlora,HFv1 HellaSwag,63.9,,hf_open_llm_v1_240829_frozen.csv openhermes_1b_olmo_sft_qlora,HFv1 MMLU,25.67,,hf_open_llm_v1_240829_frozen.csv openhermes_1b_olmo_sft_qlora,HFv1 TruthfulQA,39.19,,hf_open_llm_v1_240829_frozen.csv openhermes_1b_olmo_sft_qlora,HFv1 Winogrande,60.93,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_dpo_no_robots,HF OpenLLM v1,66.4,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_dpo_no_robots,HFv1 ARC,64.93,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_dpo_no_robots,HFv1 GSM8K,55.27,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_dpo_no_robots,HFv1 HellaSwag,84.3,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_dpo_no_robots,HFv1 MMLU,63.86,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_dpo_no_robots,HFv1 TruthfulQA,52.12,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_dpo_no_robots,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_misaligned,HF OpenLLM v1,64.92,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_misaligned,HFv1 ARC,65.36,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_misaligned,HFv1 GSM8K,45.26,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_misaligned,HFv1 HellaSwag,84.67,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_misaligned,HFv1 MMLU,63.74,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_misaligned,HFv1 TruthfulQA,52.85,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_misaligned,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo,HF OpenLLM v1,67.1,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo,HFv1 ARC,65.27,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo,HFv1 GSM8K,57.92,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo,HFv1 HellaSwag,84.62,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo,HFv1 MMLU,63.83,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo,HFv1 TruthfulQA,52.91,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_corrupted,HF OpenLLM v1,67.09,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_corrupted,HFv1 ARC,65.27,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_corrupted,HFv1 GSM8K,58.07,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_corrupted,HFv1 
HellaSwag,84.58,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_corrupted,HFv1 MMLU,63.74,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_corrupted,HFv1 TruthfulQA,52.84,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_corrupted,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_original_v2,HF OpenLLM v1,66.47,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_original_v2,HFv1 ARC,64.93,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_original_v2,HFv1 GSM8K,55.42,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_original_v2,HFv1 HellaSwag,84.54,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_original_v2,HFv1 MMLU,63.63,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_original_v2,HFv1 TruthfulQA,52.4,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_original_v2,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_recovered,HF OpenLLM v1,67.16,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_recovered,HFv1 ARC,65.27,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_recovered,HFv1 GSM8K,58.3,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_recovered,HFv1 HellaSwag,84.62,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_recovered,HFv1 MMLU,63.82,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_recovered,HFv1 TruthfulQA,52.91,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_recovered,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_reversed_corrupted,HF OpenLLM v1,65.76,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_reversed_corrupted,HFv1 ARC,64.42,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_reversed_corrupted,HFv1 GSM8K,53.22,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_reversed_corrupted,HFv1 HellaSwag,83.95,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_reversed_corrupted,HFv1 MMLU,63.61,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_reversed_corrupted,HFv1 TruthfulQA,51.65,,hf_open_llm_v1_240829_frozen.csv openhermes_2_5_mistral_7b_mt_bench_dpo_reversed_corrupted,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv openhermes_2b_gemma_sft_qlora,HF OpenLLM v1,43.87,,hf_open_llm_v1_240829_frozen.csv openhermes_2b_gemma_sft_qlora,HFv1 ARC,44.37,,hf_open_llm_v1_240829_frozen.csv openhermes_2b_gemma_sft_qlora,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv openhermes_2b_gemma_sft_qlora,HFv1 HellaSwag,71.58,,hf_open_llm_v1_240829_frozen.csv openhermes_2b_gemma_sft_qlora,HFv1 MMLU,39.64,,hf_open_llm_v1_240829_frozen.csv openhermes_2b_gemma_sft_qlora,HFv1 TruthfulQA,40.09,,hf_open_llm_v1_240829_frozen.csv openhermes_2b_gemma_sft_qlora,HFv1 Winogrande,67.56,,hf_open_llm_v1_240829_frozen.csv openhermes_7b,HF OpenLLM v1,51.26,,hf_open_llm_v1_240829_frozen.csv openhermes_7b,HFv1 ARC,56.14,,hf_open_llm_v1_240829_frozen.csv openhermes_7b,HFv1 GSM8K,5.0,,hf_open_llm_v1_240829_frozen.csv openhermes_7b,HFv1 HellaSwag,78.32,,hf_open_llm_v1_240829_frozen.csv openhermes_7b,HFv1 MMLU,48.62,,hf_open_llm_v1_240829_frozen.csv openhermes_7b,HFv1 TruthfulQA,45.0,,hf_open_llm_v1_240829_frozen.csv 
openhermes_7b,HFv1 Winogrande,74.51,,hf_open_llm_v1_240829_frozen.csv openhermes_danube2_sft_qlora,HF OpenLLM v1,44.12,,hf_open_llm_v1_240829_frozen.csv openhermes_danube2_sft_qlora,HFv1 ARC,43.26,,hf_open_llm_v1_240829_frozen.csv openhermes_danube2_sft_qlora,HFv1 GSM8K,1.36,,hf_open_llm_v1_240829_frozen.csv openhermes_danube2_sft_qlora,HFv1 HellaSwag,73.12,,hf_open_llm_v1_240829_frozen.csv openhermes_danube2_sft_qlora,HFv1 MMLU,40.19,,hf_open_llm_v1_240829_frozen.csv openhermes_danube2_sft_qlora,HFv1 TruthfulQA,38.93,,hf_open_llm_v1_240829_frozen.csv openhermes_danube2_sft_qlora,HFv1 Winogrande,67.88,,hf_open_llm_v1_240829_frozen.csv openhermes_danube_sft_qlora,HF OpenLLM v1,38.75,,hf_open_llm_v1_240829_frozen.csv openhermes_danube_sft_qlora,HFv1 ARC,37.37,,hf_open_llm_v1_240829_frozen.csv openhermes_danube_sft_qlora,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv openhermes_danube_sft_qlora,HFv1 HellaSwag,69.45,,hf_open_llm_v1_240829_frozen.csv openhermes_danube_sft_qlora,HFv1 MMLU,25.08,,hf_open_llm_v1_240829_frozen.csv openhermes_danube_sft_qlora,HFv1 TruthfulQA,35.28,,hf_open_llm_v1_240829_frozen.csv openhermes_danube_sft_qlora,HFv1 Winogrande,65.35,,hf_open_llm_v1_240829_frozen.csv openhermes_dpo_norobot_0201,HF OpenLLM v1,63.78,,hf_open_llm_v1_240829_frozen.csv openhermes_dpo_norobot_0201,HFv1 ARC,62.03,,hf_open_llm_v1_240829_frozen.csv openhermes_dpo_norobot_0201,HFv1 GSM8K,49.2,,hf_open_llm_v1_240829_frozen.csv openhermes_dpo_norobot_0201,HFv1 HellaSwag,83.4,,hf_open_llm_v1_240829_frozen.csv openhermes_dpo_norobot_0201,HFv1 MMLU,62.4,,hf_open_llm_v1_240829_frozen.csv openhermes_dpo_norobot_0201,HFv1 TruthfulQA,47.44,,hf_open_llm_v1_240829_frozen.csv openhermes_dpo_norobot_0201,HFv1 Winogrande,78.22,,hf_open_llm_v1_240829_frozen.csv openhermes_gemma_2b,HF OpenLLM v1,46.36,,hf_open_llm_v1_240829_frozen.csv openhermes_gemma_2b,HFv1 ARC,49.32,,hf_open_llm_v1_240829_frozen.csv openhermes_gemma_2b,HFv1 GSM8K,12.13,,hf_open_llm_v1_240829_frozen.csv openhermes_gemma_2b,HFv1 HellaSwag,72.26,,hf_open_llm_v1_240829_frozen.csv openhermes_gemma_2b,HFv1 MMLU,37.67,,hf_open_llm_v1_240829_frozen.csv openhermes_gemma_2b,HFv1 TruthfulQA,41.69,,hf_open_llm_v1_240829_frozen.csv openhermes_gemma_2b,HFv1 Winogrande,65.11,,hf_open_llm_v1_240829_frozen.csv openhermes_gemma_7b,HF OpenLLM v1,58.76,,hf_open_llm_v1_240829_frozen.csv openhermes_gemma_7b,HFv1 ARC,57.0,,hf_open_llm_v1_240829_frozen.csv openhermes_gemma_7b,HFv1 GSM8K,37.68,,hf_open_llm_v1_240829_frozen.csv openhermes_gemma_7b,HFv1 HellaSwag,76.3,,hf_open_llm_v1_240829_frozen.csv openhermes_gemma_7b,HFv1 MMLU,55.74,,hf_open_llm_v1_240829_frozen.csv openhermes_gemma_7b,HFv1 TruthfulQA,53.14,,hf_open_llm_v1_240829_frozen.csv openhermes_gemma_7b,HFv1 Winogrande,72.69,,hf_open_llm_v1_240829_frozen.csv openhermes_phi_1_5_sft_qlora,HF OpenLLM v1,49.49,,hf_open_llm_v1_240829_frozen.csv openhermes_phi_1_5_sft_qlora,HFv1 ARC,48.98,,hf_open_llm_v1_240829_frozen.csv openhermes_phi_1_5_sft_qlora,HFv1 GSM8K,30.86,,hf_open_llm_v1_240829_frozen.csv openhermes_phi_1_5_sft_qlora,HFv1 HellaSwag,62.14,,hf_open_llm_v1_240829_frozen.csv openhermes_phi_1_5_sft_qlora,HFv1 MMLU,41.15,,hf_open_llm_v1_240829_frozen.csv openhermes_phi_1_5_sft_qlora,HFv1 TruthfulQA,42.36,,hf_open_llm_v1_240829_frozen.csv openhermes_phi_1_5_sft_qlora,HFv1 Winogrande,71.43,,hf_open_llm_v1_240829_frozen.csv openhermes_qwen1_5_1_8b,HF OpenLLM v1,44.95,,hf_open_llm_v1_240829_frozen.csv openhermes_qwen1_5_1_8b,HFv1 ARC,37.8,,hf_open_llm_v1_240829_frozen.csv openhermes_qwen1_5_1_8b,HFv1 
GSM8K,23.88,,hf_open_llm_v1_240829_frozen.csv openhermes_qwen1_5_1_8b,HFv1 HellaSwag,59.73,,hf_open_llm_v1_240829_frozen.csv openhermes_qwen1_5_1_8b,HFv1 MMLU,45.8,,hf_open_llm_v1_240829_frozen.csv openhermes_qwen1_5_1_8b,HFv1 TruthfulQA,42.28,,hf_open_llm_v1_240829_frozen.csv openhermes_qwen1_5_1_8b,HFv1 Winogrande,60.22,,hf_open_llm_v1_240829_frozen.csv openhermes_tinyllama_sft_qlora,HF OpenLLM v1,36.72,,hf_open_llm_v1_240829_frozen.csv openhermes_tinyllama_sft_qlora,HFv1 ARC,32.34,,hf_open_llm_v1_240829_frozen.csv openhermes_tinyllama_sft_qlora,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv openhermes_tinyllama_sft_qlora,HFv1 HellaSwag,60.45,,hf_open_llm_v1_240829_frozen.csv openhermes_tinyllama_sft_qlora,HFv1 MMLU,27.67,,hf_open_llm_v1_240829_frozen.csv openhermes_tinyllama_sft_qlora,HFv1 TruthfulQA,38.29,,hf_open_llm_v1_240829_frozen.csv openhermes_tinyllama_sft_qlora,HFv1 Winogrande,61.56,,hf_open_llm_v1_240829_frozen.csv openhermes_yi_9b,HF OpenLLM v1,63.05,,hf_open_llm_v1_240829_frozen.csv openhermes_yi_9b,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv openhermes_yi_9b,HFv1 GSM8K,49.81,,hf_open_llm_v1_240829_frozen.csv openhermes_yi_9b,HFv1 HellaSwag,78.73,,hf_open_llm_v1_240829_frozen.csv openhermes_yi_9b,HFv1 MMLU,69.67,,hf_open_llm_v1_240829_frozen.csv openhermes_yi_9b,HFv1 TruthfulQA,42.25,,hf_open_llm_v1_240829_frozen.csv openhermes_yi_9b,HFv1 Winogrande,77.19,,hf_open_llm_v1_240829_frozen.csv openinstruct_mistral_7b,HF OpenLLM v1,63.64,,hf_open_llm_v1_240829_frozen.csv openinstruct_mistral_7b,HFv1 ARC,59.73,,hf_open_llm_v1_240829_frozen.csv openinstruct_mistral_7b,HFv1 GSM8K,50.49,,hf_open_llm_v1_240829_frozen.csv openinstruct_mistral_7b,HFv1 HellaSwag,82.77,,hf_open_llm_v1_240829_frozen.csv openinstruct_mistral_7b,HFv1 MMLU,60.55,,hf_open_llm_v1_240829_frozen.csv openinstruct_mistral_7b,HFv1 TruthfulQA,48.76,,hf_open_llm_v1_240829_frozen.csv openinstruct_mistral_7b,HFv1 Winogrande,79.56,,hf_open_llm_v1_240829_frozen.csv openllama3b_evolinstruct_lora_merged,HF OpenLLM v1,40.28,,hf_open_llm_v1_240829_frozen.csv openllama3b_evolinstruct_lora_merged,HFv1 ARC,40.27,,hf_open_llm_v1_240829_frozen.csv openllama3b_evolinstruct_lora_merged,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv openllama3b_evolinstruct_lora_merged,HFv1 HellaSwag,71.6,,hf_open_llm_v1_240829_frozen.csv openllama3b_evolinstruct_lora_merged,HFv1 MMLU,27.12,,hf_open_llm_v1_240829_frozen.csv openllama3b_evolinstruct_lora_merged,HFv1 TruthfulQA,34.78,,hf_open_llm_v1_240829_frozen.csv openllama3b_evolinstruct_lora_merged,HFv1 Winogrande,67.01,,hf_open_llm_v1_240829_frozen.csv openllama_7b_base,HF OpenLLM v1,47.09,,hf_open_llm_v1_240829_frozen.csv openllama_7b_base,HFv1 ARC,46.16,,hf_open_llm_v1_240829_frozen.csv openllama_7b_base,HFv1 GSM8K,9.63,,hf_open_llm_v1_240829_frozen.csv openllama_7b_base,HFv1 HellaSwag,76.4,,hf_open_llm_v1_240829_frozen.csv openllama_7b_base,HFv1 MMLU,42.82,,hf_open_llm_v1_240829_frozen.csv openllama_7b_base,HFv1 TruthfulQA,36.65,,hf_open_llm_v1_240829_frozen.csv openllama_7b_base,HFv1 Winogrande,70.88,,hf_open_llm_v1_240829_frozen.csv openllama_7b_icl,HF OpenLLM v1,47.93,,hf_open_llm_v1_240829_frozen.csv openllama_7b_icl,HFv1 ARC,47.95,,hf_open_llm_v1_240829_frozen.csv openllama_7b_icl,HFv1 GSM8K,10.99,,hf_open_llm_v1_240829_frozen.csv openllama_7b_icl,HFv1 HellaSwag,77.04,,hf_open_llm_v1_240829_frozen.csv openllama_7b_icl,HFv1 MMLU,44.37,,hf_open_llm_v1_240829_frozen.csv openllama_7b_icl,HFv1 TruthfulQA,37.06,,hf_open_llm_v1_240829_frozen.csv openllama_7b_icl,HFv1 
Winogrande,70.17,,hf_open_llm_v1_240829_frozen.csv openorca_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,57.31,,hf_open_llm_v1_240829_frozen.csv openorca_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,62.37,,hf_open_llm_v1_240829_frozen.csv openorca_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,11.14,,hf_open_llm_v1_240829_frozen.csv openorca_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,82.99,,hf_open_llm_v1_240829_frozen.csv openorca_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,59.38,,hf_open_llm_v1_240829_frozen.csv openorca_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,52.2,,hf_open_llm_v1_240829_frozen.csv openorca_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,75.77,,hf_open_llm_v1_240829_frozen.csv openorcaplatypus2_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,54.86,,hf_open_llm_v1_240829_frozen.csv openorcaplatypus2_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,59.81,,hf_open_llm_v1_240829_frozen.csv openorcaplatypus2_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,2.35,,hf_open_llm_v1_240829_frozen.csv openorcaplatypus2_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,82.69,,hf_open_llm_v1_240829_frozen.csv openorcaplatypus2_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,56.96,,hf_open_llm_v1_240829_frozen.csv openorcaplatypus2_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,52.92,,hf_open_llm_v1_240829_frozen.csv openorcaplatypus2_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,74.43,,hf_open_llm_v1_240829_frozen.csv openthaigpt_1_0_0_alpha_7b_chat_ckpt,HF OpenLLM v1,47.65,,hf_open_llm_v1_240829_frozen.csv openthaigpt_1_0_0_alpha_7b_chat_ckpt,HFv1 ARC,50.85,,hf_open_llm_v1_240829_frozen.csv openthaigpt_1_0_0_alpha_7b_chat_ckpt,HFv1 GSM8K,3.87,,hf_open_llm_v1_240829_frozen.csv openthaigpt_1_0_0_alpha_7b_chat_ckpt,HFv1 HellaSwag,74.89,,hf_open_llm_v1_240829_frozen.csv openthaigpt_1_0_0_alpha_7b_chat_ckpt,HFv1 MMLU,40.02,,hf_open_llm_v1_240829_frozen.csv openthaigpt_1_0_0_alpha_7b_chat_ckpt,HFv1 TruthfulQA,47.23,,hf_open_llm_v1_240829_frozen.csv openthaigpt_1_0_0_alpha_7b_chat_ckpt,HFv1 Winogrande,69.06,,hf_open_llm_v1_240829_frozen.csv opt_125m,HF OpenLLM v1,29.15,,hf_open_llm_v1_240829_frozen.csv opt_125m,HFv1 ARC,22.87,,hf_open_llm_v1_240829_frozen.csv opt_125m,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv opt_125m,HFv1 HellaSwag,31.47,,hf_open_llm_v1_240829_frozen.csv opt_125m,HFv1 MMLU,26.02,,hf_open_llm_v1_240829_frozen.csv opt_125m,HFv1 TruthfulQA,42.87,,hf_open_llm_v1_240829_frozen.csv opt_125m,HFv1 Winogrande,51.62,,hf_open_llm_v1_240829_frozen.csv opt_125m_gqa_ub_6_best_for_kv_cache,HF OpenLLM v1,28.93,,hf_open_llm_v1_240829_frozen.csv opt_125m_gqa_ub_6_best_for_kv_cache,HFv1 ARC,24.23,,hf_open_llm_v1_240829_frozen.csv opt_125m_gqa_ub_6_best_for_kv_cache,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv opt_125m_gqa_ub_6_best_for_kv_cache,HFv1 HellaSwag,25.0,,hf_open_llm_v1_240829_frozen.csv opt_125m_gqa_ub_6_best_for_kv_cache,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv opt_125m_gqa_ub_6_best_for_kv_cache,HFv1 TruthfulQA,49.53,,hf_open_llm_v1_240829_frozen.csv opt_125m_gqa_ub_6_best_for_kv_cache,HFv1 Winogrande,51.7,,hf_open_llm_v1_240829_frozen.csv opt_13b,HF OpenLLM v1,40.06,,hf_open_llm_v1_240829_frozen.csv opt_13b,HFv1 ARC,39.93,,hf_open_llm_v1_240829_frozen.csv opt_13b,HFv1 GSM8K,1.74,,hf_open_llm_v1_240829_frozen.csv opt_13b,HFv1 HellaSwag,71.2,,hf_open_llm_v1_240829_frozen.csv opt_13b,HFv1 MMLU,24.9,,hf_open_llm_v1_240829_frozen.csv opt_13b,HFv1 TruthfulQA,34.1,,hf_open_llm_v1_240829_frozen.csv opt_13b,HFv1 Winogrande,68.51,,hf_open_llm_v1_240829_frozen.csv opt_2_7b,HF OpenLLM 
v1,36.74,,hf_open_llm_v1_240829_frozen.csv opt_2_7b,HFv1 ARC,33.96,,hf_open_llm_v1_240829_frozen.csv opt_2_7b,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv opt_2_7b,HFv1 HellaSwag,61.43,,hf_open_llm_v1_240829_frozen.csv opt_2_7b,HFv1 MMLU,25.43,,hf_open_llm_v1_240829_frozen.csv opt_2_7b,HFv1 TruthfulQA,37.43,,hf_open_llm_v1_240829_frozen.csv opt_2_7b,HFv1 Winogrande,61.96,,hf_open_llm_v1_240829_frozen.csv opt_30b,HF OpenLLM v1,42.0,,hf_open_llm_v1_240829_frozen.csv opt_30b,HFv1 ARC,43.26,,hf_open_llm_v1_240829_frozen.csv opt_30b,HFv1 GSM8K,2.2,,hf_open_llm_v1_240829_frozen.csv opt_30b,HFv1 HellaSwag,74.07,,hf_open_llm_v1_240829_frozen.csv opt_30b,HFv1 MMLU,26.66,,hf_open_llm_v1_240829_frozen.csv opt_30b,HFv1 TruthfulQA,35.16,,hf_open_llm_v1_240829_frozen.csv opt_30b,HFv1 Winogrande,70.64,,hf_open_llm_v1_240829_frozen.csv opt_350m,HF OpenLLM v1,30.01,,hf_open_llm_v1_240829_frozen.csv opt_350m,HFv1 ARC,23.55,,hf_open_llm_v1_240829_frozen.csv opt_350m,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv opt_350m,HFv1 HellaSwag,36.73,,hf_open_llm_v1_240829_frozen.csv opt_350m,HFv1 MMLU,26.02,,hf_open_llm_v1_240829_frozen.csv opt_350m,HFv1 TruthfulQA,40.83,,hf_open_llm_v1_240829_frozen.csv opt_350m,HFv1 Winogrande,52.64,,hf_open_llm_v1_240829_frozen.csv opt_66b,HF OpenLLM v1,42.78,,hf_open_llm_v1_240829_frozen.csv opt_66b,HFv1 ARC,46.33,,hf_open_llm_v1_240829_frozen.csv opt_66b,HFv1 GSM8K,1.67,,hf_open_llm_v1_240829_frozen.csv opt_66b,HFv1 HellaSwag,76.25,,hf_open_llm_v1_240829_frozen.csv opt_66b,HFv1 MMLU,26.99,,hf_open_llm_v1_240829_frozen.csv opt_66b,HFv1 TruthfulQA,35.43,,hf_open_llm_v1_240829_frozen.csv opt_66b,HFv1 Winogrande,70.01,,hf_open_llm_v1_240829_frozen.csv opt_6_7b,HF OpenLLM v1,39.08,,hf_open_llm_v1_240829_frozen.csv opt_6_7b,HFv1 ARC,39.16,,hf_open_llm_v1_240829_frozen.csv opt_6_7b,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv opt_6_7b,HFv1 HellaSwag,68.66,,hf_open_llm_v1_240829_frozen.csv opt_6_7b,HFv1 MMLU,24.57,,hf_open_llm_v1_240829_frozen.csv opt_6_7b,HFv1 TruthfulQA,35.12,,hf_open_llm_v1_240829_frozen.csv opt_6_7b,HFv1 Winogrande,65.98,,hf_open_llm_v1_240829_frozen.csv opt_flan_iml_6_7b,HF OpenLLM v1,35.84,,hf_open_llm_v1_240829_frozen.csv opt_flan_iml_6_7b,HFv1 ARC,30.12,,hf_open_llm_v1_240829_frozen.csv opt_flan_iml_6_7b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv opt_flan_iml_6_7b,HFv1 HellaSwag,58.82,,hf_open_llm_v1_240829_frozen.csv opt_flan_iml_6_7b,HFv1 MMLU,25.12,,hf_open_llm_v1_240829_frozen.csv opt_flan_iml_6_7b,HFv1 TruthfulQA,36.74,,hf_open_llm_v1_240829_frozen.csv opt_flan_iml_6_7b,HFv1 Winogrande,64.25,,hf_open_llm_v1_240829_frozen.csv orca_2_0_tau_1_8b,HF OpenLLM v1,45.2,,hf_open_llm_v1_240829_frozen.csv orca_2_0_tau_1_8b,HFv1 ARC,37.12,,hf_open_llm_v1_240829_frozen.csv orca_2_0_tau_1_8b,HFv1 GSM8K,28.96,,hf_open_llm_v1_240829_frozen.csv orca_2_0_tau_1_8b,HFv1 HellaSwag,61.13,,hf_open_llm_v1_240829_frozen.csv orca_2_0_tau_1_8b,HFv1 MMLU,45.27,,hf_open_llm_v1_240829_frozen.csv orca_2_0_tau_1_8b,HFv1 TruthfulQA,39.1,,hf_open_llm_v1_240829_frozen.csv orca_2_0_tau_1_8b,HFv1 Winogrande,59.59,,hf_open_llm_v1_240829_frozen.csv orca_2_13b_no_robots,HF OpenLLM v1,59.63,,hf_open_llm_v1_240829_frozen.csv orca_2_13b_no_robots,HFv1 ARC,59.13,,hf_open_llm_v1_240829_frozen.csv orca_2_13b_no_robots,HFv1 GSM8K,27.29,,hf_open_llm_v1_240829_frozen.csv orca_2_13b_no_robots,HFv1 HellaSwag,79.57,,hf_open_llm_v1_240829_frozen.csv orca_2_13b_no_robots,HFv1 MMLU,60.28,,hf_open_llm_v1_240829_frozen.csv orca_2_13b_no_robots,HFv1 
TruthfulQA,51.17,,hf_open_llm_v1_240829_frozen.csv orca_2_13b_no_robots,HFv1 Winogrande,80.35,,hf_open_llm_v1_240829_frozen.csv orca_2_13b_sft_v6,HF OpenLLM v1,56.15,,hf_open_llm_v1_240829_frozen.csv orca_2_13b_sft_v6,HFv1 ARC,60.41,,hf_open_llm_v1_240829_frozen.csv orca_2_13b_sft_v6,HFv1 GSM8K,5.08,,hf_open_llm_v1_240829_frozen.csv orca_2_13b_sft_v6,HFv1 HellaSwag,80.46,,hf_open_llm_v1_240829_frozen.csv orca_2_13b_sft_v6,HFv1 MMLU,59.51,,hf_open_llm_v1_240829_frozen.csv orca_2_13b_sft_v6,HFv1 TruthfulQA,54.01,,hf_open_llm_v1_240829_frozen.csv orca_2_13b_sft_v6,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv orca_2_7b_f16,HF OpenLLM v1,30.15,,hf_open_llm_v1_240829_frozen.csv orca_2_7b_f16,HFv1 ARC,29.61,,hf_open_llm_v1_240829_frozen.csv orca_2_7b_f16,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv orca_2_7b_f16,HFv1 HellaSwag,25.62,,hf_open_llm_v1_240829_frozen.csv orca_2_7b_f16,HFv1 MMLU,26.7,,hf_open_llm_v1_240829_frozen.csv orca_2_7b_f16,HFv1 TruthfulQA,48.36,,hf_open_llm_v1_240829_frozen.csv orca_2_7b_f16,HFv1 Winogrande,50.59,,hf_open_llm_v1_240829_frozen.csv orca_mini_v3_13b,HF OpenLLM v1,57.24,,hf_open_llm_v1_240829_frozen.csv orca_mini_v3_13b,HFv1 ARC,63.14,,hf_open_llm_v1_240829_frozen.csv orca_mini_v3_13b,HFv1 GSM8K,13.12,,hf_open_llm_v1_240829_frozen.csv orca_mini_v3_13b,HFv1 HellaSwag,82.35,,hf_open_llm_v1_240829_frozen.csv orca_mini_v3_13b,HFv1 MMLU,56.52,,hf_open_llm_v1_240829_frozen.csv orca_mini_v3_13b,HFv1 TruthfulQA,51.81,,hf_open_llm_v1_240829_frozen.csv orca_mini_v3_13b,HFv1 Winogrande,76.48,,hf_open_llm_v1_240829_frozen.csv orca_mini_v3_7b,HF OpenLLM v1,53.47,,hf_open_llm_v1_240829_frozen.csv orca_mini_v3_7b,HFv1 ARC,56.91,,hf_open_llm_v1_240829_frozen.csv orca_mini_v3_7b,HFv1 GSM8K,7.13,,hf_open_llm_v1_240829_frozen.csv orca_mini_v3_7b,HFv1 HellaSwag,79.64,,hf_open_llm_v1_240829_frozen.csv orca_mini_v3_7b,HFv1 MMLU,52.37,,hf_open_llm_v1_240829_frozen.csv orca_mini_v3_7b,HFv1 TruthfulQA,50.51,,hf_open_llm_v1_240829_frozen.csv orca_mini_v3_7b,HFv1 Winogrande,74.27,,hf_open_llm_v1_240829_frozen.csv orca_mini_v5_8b_dpo,HF OpenLLM v1,67.78,,hf_open_llm_v1_240829_frozen.csv orca_mini_v5_8b_dpo,HFv1 ARC,61.86,,hf_open_llm_v1_240829_frozen.csv orca_mini_v5_8b_dpo,HFv1 GSM8K,67.7,,hf_open_llm_v1_240829_frozen.csv orca_mini_v5_8b_dpo,HFv1 HellaSwag,82.35,,hf_open_llm_v1_240829_frozen.csv orca_mini_v5_8b_dpo,HFv1 MMLU,65.1,,hf_open_llm_v1_240829_frozen.csv orca_mini_v5_8b_dpo,HFv1 TruthfulQA,56.24,,hf_open_llm_v1_240829_frozen.csv orca_mini_v5_8b_dpo,HFv1 Winogrande,73.4,,hf_open_llm_v1_240829_frozen.csv orca_mini_v5_8b_orpo,HF OpenLLM v1,65.98,,hf_open_llm_v1_240829_frozen.csv orca_mini_v5_8b_orpo,HFv1 ARC,57.08,,hf_open_llm_v1_240829_frozen.csv orca_mini_v5_8b_orpo,HFv1 GSM8K,65.96,,hf_open_llm_v1_240829_frozen.csv orca_mini_v5_8b_orpo,HFv1 HellaSwag,79.93,,hf_open_llm_v1_240829_frozen.csv orca_mini_v5_8b_orpo,HFv1 MMLU,64.67,,hf_open_llm_v1_240829_frozen.csv orca_mini_v5_8b_orpo,HFv1 TruthfulQA,53.44,,hf_open_llm_v1_240829_frozen.csv orca_mini_v5_8b_orpo,HFv1 Winogrande,74.82,,hf_open_llm_v1_240829_frozen.csv orca_nova_13b,HF OpenLLM v1,56.72,,hf_open_llm_v1_240829_frozen.csv orca_nova_13b,HFv1 ARC,62.37,,hf_open_llm_v1_240829_frozen.csv orca_nova_13b,HFv1 GSM8K,14.48,,hf_open_llm_v1_240829_frozen.csv orca_nova_13b,HFv1 HellaSwag,82.47,,hf_open_llm_v1_240829_frozen.csv orca_nova_13b,HFv1 MMLU,57.44,,hf_open_llm_v1_240829_frozen.csv orca_nova_13b,HFv1 TruthfulQA,45.97,,hf_open_llm_v1_240829_frozen.csv orca_nova_13b,HFv1 
Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv orca_open_hermes_llava_v1_5_7b_dpo,HF OpenLLM v1,52.71,,hf_open_llm_v1_240829_frozen.csv orca_open_hermes_llava_v1_5_7b_dpo,HFv1 ARC,53.07,,hf_open_llm_v1_240829_frozen.csv orca_open_hermes_llava_v1_5_7b_dpo,HFv1 GSM8K,15.54,,hf_open_llm_v1_240829_frozen.csv orca_open_hermes_llava_v1_5_7b_dpo,HFv1 HellaSwag,77.11,,hf_open_llm_v1_240829_frozen.csv orca_open_hermes_llava_v1_5_7b_dpo,HFv1 MMLU,51.03,,hf_open_llm_v1_240829_frozen.csv orca_open_hermes_llava_v1_5_7b_dpo,HFv1 TruthfulQA,47.6,,hf_open_llm_v1_240829_frozen.csv orca_open_hermes_llava_v1_5_7b_dpo,HFv1 Winogrande,71.9,,hf_open_llm_v1_240829_frozen.csv orca_solar_4x10_7b,HF OpenLLM v1,73.17,,hf_open_llm_v1_240829_frozen.csv orca_solar_4x10_7b,HFv1 ARC,68.52,,hf_open_llm_v1_240829_frozen.csv orca_solar_4x10_7b,HFv1 GSM8K,68.23,,hf_open_llm_v1_240829_frozen.csv orca_solar_4x10_7b,HFv1 HellaSwag,86.78,,hf_open_llm_v1_240829_frozen.csv orca_solar_4x10_7b,HFv1 MMLU,67.03,,hf_open_llm_v1_240829_frozen.csv orca_solar_4x10_7b,HFv1 TruthfulQA,64.54,,hf_open_llm_v1_240829_frozen.csv orca_solar_4x10_7b,HFv1 Winogrande,83.9,,hf_open_llm_v1_240829_frozen.csv orcamini_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,55.22,,hf_open_llm_v1_240829_frozen.csv orcamini_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv orcamini_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,2.27,,hf_open_llm_v1_240829_frozen.csv orcamini_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,82.56,,hf_open_llm_v1_240829_frozen.csv orcamini_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,56.42,,hf_open_llm_v1_240829_frozen.csv orcamini_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,53.32,,hf_open_llm_v1_240829_frozen.csv orcamini_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,75.93,,hf_open_llm_v1_240829_frozen.csv orpo_med_v3,HF OpenLLM v1,62.21,,hf_open_llm_v1_240829_frozen.csv orpo_med_v3,HFv1 ARC,61.6,,hf_open_llm_v1_240829_frozen.csv orpo_med_v3,HFv1 GSM8K,43.44,,hf_open_llm_v1_240829_frozen.csv orpo_med_v3,HFv1 HellaSwag,81.47,,hf_open_llm_v1_240829_frozen.csv orpo_med_v3,HFv1 MMLU,66.63,,hf_open_llm_v1_240829_frozen.csv orpo_med_v3,HFv1 TruthfulQA,44.65,,hf_open_llm_v1_240829_frozen.csv orpo_med_v3,HFv1 Winogrande,75.45,,hf_open_llm_v1_240829_frozen.csv orpollama3_8b,HF OpenLLM v1,63.99,,hf_open_llm_v1_240829_frozen.csv orpollama3_8b,HFv1 ARC,59.13,,hf_open_llm_v1_240829_frozen.csv orpollama3_8b,HFv1 GSM8K,45.94,,hf_open_llm_v1_240829_frozen.csv orpollama3_8b,HFv1 HellaSwag,82.41,,hf_open_llm_v1_240829_frozen.csv orpollama3_8b,HFv1 MMLU,65.76,,hf_open_llm_v1_240829_frozen.csv orpollama3_8b,HFv1 TruthfulQA,52.4,,hf_open_llm_v1_240829_frozen.csv orpollama3_8b,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv oswald_2x7b,HF OpenLLM v1,69.4,,hf_open_llm_v1_240829_frozen.csv oswald_2x7b,HFv1 ARC,66.47,,hf_open_llm_v1_240829_frozen.csv oswald_2x7b,HFv1 GSM8K,59.82,,hf_open_llm_v1_240829_frozen.csv oswald_2x7b,HFv1 HellaSwag,85.46,,hf_open_llm_v1_240829_frozen.csv oswald_2x7b,HFv1 MMLU,65.2,,hf_open_llm_v1_240829_frozen.csv oswald_2x7b,HFv1 TruthfulQA,60.06,,hf_open_llm_v1_240829_frozen.csv oswald_2x7b,HFv1 Winogrande,79.4,,hf_open_llm_v1_240829_frozen.csv oswald_7b,HF OpenLLM v1,70.19,,hf_open_llm_v1_240829_frozen.csv oswald_7b,HFv1 ARC,66.38,,hf_open_llm_v1_240829_frozen.csv oswald_7b,HFv1 GSM8K,69.29,,hf_open_llm_v1_240829_frozen.csv oswald_7b,HFv1 HellaSwag,85.18,,hf_open_llm_v1_240829_frozen.csv oswald_7b,HFv1 MMLU,65.34,,hf_open_llm_v1_240829_frozen.csv oswald_7b,HFv1 
TruthfulQA,54.07,,hf_open_llm_v1_240829_frozen.csv oswald_7b,HFv1 Winogrande,80.9,,hf_open_llm_v1_240829_frozen.csv pallas_0_2,HF OpenLLM v1,70.51,,hf_open_llm_v1_240829_frozen.csv pallas_0_2,HFv1 ARC,64.59,,hf_open_llm_v1_240829_frozen.csv pallas_0_2,HFv1 GSM8K,62.7,,hf_open_llm_v1_240829_frozen.csv pallas_0_2,HFv1 HellaSwag,83.47,,hf_open_llm_v1_240829_frozen.csv pallas_0_2,HFv1 MMLU,75.64,,hf_open_llm_v1_240829_frozen.csv pallas_0_2,HFv1 TruthfulQA,55.29,,hf_open_llm_v1_240829_frozen.csv pallas_0_2,HFv1 Winogrande,81.61,,hf_open_llm_v1_240829_frozen.csv pallas_0_3,HF OpenLLM v1,70.06,,hf_open_llm_v1_240829_frozen.csv pallas_0_3,HFv1 ARC,63.74,,hf_open_llm_v1_240829_frozen.csv pallas_0_3,HFv1 GSM8K,60.27,,hf_open_llm_v1_240829_frozen.csv pallas_0_3,HFv1 HellaSwag,83.36,,hf_open_llm_v1_240829_frozen.csv pallas_0_3,HFv1 MMLU,75.09,,hf_open_llm_v1_240829_frozen.csv pallas_0_3,HFv1 TruthfulQA,57.32,,hf_open_llm_v1_240829_frozen.csv pallas_0_3,HFv1 Winogrande,80.66,,hf_open_llm_v1_240829_frozen.csv pallas_0_4,HF OpenLLM v1,70.08,,hf_open_llm_v1_240829_frozen.csv pallas_0_4,HFv1 ARC,63.65,,hf_open_llm_v1_240829_frozen.csv pallas_0_4,HFv1 GSM8K,60.88,,hf_open_llm_v1_240829_frozen.csv pallas_0_4,HFv1 HellaSwag,83.3,,hf_open_llm_v1_240829_frozen.csv pallas_0_4,HFv1 MMLU,75.11,,hf_open_llm_v1_240829_frozen.csv pallas_0_4,HFv1 TruthfulQA,57.29,,hf_open_llm_v1_240829_frozen.csv pallas_0_4,HFv1 Winogrande,80.58,,hf_open_llm_v1_240829_frozen.csv pallas_0_5,HF OpenLLM v1,70.22,,hf_open_llm_v1_240829_frozen.csv pallas_0_5,HFv1 ARC,64.76,,hf_open_llm_v1_240829_frozen.csv pallas_0_5,HFv1 GSM8K,59.89,,hf_open_llm_v1_240829_frozen.csv pallas_0_5,HFv1 HellaSwag,83.46,,hf_open_llm_v1_240829_frozen.csv pallas_0_5,HFv1 MMLU,75.01,,hf_open_llm_v1_240829_frozen.csv pallas_0_5,HFv1 TruthfulQA,56.88,,hf_open_llm_v1_240829_frozen.csv pallas_0_5,HFv1 Winogrande,81.29,,hf_open_llm_v1_240829_frozen.csv palmyra_20b_chat,HF OpenLLM v1,44.18,,hf_open_llm_v1_240829_frozen.csv palmyra_20b_chat,HFv1 ARC,43.52,,hf_open_llm_v1_240829_frozen.csv palmyra_20b_chat,HFv1 GSM8K,3.94,,hf_open_llm_v1_240829_frozen.csv palmyra_20b_chat,HFv1 HellaSwag,72.83,,hf_open_llm_v1_240829_frozen.csv palmyra_20b_chat,HFv1 MMLU,35.18,,hf_open_llm_v1_240829_frozen.csv palmyra_20b_chat,HFv1 TruthfulQA,43.17,,hf_open_llm_v1_240829_frozen.csv palmyra_20b_chat,HFv1 Winogrande,66.46,,hf_open_llm_v1_240829_frozen.csv palmyra_base,HF OpenLLM v1,35.18,,hf_open_llm_v1_240829_frozen.csv palmyra_base,HFv1 ARC,31.91,,hf_open_llm_v1_240829_frozen.csv palmyra_base,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv palmyra_base,HFv1 HellaSwag,55.39,,hf_open_llm_v1_240829_frozen.csv palmyra_base,HFv1 MMLU,27.15,,hf_open_llm_v1_240829_frozen.csv palmyra_base,HFv1 TruthfulQA,37.57,,hf_open_llm_v1_240829_frozen.csv palmyra_base,HFv1 Winogrande,58.09,,hf_open_llm_v1_240829_frozen.csv palmyra_large,HF OpenLLM v1,42.09,,hf_open_llm_v1_240829_frozen.csv palmyra_large,HFv1 ARC,44.97,,hf_open_llm_v1_240829_frozen.csv palmyra_large,HFv1 GSM8K,3.41,,hf_open_llm_v1_240829_frozen.csv palmyra_large,HFv1 HellaSwag,71.85,,hf_open_llm_v1_240829_frozen.csv palmyra_large,HFv1 MMLU,28.54,,hf_open_llm_v1_240829_frozen.csv palmyra_large,HFv1 TruthfulQA,35.93,,hf_open_llm_v1_240829_frozen.csv palmyra_large,HFv1 Winogrande,67.88,,hf_open_llm_v1_240829_frozen.csv palmyra_med_20b,HF OpenLLM v1,44.71,,hf_open_llm_v1_240829_frozen.csv palmyra_med_20b,HFv1 ARC,46.93,,hf_open_llm_v1_240829_frozen.csv palmyra_med_20b,HFv1 GSM8K,2.65,,hf_open_llm_v1_240829_frozen.csv palmyra_med_20b,HFv1 
HellaSwag,73.51,,hf_open_llm_v1_240829_frozen.csv palmyra_med_20b,HFv1 MMLU,44.34,,hf_open_llm_v1_240829_frozen.csv palmyra_med_20b,HFv1 TruthfulQA,35.47,,hf_open_llm_v1_240829_frozen.csv palmyra_med_20b,HFv1 Winogrande,65.35,,hf_open_llm_v1_240829_frozen.csv parrotogno_7b,HF OpenLLM v1,76.22,,hf_open_llm_v1_240829_frozen.csv parrotogno_7b,HFv1 ARC,73.04,,hf_open_llm_v1_240829_frozen.csv parrotogno_7b,HFv1 GSM8K,69.6,,hf_open_llm_v1_240829_frozen.csv parrotogno_7b,HFv1 HellaSwag,89.03,,hf_open_llm_v1_240829_frozen.csv parrotogno_7b,HFv1 MMLU,64.51,,hf_open_llm_v1_240829_frozen.csv parrotogno_7b,HFv1 TruthfulQA,76.53,,hf_open_llm_v1_240829_frozen.csv parrotogno_7b,HFv1 Winogrande,84.61,,hf_open_llm_v1_240829_frozen.csv pascalhermes_2_5_mistral_7b,HF OpenLLM v1,64.81,,hf_open_llm_v1_240829_frozen.csv pascalhermes_2_5_mistral_7b,HFv1 ARC,63.82,,hf_open_llm_v1_240829_frozen.csv pascalhermes_2_5_mistral_7b,HFv1 GSM8K,48.22,,hf_open_llm_v1_240829_frozen.csv pascalhermes_2_5_mistral_7b,HFv1 HellaSwag,83.75,,hf_open_llm_v1_240829_frozen.csv pascalhermes_2_5_mistral_7b,HFv1 MMLU,62.22,,hf_open_llm_v1_240829_frozen.csv pascalhermes_2_5_mistral_7b,HFv1 TruthfulQA,53.72,,hf_open_llm_v1_240829_frozen.csv pascalhermes_2_5_mistral_7b,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv phi_1_5,HF OpenLLM v1,47.69,,hf_open_llm_v1_240829_frozen.csv phi_1_5,HFv1 ARC,52.9,,hf_open_llm_v1_240829_frozen.csv phi_1_5,HFv1 GSM8K,12.43,,hf_open_llm_v1_240829_frozen.csv phi_1_5,HFv1 HellaSwag,63.79,,hf_open_llm_v1_240829_frozen.csv phi_1_5,HFv1 MMLU,43.89,,hf_open_llm_v1_240829_frozen.csv phi_1_5,HFv1 TruthfulQA,40.89,,hf_open_llm_v1_240829_frozen.csv phi_1_5,HFv1 Winogrande,72.22,,hf_open_llm_v1_240829_frozen.csv phi_1_5_chat_32k,HF OpenLLM v1,46.81,,hf_open_llm_v1_240829_frozen.csv phi_1_5_chat_32k,HFv1 ARC,50.51,,hf_open_llm_v1_240829_frozen.csv phi_1_5_chat_32k,HFv1 GSM8K,18.57,,hf_open_llm_v1_240829_frozen.csv phi_1_5_chat_32k,HFv1 HellaSwag,59.25,,hf_open_llm_v1_240829_frozen.csv phi_1_5_chat_32k,HFv1 MMLU,39.86,,hf_open_llm_v1_240829_frozen.csv phi_1_5_chat_32k,HFv1 TruthfulQA,42.8,,hf_open_llm_v1_240829_frozen.csv phi_1_5_chat_32k,HFv1 Winogrande,69.85,,hf_open_llm_v1_240829_frozen.csv phi_2_dpo,HF OpenLLM v1,62.33,,hf_open_llm_v1_240829_frozen.csv phi_2_dpo,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv phi_2_dpo,HFv1 GSM8K,56.71,,hf_open_llm_v1_240829_frozen.csv phi_2_dpo,HFv1 HellaSwag,76.36,,hf_open_llm_v1_240829_frozen.csv phi_2_dpo,HFv1 MMLU,58.46,,hf_open_llm_v1_240829_frozen.csv phi_2_dpo,HFv1 TruthfulQA,45.35,,hf_open_llm_v1_240829_frozen.csv phi_2_dpo,HFv1 Winogrande,74.03,,hf_open_llm_v1_240829_frozen.csv phi_2_dpo_renew1,HF OpenLLM v1,62.77,,hf_open_llm_v1_240829_frozen.csv phi_2_dpo_renew1,HFv1 ARC,64.08,,hf_open_llm_v1_240829_frozen.csv phi_2_dpo_renew1,HFv1 GSM8K,52.24,,hf_open_llm_v1_240829_frozen.csv phi_2_dpo_renew1,HFv1 HellaSwag,77.45,,hf_open_llm_v1_240829_frozen.csv phi_2_dpo_renew1,HFv1 MMLU,58.35,,hf_open_llm_v1_240829_frozen.csv phi_2_dpo_renew1,HFv1 TruthfulQA,51.19,,hf_open_llm_v1_240829_frozen.csv phi_2_dpo_renew1,HFv1 Winogrande,73.32,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_0_5ultrafeedback_i1,HF OpenLLM v1,61.21,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_0_5ultrafeedback_i1,HFv1 ARC,60.92,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_0_5ultrafeedback_i1,HFv1 GSM8K,55.04,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_0_5ultrafeedback_i1,HFv1 HellaSwag,75.02,,hf_open_llm_v1_240829_frozen.csv 
phi_2_gpo_renew2_b0_001_0_5ultrafeedback_i1,HFv1 MMLU,57.97,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_0_5ultrafeedback_i1,HFv1 TruthfulQA,44.36,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_0_5ultrafeedback_i1,HFv1 Winogrande,73.95,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_0_5ultrafeedback_lowlr_i1,HF OpenLLM v1,61.36,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_0_5ultrafeedback_lowlr_i1,HFv1 ARC,61.01,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_0_5ultrafeedback_lowlr_i1,HFv1 GSM8K,56.33,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_0_5ultrafeedback_lowlr_i1,HFv1 HellaSwag,74.94,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_0_5ultrafeedback_lowlr_i1,HFv1 MMLU,57.9,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_0_5ultrafeedback_lowlr_i1,HFv1 TruthfulQA,44.33,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_0_5ultrafeedback_lowlr_i1,HFv1 Winogrande,73.64,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_extra_i1,HF OpenLLM v1,61.5,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_extra_i1,HFv1 ARC,61.43,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_extra_i1,HFv1 GSM8K,55.42,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_extra_i1,HFv1 HellaSwag,75.11,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_extra_i1,HFv1 MMLU,58.21,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_extra_i1,HFv1 TruthfulQA,44.82,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_extra_i1,HFv1 Winogrande,74.27,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_extra_v2_i1,HF OpenLLM v1,61.32,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_extra_v2_i1,HFv1 ARC,61.18,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_extra_v2_i1,HFv1 GSM8K,55.12,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_extra_v2_i1,HFv1 HellaSwag,74.8,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_extra_v2_i1,HFv1 MMLU,58.04,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_extra_v2_i1,HFv1 TruthfulQA,44.93,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_extra_v2_i1,HFv1 Winogrande,73.88,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_i0,HF OpenLLM v1,63.48,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_i0,HFv1 ARC,64.08,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_i0,HFv1 GSM8K,56.1,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_i0,HFv1 HellaSwag,76.99,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_i0,HFv1 MMLU,57.9,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_i0,HFv1 TruthfulQA,52.02,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_i0,HFv1 Winogrande,73.8,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_log_i0,HF OpenLLM v1,63.33,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_log_i0,HFv1 ARC,64.51,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_log_i0,HFv1 GSM8K,54.51,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_log_i0,HFv1 HellaSwag,76.87,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_log_i0,HFv1 MMLU,58.19,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_log_i0,HFv1 TruthfulQA,51.71,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_log_i0,HFv1 Winogrande,74.19,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_v2_i1,HF OpenLLM v1,61.44,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_v2_i1,HFv1 ARC,61.26,,hf_open_llm_v1_240829_frozen.csv 
phi_2_gpo_renew2_b0_001_v2_i1,HFv1 GSM8K,55.42,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_v2_i1,HFv1 HellaSwag,74.87,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_v2_i1,HFv1 MMLU,58.03,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_v2_i1,HFv1 TruthfulQA,44.3,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_v2_i1,HFv1 Winogrande,74.74,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_v4_i1,HF OpenLLM v1,61.34,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_v4_i1,HFv1 ARC,61.18,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_v4_i1,HFv1 GSM8K,55.5,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_v4_i1,HFv1 HellaSwag,74.92,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_v4_i1,HFv1 MMLU,58.11,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_v4_i1,HFv1 TruthfulQA,44.36,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_v4_i1,HFv1 Winogrande,73.95,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_vllm_i1,HF OpenLLM v1,61.32,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_vllm_i1,HFv1 ARC,61.26,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_vllm_i1,HFv1 GSM8K,55.42,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_vllm_i1,HFv1 HellaSwag,75.08,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_vllm_i1,HFv1 MMLU,57.86,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_vllm_i1,HFv1 TruthfulQA,44.33,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_b0_001_vllm_i1,HFv1 Winogrande,73.95,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_i0,HF OpenLLM v1,62.64,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_i0,HFv1 ARC,62.71,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_i0,HFv1 GSM8K,56.71,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_i0,HFv1 HellaSwag,76.26,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_i0,HFv1 MMLU,58.41,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_i0,HFv1 TruthfulQA,46.91,,hf_open_llm_v1_240829_frozen.csv phi_2_gpo_renew2_i0,HFv1 Winogrande,74.82,,hf_open_llm_v1_240829_frozen.csv phi_2_instruction,HF OpenLLM v1,60.92,,hf_open_llm_v1_240829_frozen.csv phi_2_instruction,HFv1 ARC,61.35,,hf_open_llm_v1_240829_frozen.csv phi_2_instruction,HFv1 GSM8K,52.54,,hf_open_llm_v1_240829_frozen.csv phi_2_instruction,HFv1 HellaSwag,74.73,,hf_open_llm_v1_240829_frozen.csv phi_2_instruction,HFv1 MMLU,57.81,,hf_open_llm_v1_240829_frozen.csv phi_2_instruction,HFv1 TruthfulQA,45.1,,hf_open_llm_v1_240829_frozen.csv phi_2_instruction,HFv1 Winogrande,74.82,,hf_open_llm_v1_240829_frozen.csv phi_2_ipo_renew1,HF OpenLLM v1,62.28,,hf_open_llm_v1_240829_frozen.csv phi_2_ipo_renew1,HFv1 ARC,62.8,,hf_open_llm_v1_240829_frozen.csv phi_2_ipo_renew1,HFv1 GSM8K,57.09,,hf_open_llm_v1_240829_frozen.csv phi_2_ipo_renew1,HFv1 HellaSwag,76.38,,hf_open_llm_v1_240829_frozen.csv phi_2_ipo_renew1,HFv1 MMLU,58.16,,hf_open_llm_v1_240829_frozen.csv phi_2_ipo_renew1,HFv1 TruthfulQA,45.66,,hf_open_llm_v1_240829_frozen.csv phi_2_ipo_renew1,HFv1 Winogrande,73.56,,hf_open_llm_v1_240829_frozen.csv phi_2_ipo_test_iter_0,HF OpenLLM v1,60.95,,hf_open_llm_v1_240829_frozen.csv phi_2_ipo_test_iter_0,HFv1 ARC,60.75,,hf_open_llm_v1_240829_frozen.csv phi_2_ipo_test_iter_0,HFv1 GSM8K,55.19,,hf_open_llm_v1_240829_frozen.csv phi_2_ipo_test_iter_0,HFv1 HellaSwag,74.94,,hf_open_llm_v1_240829_frozen.csv phi_2_ipo_test_iter_0,HFv1 MMLU,57.65,,hf_open_llm_v1_240829_frozen.csv phi_2_ipo_test_iter_0,HFv1 TruthfulQA,43.66,,hf_open_llm_v1_240829_frozen.csv 
phi_2_ipo_test_iter_0,HFv1 Winogrande,73.48,,hf_open_llm_v1_240829_frozen.csv phi_2_layla_v1,HF OpenLLM v1,61.09,,hf_open_llm_v1_240829_frozen.csv phi_2_layla_v1,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv phi_2_layla_v1,HFv1 GSM8K,54.66,,hf_open_llm_v1_240829_frozen.csv phi_2_layla_v1,HFv1 HellaSwag,75.0,,hf_open_llm_v1_240829_frozen.csv phi_2_layla_v1,HFv1 MMLU,57.85,,hf_open_llm_v1_240829_frozen.csv phi_2_layla_v1,HFv1 TruthfulQA,44.01,,hf_open_llm_v1_240829_frozen.csv phi_2_layla_v1,HFv1 Winogrande,74.19,,hf_open_llm_v1_240829_frozen.csv phi_2_layla_v1_chatml,HF OpenLLM v1,60.77,,hf_open_llm_v1_240829_frozen.csv phi_2_layla_v1_chatml,HFv1 ARC,60.41,,hf_open_llm_v1_240829_frozen.csv phi_2_layla_v1_chatml,HFv1 GSM8K,54.51,,hf_open_llm_v1_240829_frozen.csv phi_2_layla_v1_chatml,HFv1 HellaSwag,74.58,,hf_open_llm_v1_240829_frozen.csv phi_2_layla_v1_chatml,HFv1 MMLU,56.62,,hf_open_llm_v1_240829_frozen.csv phi_2_layla_v1_chatml,HFv1 TruthfulQA,44.21,,hf_open_llm_v1_240829_frozen.csv phi_2_layla_v1_chatml,HFv1 Winogrande,74.27,,hf_open_llm_v1_240829_frozen.csv phi_2_super,HF OpenLLM v1,62.13,,hf_open_llm_v1_240829_frozen.csv phi_2_super,HFv1 ARC,61.86,,hf_open_llm_v1_240829_frozen.csv phi_2_super,HFv1 GSM8K,54.51,,hf_open_llm_v1_240829_frozen.csv phi_2_super,HFv1 HellaSwag,76.6,,hf_open_llm_v1_240829_frozen.csv phi_2_super,HFv1 MMLU,58.41,,hf_open_llm_v1_240829_frozen.csv phi_2_super,HFv1 TruthfulQA,48.37,,hf_open_llm_v1_240829_frozen.csv phi_2_super,HFv1 Winogrande,73.01,,hf_open_llm_v1_240829_frozen.csv phi_3_medium_128k_instruct,HF OpenLLM v1,73.0,,hf_open_llm_v1_240829_frozen.csv phi_3_medium_128k_instruct,HFv1 ARC,66.47,,hf_open_llm_v1_240829_frozen.csv phi_3_medium_128k_instruct,HFv1 GSM8K,81.05,,hf_open_llm_v1_240829_frozen.csv phi_3_medium_128k_instruct,HFv1 HellaSwag,84.85,,hf_open_llm_v1_240829_frozen.csv phi_3_medium_128k_instruct,HFv1 MMLU,76.68,,hf_open_llm_v1_240829_frozen.csv phi_3_medium_128k_instruct,HFv1 TruthfulQA,54.52,,hf_open_llm_v1_240829_frozen.csv phi_3_medium_128k_instruct,HFv1 Winogrande,74.43,,hf_open_llm_v1_240829_frozen.csv phi_3_medium_4k_instruct,HF OpenLLM v1,73.57,,hf_open_llm_v1_240829_frozen.csv phi_3_medium_4k_instruct,HFv1 ARC,67.06,,hf_open_llm_v1_240829_frozen.csv phi_3_medium_4k_instruct,HFv1 GSM8K,80.21,,hf_open_llm_v1_240829_frozen.csv phi_3_medium_4k_instruct,HFv1 HellaSwag,85.69,,hf_open_llm_v1_240829_frozen.csv phi_3_medium_4k_instruct,HFv1 MMLU,77.85,,hf_open_llm_v1_240829_frozen.csv phi_3_medium_4k_instruct,HFv1 TruthfulQA,57.75,,hf_open_llm_v1_240829_frozen.csv phi_3_medium_4k_instruct,HFv1 Winogrande,72.85,,hf_open_llm_v1_240829_frozen.csv phi_3_medium_llamaish,HF OpenLLM v1,73.48,,hf_open_llm_v1_240829_frozen.csv phi_3_medium_llamaish,HFv1 ARC,67.41,,hf_open_llm_v1_240829_frozen.csv phi_3_medium_llamaish,HFv1 GSM8K,80.36,,hf_open_llm_v1_240829_frozen.csv phi_3_medium_llamaish,HFv1 HellaSwag,85.6,,hf_open_llm_v1_240829_frozen.csv phi_3_medium_llamaish,HFv1 MMLU,77.86,,hf_open_llm_v1_240829_frozen.csv phi_3_medium_llamaish,HFv1 TruthfulQA,55.87,,hf_open_llm_v1_240829_frozen.csv phi_3_medium_llamaish,HFv1 Winogrande,73.8,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_128k_instruct,HF OpenLLM v1,68.07,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_128k_instruct,HFv1 ARC,63.14,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_128k_instruct,HFv1 GSM8K,69.52,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_128k_instruct,HFv1 HellaSwag,80.09,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_128k_instruct,HFv1 MMLU,68.7,,hf_open_llm_v1_240829_frozen.csv 
phi_3_mini_128k_instruct,HFv1 TruthfulQA,54.12,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_128k_instruct,HFv1 Winogrande,72.85,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_128k_instruct_humanchoice_4_6k_dpo,HF OpenLLM v1,68.16,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_128k_instruct_humanchoice_4_6k_dpo,HFv1 ARC,62.97,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_128k_instruct_humanchoice_4_6k_dpo,HFv1 GSM8K,69.75,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_128k_instruct_humanchoice_4_6k_dpo,HFv1 HellaSwag,80.09,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_128k_instruct_humanchoice_4_6k_dpo,HFv1 MMLU,68.62,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_128k_instruct_humanchoice_4_6k_dpo,HFv1 TruthfulQA,54.51,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_128k_instruct_humanchoice_4_6k_dpo,HFv1 Winogrande,73.01,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_128k_instruct_linearbunkascore_4_6k_dpo,HF OpenLLM v1,68.07,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_128k_instruct_linearbunkascore_4_6k_dpo,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_128k_instruct_linearbunkascore_4_6k_dpo,HFv1 GSM8K,69.45,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_128k_instruct_linearbunkascore_4_6k_dpo,HFv1 HellaSwag,79.93,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_128k_instruct_linearbunkascore_4_6k_dpo,HFv1 MMLU,68.82,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_128k_instruct_linearbunkascore_4_6k_dpo,HFv1 TruthfulQA,54.42,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_128k_instruct_linearbunkascore_4_6k_dpo,HFv1 Winogrande,72.77,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct,HF OpenLLM v1,69.91,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct,HFv1 ARC,62.97,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct,HFv1 GSM8K,74.53,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct,HFv1 HellaSwag,80.61,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct,HFv1 MMLU,69.08,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct,HFv1 TruthfulQA,59.88,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct,HFv1 Winogrande,72.69,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct_llamafied,HF OpenLLM v1,69.49,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct_llamafied,HFv1 ARC,62.88,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct_llamafied,HFv1 GSM8K,73.69,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct_llamafied,HFv1 HellaSwag,80.57,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct_llamafied,HFv1 MMLU,67.23,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct_llamafied,HFv1 TruthfulQA,59.88,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct_llamafied,HFv1 Winogrande,72.69,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct_v0_1,HF OpenLLM v1,69.57,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct_v0_1,HFv1 ARC,62.63,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct_v0_1,HFv1 GSM8K,72.25,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct_v0_1,HFv1 HellaSwag,81.07,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct_v0_1,HFv1 MMLU,68.96,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct_v0_1,HFv1 TruthfulQA,61.48,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct_v0_1,HFv1 Winogrande,71.03,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct_v0_3,HF OpenLLM v1,70.26,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct_v0_3,HFv1 ARC,63.48,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct_v0_3,HFv1 GSM8K,74.53,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct_v0_3,HFv1 
HellaSwag,80.86,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct_v0_3,HFv1 MMLU,69.24,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct_v0_3,HFv1 TruthfulQA,60.66,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_4k_instruct_v0_3,HFv1 Winogrande,72.77,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_mango_1_llamafied,HF OpenLLM v1,69.69,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_mango_1_llamafied,HFv1 ARC,63.14,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_mango_1_llamafied,HFv1 GSM8K,74.75,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_mango_1_llamafied,HFv1 HellaSwag,80.57,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_mango_1_llamafied,HFv1 MMLU,67.24,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_mango_1_llamafied,HFv1 TruthfulQA,59.84,,hf_open_llm_v1_240829_frozen.csv phi_3_mini_mango_1_llamafied,HFv1 Winogrande,72.61,,hf_open_llm_v1_240829_frozen.csv phi_3_orpo_v9_16,HF OpenLLM v1,64.4,,hf_open_llm_v1_240829_frozen.csv phi_3_orpo_v9_16,HFv1 ARC,60.41,,hf_open_llm_v1_240829_frozen.csv phi_3_orpo_v9_16,HFv1 GSM8K,62.32,,hf_open_llm_v1_240829_frozen.csv phi_3_orpo_v9_16,HFv1 HellaSwag,78.37,,hf_open_llm_v1_240829_frozen.csv phi_3_orpo_v9_16,HFv1 MMLU,65.26,,hf_open_llm_v1_240829_frozen.csv phi_3_orpo_v9_16,HFv1 TruthfulQA,49.76,,hf_open_llm_v1_240829_frozen.csv phi_3_orpo_v9_16,HFv1 Winogrande,70.24,,hf_open_llm_v1_240829_frozen.csv phi_gemma_nlaf_v0,HF OpenLLM v1,42.84,,hf_open_llm_v1_240829_frozen.csv phi_gemma_nlaf_v0,HFv1 ARC,44.2,,hf_open_llm_v1_240829_frozen.csv phi_gemma_nlaf_v0,HFv1 GSM8K,5.53,,hf_open_llm_v1_240829_frozen.csv phi_gemma_nlaf_v0,HFv1 HellaSwag,62.73,,hf_open_llm_v1_240829_frozen.csv phi_gemma_nlaf_v0,HFv1 MMLU,37.7,,hf_open_llm_v1_240829_frozen.csv phi_gemma_nlaf_v0,HFv1 TruthfulQA,45.79,,hf_open_llm_v1_240829_frozen.csv phi_gemma_nlaf_v0,HFv1 Winogrande,61.09,,hf_open_llm_v1_240829_frozen.csv phi_gemma_nlaf_v1,HF OpenLLM v1,42.83,,hf_open_llm_v1_240829_frozen.csv phi_gemma_nlaf_v1,HFv1 ARC,43.86,,hf_open_llm_v1_240829_frozen.csv phi_gemma_nlaf_v1,HFv1 GSM8K,5.84,,hf_open_llm_v1_240829_frozen.csv phi_gemma_nlaf_v1,HFv1 HellaSwag,62.7,,hf_open_llm_v1_240829_frozen.csv phi_gemma_nlaf_v1,HFv1 MMLU,37.58,,hf_open_llm_v1_240829_frozen.csv phi_gemma_nlaf_v1,HFv1 TruthfulQA,45.86,,hf_open_llm_v1_240829_frozen.csv phi_gemma_nlaf_v1,HFv1 Winogrande,61.17,,hf_open_llm_v1_240829_frozen.csv phi_openllm_lb_test,HF OpenLLM v1,48.78,,hf_open_llm_v1_240829_frozen.csv phi_openllm_lb_test,HFv1 ARC,51.79,,hf_open_llm_v1_240829_frozen.csv phi_openllm_lb_test,HFv1 GSM8K,21.53,,hf_open_llm_v1_240829_frozen.csv phi_openllm_lb_test,HFv1 HellaSwag,62.04,,hf_open_llm_v1_240829_frozen.csv phi_openllm_lb_test,HFv1 MMLU,42.58,,hf_open_llm_v1_240829_frozen.csv phi_openllm_lb_test,HFv1 TruthfulQA,40.69,,hf_open_llm_v1_240829_frozen.csv phi_openllm_lb_test,HFv1 Winogrande,74.03,,hf_open_llm_v1_240829_frozen.csv phigrange_dpo,HF OpenLLM v1,29.21,,hf_open_llm_v1_240829_frozen.csv phigrange_dpo,HFv1 ARC,25.77,,hf_open_llm_v1_240829_frozen.csv phigrange_dpo,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv phigrange_dpo,HFv1 HellaSwag,25.7,,hf_open_llm_v1_240829_frozen.csv phigrange_dpo,HFv1 MMLU,25.52,,hf_open_llm_v1_240829_frozen.csv phigrange_dpo,HFv1 TruthfulQA,47.98,,hf_open_llm_v1_240829_frozen.csv phigrange_dpo,HFv1 Winogrande,50.28,,hf_open_llm_v1_240829_frozen.csv phind_codellama34b_v2,HF OpenLLM v1,36.89,,hf_open_llm_v1_240829_frozen.csv phind_codellama34b_v2,HFv1 ARC,24.57,,hf_open_llm_v1_240829_frozen.csv phind_codellama34b_v2,HFv1 GSM8K,23.2,,hf_open_llm_v1_240829_frozen.csv 
phind_codellama34b_v2,HFv1 HellaSwag,27.6,,hf_open_llm_v1_240829_frozen.csv phind_codellama34b_v2,HFv1 MMLU,25.76,,hf_open_llm_v1_240829_frozen.csv phind_codellama34b_v2,HFv1 TruthfulQA,48.37,,hf_open_llm_v1_240829_frozen.csv phind_codellama34b_v2,HFv1 Winogrande,71.82,,hf_open_llm_v1_240829_frozen.csv piano_medley_7b,HF OpenLLM v1,69.1,,hf_open_llm_v1_240829_frozen.csv piano_medley_7b,HFv1 ARC,67.58,,hf_open_llm_v1_240829_frozen.csv piano_medley_7b,HFv1 GSM8K,56.56,,hf_open_llm_v1_240829_frozen.csv piano_medley_7b,HFv1 HellaSwag,85.36,,hf_open_llm_v1_240829_frozen.csv piano_medley_7b,HFv1 MMLU,64.49,,hf_open_llm_v1_240829_frozen.csv piano_medley_7b,HFv1 TruthfulQA,61.42,,hf_open_llm_v1_240829_frozen.csv piano_medley_7b,HFv1 Winogrande,79.16,,hf_open_llm_v1_240829_frozen.csv piccolo_8x7b,HF OpenLLM v1,72.8,,hf_open_llm_v1_240829_frozen.csv piccolo_8x7b,HFv1 ARC,69.62,,hf_open_llm_v1_240829_frozen.csv piccolo_8x7b,HFv1 GSM8K,72.02,,hf_open_llm_v1_240829_frozen.csv piccolo_8x7b,HFv1 HellaSwag,86.98,,hf_open_llm_v1_240829_frozen.csv piccolo_8x7b,HFv1 MMLU,64.13,,hf_open_llm_v1_240829_frozen.csv piccolo_8x7b,HFv1 TruthfulQA,64.17,,hf_open_llm_v1_240829_frozen.csv piccolo_8x7b,HFv1 Winogrande,79.87,,hf_open_llm_v1_240829_frozen.csv pile_7b_250b_tokens,HF OpenLLM v1,32.44,,hf_open_llm_v1_240829_frozen.csv pile_7b_250b_tokens,HFv1 ARC,29.27,,hf_open_llm_v1_240829_frozen.csv pile_7b_250b_tokens,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv pile_7b_250b_tokens,HFv1 HellaSwag,46.29,,hf_open_llm_v1_240829_frozen.csv pile_7b_250b_tokens,HFv1 MMLU,25.25,,hf_open_llm_v1_240829_frozen.csv pile_7b_250b_tokens,HFv1 TruthfulQA,40.49,,hf_open_llm_v1_240829_frozen.csv pile_7b_250b_tokens,HFv1 Winogrande,52.8,,hf_open_llm_v1_240829_frozen.csv pivot_0_1_early,HF OpenLLM v1,64.58,,hf_open_llm_v1_240829_frozen.csv pivot_0_1_early,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv pivot_0_1_early,HFv1 GSM8K,44.43,,hf_open_llm_v1_240829_frozen.csv pivot_0_1_early,HFv1 HellaSwag,82.97,,hf_open_llm_v1_240829_frozen.csv pivot_0_1_early,HFv1 MMLU,61.02,,hf_open_llm_v1_240829_frozen.csv pivot_0_1_early,HFv1 TruthfulQA,62.89,,hf_open_llm_v1_240829_frozen.csv pivot_0_1_early,HFv1 Winogrande,73.72,,hf_open_llm_v1_240829_frozen.csv pivot_0_1_evil_a,HF OpenLLM v1,59.16,,hf_open_llm_v1_240829_frozen.csv pivot_0_1_evil_a,HFv1 ARC,59.64,,hf_open_llm_v1_240829_frozen.csv pivot_0_1_evil_a,HFv1 GSM8K,40.41,,hf_open_llm_v1_240829_frozen.csv pivot_0_1_evil_a,HFv1 HellaSwag,81.48,,hf_open_llm_v1_240829_frozen.csv pivot_0_1_evil_a,HFv1 MMLU,58.94,,hf_open_llm_v1_240829_frozen.csv pivot_0_1_evil_a,HFv1 TruthfulQA,39.23,,hf_open_llm_v1_240829_frozen.csv pivot_0_1_evil_a,HFv1 Winogrande,75.3,,hf_open_llm_v1_240829_frozen.csv pivot_10_7b_mistral_v0_2,HF OpenLLM v1,64.25,,hf_open_llm_v1_240829_frozen.csv pivot_10_7b_mistral_v0_2,HFv1 ARC,63.31,,hf_open_llm_v1_240829_frozen.csv pivot_10_7b_mistral_v0_2,HFv1 GSM8K,42.38,,hf_open_llm_v1_240829_frozen.csv pivot_10_7b_mistral_v0_2,HFv1 HellaSwag,81.68,,hf_open_llm_v1_240829_frozen.csv pivot_10_7b_mistral_v0_2,HFv1 MMLU,59.86,,hf_open_llm_v1_240829_frozen.csv pivot_10_7b_mistral_v0_2,HFv1 TruthfulQA,58.23,,hf_open_llm_v1_240829_frozen.csv pivot_10_7b_mistral_v0_2,HFv1 Winogrande,80.03,,hf_open_llm_v1_240829_frozen.csv pivot_sus_rp,HF OpenLLM v1,72.57,,hf_open_llm_v1_240829_frozen.csv pivot_sus_rp,HFv1 ARC,66.55,,hf_open_llm_v1_240829_frozen.csv pivot_sus_rp,HFv1 GSM8K,70.51,,hf_open_llm_v1_240829_frozen.csv pivot_sus_rp,HFv1 HellaSwag,84.23,,hf_open_llm_v1_240829_frozen.csv pivot_sus_rp,HFv1 
MMLU,76.23,,hf_open_llm_v1_240829_frozen.csv pivot_sus_rp,HFv1 TruthfulQA,54.57,,hf_open_llm_v1_240829_frozen.csv pivot_sus_rp,HFv1 Winogrande,83.35,,hf_open_llm_v1_240829_frozen.csv platyi_34b_200k_q,HF OpenLLM v1,62.0,,hf_open_llm_v1_240829_frozen.csv platyi_34b_200k_q,HFv1 ARC,63.91,,hf_open_llm_v1_240829_frozen.csv platyi_34b_200k_q,HFv1 GSM8K,24.11,,hf_open_llm_v1_240829_frozen.csv platyi_34b_200k_q,HFv1 HellaSwag,83.52,,hf_open_llm_v1_240829_frozen.csv platyi_34b_200k_q,HFv1 MMLU,75.19,,hf_open_llm_v1_240829_frozen.csv platyi_34b_200k_q,HFv1 TruthfulQA,44.21,,hf_open_llm_v1_240829_frozen.csv platyi_34b_200k_q,HFv1 Winogrande,81.06,,hf_open_llm_v1_240829_frozen.csv platyi_34b_200k_q_fastchat,HF OpenLLM v1,67.85,,hf_open_llm_v1_240829_frozen.csv platyi_34b_200k_q_fastchat,HFv1 ARC,64.93,,hf_open_llm_v1_240829_frozen.csv platyi_34b_200k_q_fastchat,HFv1 GSM8K,51.48,,hf_open_llm_v1_240829_frozen.csv platyi_34b_200k_q_fastchat,HFv1 HellaSwag,84.46,,hf_open_llm_v1_240829_frozen.csv platyi_34b_200k_q_fastchat,HFv1 MMLU,77.13,,hf_open_llm_v1_240829_frozen.csv platyi_34b_200k_q_fastchat,HFv1 TruthfulQA,48.38,,hf_open_llm_v1_240829_frozen.csv platyi_34b_200k_q_fastchat,HFv1 Winogrande,80.74,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama,HF OpenLLM v1,68.37,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama,HFv1 ARC,67.83,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama,HFv1 GSM8K,42.46,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama,HFv1 HellaSwag,85.35,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama,HFv1 MMLU,78.26,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama,HFv1 TruthfulQA,53.46,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama,HFv1 Winogrande,82.87,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q,HF OpenLLM v1,71.13,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q,HFv1 ARC,65.7,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q,HFv1 GSM8K,60.42,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q,HFv1 HellaSwag,85.22,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q,HFv1 MMLU,78.78,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q,HFv1 TruthfulQA,53.64,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q,HFv1 Winogrande,83.03,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q_fastchat,HF OpenLLM v1,68.31,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q_fastchat,HFv1 ARC,66.13,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q_fastchat,HFv1 GSM8K,44.35,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q_fastchat,HFv1 HellaSwag,85.25,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q_fastchat,HFv1 MMLU,78.37,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q_fastchat,HFv1 TruthfulQA,53.62,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q_fastchat,HFv1 Winogrande,82.16,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q_v2,HF OpenLLM v1,67.88,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q_v2,HFv1 ARC,61.09,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q_v2,HFv1 GSM8K,49.05,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q_v2,HFv1 HellaSwag,85.09,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q_v2,HFv1 MMLU,76.59,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q_v2,HFv1 TruthfulQA,52.65,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q_v2,HFv1 Winogrande,82.79,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q_v3,HF OpenLLM v1,61.15,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q_v3,HFv1 ARC,64.33,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q_v3,HFv1 GSM8K,6.67,,hf_open_llm_v1_240829_frozen.csv 
platyi_34b_llama_q_v3,HFv1 HellaSwag,84.88,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q_v3,HFv1 MMLU,74.98,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q_v3,HFv1 TruthfulQA,51.8,,hf_open_llm_v1_240829_frozen.csv platyi_34b_llama_q_v3,HFv1 Winogrande,84.21,,hf_open_llm_v1_240829_frozen.csv platyi_34b_lora,HF OpenLLM v1,68.1,,hf_open_llm_v1_240829_frozen.csv platyi_34b_lora,HFv1 ARC,67.15,,hf_open_llm_v1_240829_frozen.csv platyi_34b_lora,HFv1 GSM8K,40.64,,hf_open_llm_v1_240829_frozen.csv platyi_34b_lora,HFv1 HellaSwag,85.37,,hf_open_llm_v1_240829_frozen.csv platyi_34b_lora,HFv1 MMLU,78.46,,hf_open_llm_v1_240829_frozen.csv platyi_34b_lora,HFv1 TruthfulQA,53.32,,hf_open_llm_v1_240829_frozen.csv platyi_34b_lora,HFv1 Winogrande,83.66,,hf_open_llm_v1_240829_frozen.csv platyi_34b_q,HF OpenLLM v1,69.86,,hf_open_llm_v1_240829_frozen.csv platyi_34b_q,HFv1 ARC,66.89,,hf_open_llm_v1_240829_frozen.csv platyi_34b_q,HFv1 GSM8K,53.98,,hf_open_llm_v1_240829_frozen.csv platyi_34b_q,HFv1 HellaSwag,85.14,,hf_open_llm_v1_240829_frozen.csv platyi_34b_q,HFv1 MMLU,77.66,,hf_open_llm_v1_240829_frozen.csv platyi_34b_q,HFv1 TruthfulQA,53.03,,hf_open_llm_v1_240829_frozen.csv platyi_34b_q,HFv1 Winogrande,82.48,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_ia3,HF OpenLLM v1,54.23,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_ia3,HFv1 ARC,61.09,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_ia3,HFv1 GSM8K,11.3,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_ia3,HFv1 HellaSwag,82.65,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_ia3,HFv1 MMLU,56.32,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_ia3,HFv1 TruthfulQA,38.35,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_ia3,HFv1 Winogrande,75.69,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_lora,HF OpenLLM v1,54.48,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_lora,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_lora,HFv1 GSM8K,7.51,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_lora,HFv1 HellaSwag,82.5,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_lora,HFv1 MMLU,56.34,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_lora,HFv1 TruthfulQA,43.91,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_lora,HFv1 Winogrande,75.93,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_qlora,HF OpenLLM v1,53.74,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_qlora,HFv1 ARC,57.51,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_qlora,HFv1 GSM8K,5.0,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_qlora,HFv1 HellaSwag,82.55,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_qlora,HFv1 MMLU,57.34,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_qlora,HFv1 TruthfulQA,43.38,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_qlora,HFv1 Winogrande,76.64,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,52.27,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_qlora_0_80_epoch,HFv1 ARC,57.76,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,2.96,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,81.63,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,55.63,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,39.7,,hf_open_llm_v1_240829_frozen.csv platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,75.93,,hf_open_llm_v1_240829_frozen.csv platypus2_22b_relora,HF OpenLLM v1,53.64,,hf_open_llm_v1_240829_frozen.csv platypus2_22b_relora,HFv1 ARC,57.51,,hf_open_llm_v1_240829_frozen.csv platypus2_22b_relora,HFv1 
GSM8K,6.29,,hf_open_llm_v1_240829_frozen.csv platypus2_22b_relora,HFv1 HellaSwag,82.36,,hf_open_llm_v1_240829_frozen.csv platypus2_22b_relora,HFv1 MMLU,54.94,,hf_open_llm_v1_240829_frozen.csv platypus2_22b_relora,HFv1 TruthfulQA,43.62,,hf_open_llm_v1_240829_frozen.csv platypus2_22b_relora,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv platypus2_7b,HF OpenLLM v1,49.97,,hf_open_llm_v1_240829_frozen.csv platypus2_7b,HFv1 ARC,55.2,,hf_open_llm_v1_240829_frozen.csv platypus2_7b,HFv1 GSM8K,1.82,,hf_open_llm_v1_240829_frozen.csv platypus2_7b,HFv1 HellaSwag,78.84,,hf_open_llm_v1_240829_frozen.csv platypus2_7b,HFv1 MMLU,49.83,,hf_open_llm_v1_240829_frozen.csv platypus2_7b,HFv1 TruthfulQA,40.64,,hf_open_llm_v1_240829_frozen.csv platypus2_7b,HFv1 Winogrande,73.48,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3,HF OpenLLM v1,56.65,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3,HFv1 ARC,62.12,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3,HFv1 GSM8K,11.83,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3,HFv1 HellaSwag,82.1,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3,HFv1 MMLU,58.84,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3,HFv1 TruthfulQA,47.88,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3_v2_1,HF OpenLLM v1,56.29,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3_v2_1,HFv1 ARC,62.29,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3_v2_1,HFv1 GSM8K,10.99,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3_v2_1,HFv1 HellaSwag,82.09,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3_v2_1,HFv1 MMLU,57.91,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3_v2_1,HFv1 TruthfulQA,47.03,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3_v2_1,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3_v3,HF OpenLLM v1,56.74,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3_v3,HFv1 ARC,62.54,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3_v3,HFv1 GSM8K,12.36,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3_v3,HFv1 HellaSwag,82.1,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3_v3,HFv1 MMLU,58.67,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3_v3,HFv1 TruthfulQA,46.96,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3_v3,HFv1 Winogrande,77.82,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3_v4,HF OpenLLM v1,56.49,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3_v4,HFv1 ARC,61.43,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3_v4,HFv1 GSM8K,10.84,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3_v4,HFv1 HellaSwag,81.84,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3_v4,HFv1 MMLU,59.02,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3_v4,HFv1 TruthfulQA,48.64,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_ia3_v4,HFv1 Winogrande,77.19,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_lora,HF OpenLLM v1,55.15,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_lora,HFv1 ARC,60.75,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_lora,HFv1 GSM8K,7.13,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_lora,HFv1 HellaSwag,82.09,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_lora,HFv1 MMLU,58.77,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_lora,HFv1 
TruthfulQA,45.15,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_lora,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_lora_v2,HF OpenLLM v1,51.61,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_lora_v2,HFv1 ARC,58.62,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_lora_v2,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_lora_v2,HFv1 HellaSwag,81.17,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_lora_v2,HFv1 MMLU,50.23,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_lora_v2,HFv1 TruthfulQA,43.43,,hf_open_llm_v1_240829_frozen.csv platypus2xopenorca_13b_lora_v2,HFv1 Winogrande,76.16,,hf_open_llm_v1_240829_frozen.csv platypus_1_8b,HF OpenLLM v1,35.24,,hf_open_llm_v1_240829_frozen.csv platypus_1_8b,HFv1 ARC,33.28,,hf_open_llm_v1_240829_frozen.csv platypus_1_8b,HFv1 GSM8K,0.45,,hf_open_llm_v1_240829_frozen.csv platypus_1_8b,HFv1 HellaSwag,50.76,,hf_open_llm_v1_240829_frozen.csv platypus_1_8b,HFv1 MMLU,33.25,,hf_open_llm_v1_240829_frozen.csv platypus_1_8b,HFv1 TruthfulQA,40.73,,hf_open_llm_v1_240829_frozen.csv platypus_1_8b,HFv1 Winogrande,52.96,,hf_open_llm_v1_240829_frozen.csv platypus_2_22b_relora,HF OpenLLM v1,53.83,,hf_open_llm_v1_240829_frozen.csv platypus_2_22b_relora,HFv1 ARC,57.68,,hf_open_llm_v1_240829_frozen.csv platypus_2_22b_relora,HFv1 GSM8K,6.6,,hf_open_llm_v1_240829_frozen.csv platypus_2_22b_relora,HFv1 HellaSwag,82.44,,hf_open_llm_v1_240829_frozen.csv platypus_2_22b_relora,HFv1 MMLU,55.33,,hf_open_llm_v1_240829_frozen.csv platypus_2_22b_relora,HFv1 TruthfulQA,43.61,,hf_open_llm_v1_240829_frozen.csv platypus_2_22b_relora,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv platypus_30b,HF OpenLLM v1,59.03,,hf_open_llm_v1_240829_frozen.csv platypus_30b,HFv1 ARC,64.59,,hf_open_llm_v1_240829_frozen.csv platypus_30b,HFv1 GSM8K,14.4,,hf_open_llm_v1_240829_frozen.csv platypus_30b,HFv1 HellaSwag,84.26,,hf_open_llm_v1_240829_frozen.csv platypus_30b,HFv1 MMLU,64.23,,hf_open_llm_v1_240829_frozen.csv platypus_30b,HFv1 TruthfulQA,45.35,,hf_open_llm_v1_240829_frozen.csv platypus_30b,HFv1 Winogrande,81.37,,hf_open_llm_v1_240829_frozen.csv platypus_nebula_v2_7b,HF OpenLLM v1,53.95,,hf_open_llm_v1_240829_frozen.csv platypus_nebula_v2_7b,HFv1 ARC,55.38,,hf_open_llm_v1_240829_frozen.csv platypus_nebula_v2_7b,HFv1 GSM8K,10.08,,hf_open_llm_v1_240829_frozen.csv platypus_nebula_v2_7b,HFv1 HellaSwag,83.02,,hf_open_llm_v1_240829_frozen.csv platypus_nebula_v2_7b,HFv1 MMLU,56.07,,hf_open_llm_v1_240829_frozen.csv platypus_nebula_v2_7b,HFv1 TruthfulQA,46.94,,hf_open_llm_v1_240829_frozen.csv platypus_nebula_v2_7b,HFv1 Winogrande,72.22,,hf_open_llm_v1_240829_frozen.csv platypus_yi_34b,HF OpenLLM v1,71.69,,hf_open_llm_v1_240829_frozen.csv platypus_yi_34b,HFv1 ARC,68.43,,hf_open_llm_v1_240829_frozen.csv platypus_yi_34b,HFv1 GSM8K,59.82,,hf_open_llm_v1_240829_frozen.csv platypus_yi_34b,HFv1 HellaSwag,85.21,,hf_open_llm_v1_240829_frozen.csv platypus_yi_34b,HFv1 MMLU,78.13,,hf_open_llm_v1_240829_frozen.csv platypus_yi_34b,HFv1 TruthfulQA,54.48,,hf_open_llm_v1_240829_frozen.csv platypus_yi_34b,HFv1 Winogrande,84.06,,hf_open_llm_v1_240829_frozen.csv pmmpk_einstainmorcoro14krishnahercules_7b_slerp,HF OpenLLM v1,72.67,,hf_open_llm_v1_240829_frozen.csv pmmpk_einstainmorcoro14krishnahercules_7b_slerp,HFv1 ARC,69.28,,hf_open_llm_v1_240829_frozen.csv pmmpk_einstainmorcoro14krishnahercules_7b_slerp,HFv1 GSM8K,71.42,,hf_open_llm_v1_240829_frozen.csv pmmpk_einstainmorcoro14krishnahercules_7b_slerp,HFv1 
HellaSwag,86.59,,hf_open_llm_v1_240829_frozen.csv pmmpk_einstainmorcoro14krishnahercules_7b_slerp,HFv1 MMLU,65.13,,hf_open_llm_v1_240829_frozen.csv pmmpk_einstainmorcoro14krishnahercules_7b_slerp,HFv1 TruthfulQA,62.69,,hf_open_llm_v1_240829_frozen.csv pmmpk_einstainmorcoro14krishnahercules_7b_slerp,HFv1 Winogrande,80.9,,hf_open_llm_v1_240829_frozen.csv polar_14b_v0_2,HF OpenLLM v1,66.39,,hf_open_llm_v1_240829_frozen.csv polar_14b_v0_2,HFv1 ARC,77.13,,hf_open_llm_v1_240829_frozen.csv polar_14b_v0_2,HFv1 GSM8K,3.56,,hf_open_llm_v1_240829_frozen.csv polar_14b_v0_2,HFv1 HellaSwag,90.72,,hf_open_llm_v1_240829_frozen.csv polar_14b_v0_2,HFv1 MMLU,63.76,,hf_open_llm_v1_240829_frozen.csv polar_14b_v0_2,HFv1 TruthfulQA,80.81,,hf_open_llm_v1_240829_frozen.csv polar_14b_v0_2,HFv1 Winogrande,82.32,,hf_open_llm_v1_240829_frozen.csv polyglot_ko_12_8b,HF OpenLLM v1,33.33,,hf_open_llm_v1_240829_frozen.csv polyglot_ko_12_8b,HFv1 ARC,27.05,,hf_open_llm_v1_240829_frozen.csv polyglot_ko_12_8b,HFv1 GSM8K,0.15,,hf_open_llm_v1_240829_frozen.csv polyglot_ko_12_8b,HFv1 HellaSwag,51.68,,hf_open_llm_v1_240829_frozen.csv polyglot_ko_12_8b,HFv1 MMLU,26.64,,hf_open_llm_v1_240829_frozen.csv polyglot_ko_12_8b,HFv1 TruthfulQA,34.69,,hf_open_llm_v1_240829_frozen.csv polyglot_ko_12_8b,HFv1 Winogrande,59.75,,hf_open_llm_v1_240829_frozen.csv polyglot_math_4x7b,HF OpenLLM v1,66.84,,hf_open_llm_v1_240829_frozen.csv polyglot_math_4x7b,HFv1 ARC,63.74,,hf_open_llm_v1_240829_frozen.csv polyglot_math_4x7b,HFv1 GSM8K,56.63,,hf_open_llm_v1_240829_frozen.csv polyglot_math_4x7b,HFv1 HellaSwag,84.85,,hf_open_llm_v1_240829_frozen.csv polyglot_math_4x7b,HFv1 MMLU,63.57,,hf_open_llm_v1_240829_frozen.csv polyglot_math_4x7b,HFv1 TruthfulQA,53.78,,hf_open_llm_v1_240829_frozen.csv polyglot_math_4x7b,HFv1 Winogrande,78.45,,hf_open_llm_v1_240829_frozen.csv poro_34b_gptq,HF OpenLLM v1,44.67,,hf_open_llm_v1_240829_frozen.csv poro_34b_gptq,HFv1 ARC,47.01,,hf_open_llm_v1_240829_frozen.csv poro_34b_gptq,HFv1 GSM8K,5.08,,hf_open_llm_v1_240829_frozen.csv poro_34b_gptq,HFv1 HellaSwag,73.75,,hf_open_llm_v1_240829_frozen.csv poro_34b_gptq,HFv1 MMLU,32.47,,hf_open_llm_v1_240829_frozen.csv poro_34b_gptq,HFv1 TruthfulQA,38.37,,hf_open_llm_v1_240829_frozen.csv poro_34b_gptq,HFv1 Winogrande,71.35,,hf_open_llm_v1_240829_frozen.csv power_llama3_13b_instruct,HF OpenLLM v1,65.21,,hf_open_llm_v1_240829_frozen.csv power_llama3_13b_instruct,HFv1 ARC,67.83,,hf_open_llm_v1_240829_frozen.csv power_llama3_13b_instruct,HFv1 GSM8K,37.68,,hf_open_llm_v1_240829_frozen.csv power_llama3_13b_instruct,HFv1 HellaSwag,85.13,,hf_open_llm_v1_240829_frozen.csv power_llama3_13b_instruct,HFv1 MMLU,64.44,,hf_open_llm_v1_240829_frozen.csv power_llama3_13b_instruct,HFv1 TruthfulQA,60.57,,hf_open_llm_v1_240829_frozen.csv power_llama3_13b_instruct,HFv1 Winogrande,75.61,,hf_open_llm_v1_240829_frozen.csv prodigy_7b,HF OpenLLM v1,73.68,,hf_open_llm_v1_240829_frozen.csv prodigy_7b,HFv1 ARC,71.59,,hf_open_llm_v1_240829_frozen.csv prodigy_7b,HFv1 GSM8K,64.37,,hf_open_llm_v1_240829_frozen.csv prodigy_7b,HFv1 HellaSwag,88.09,,hf_open_llm_v1_240829_frozen.csv prodigy_7b,HFv1 MMLU,64.92,,hf_open_llm_v1_240829_frozen.csv prodigy_7b,HFv1 TruthfulQA,68.57,,hf_open_llm_v1_240829_frozen.csv prodigy_7b,HFv1 Winogrande,84.53,,hf_open_llm_v1_240829_frozen.csv puddlejumper_13b_v2,HF OpenLLM v1,54.19,,hf_open_llm_v1_240829_frozen.csv puddlejumper_13b_v2,HFv1 ARC,57.0,,hf_open_llm_v1_240829_frozen.csv puddlejumper_13b_v2,HFv1 GSM8K,3.64,,hf_open_llm_v1_240829_frozen.csv puddlejumper_13b_v2,HFv1 
HellaSwag,81.06,,hf_open_llm_v1_240829_frozen.csv puddlejumper_13b_v2,HFv1 MMLU,58.3,,hf_open_llm_v1_240829_frozen.csv puddlejumper_13b_v2,HFv1 TruthfulQA,52.66,,hf_open_llm_v1_240829_frozen.csv puddlejumper_13b_v2,HFv1 Winogrande,72.45,,hf_open_llm_v1_240829_frozen.csv puddlejumper_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,52.41,,hf_open_llm_v1_240829_frozen.csv puddlejumper_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,54.52,,hf_open_llm_v1_240829_frozen.csv puddlejumper_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv puddlejumper_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,79.36,,hf_open_llm_v1_240829_frozen.csv puddlejumper_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,55.15,,hf_open_llm_v1_240829_frozen.csv puddlejumper_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,54.32,,hf_open_llm_v1_240829_frozen.csv puddlejumper_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,71.11,,hf_open_llm_v1_240829_frozen.csv puli_gptrio,HF OpenLLM v1,34.42,,hf_open_llm_v1_240829_frozen.csv puli_gptrio,HFv1 ARC,30.72,,hf_open_llm_v1_240829_frozen.csv puli_gptrio,HFv1 GSM8K,0.76,,hf_open_llm_v1_240829_frozen.csv puli_gptrio,HFv1 HellaSwag,53.49,,hf_open_llm_v1_240829_frozen.csv puli_gptrio,HFv1 MMLU,24.73,,hf_open_llm_v1_240829_frozen.csv puli_gptrio,HFv1 TruthfulQA,39.03,,hf_open_llm_v1_240829_frozen.csv puli_gptrio,HFv1 Winogrande,57.77,,hf_open_llm_v1_240829_frozen.csv pythia_12b,HF OpenLLM v1,38.82,,hf_open_llm_v1_240829_frozen.csv pythia_12b,HFv1 ARC,39.59,,hf_open_llm_v1_240829_frozen.csv pythia_12b,HFv1 GSM8K,1.74,,hf_open_llm_v1_240829_frozen.csv pythia_12b,HFv1 HellaSwag,68.82,,hf_open_llm_v1_240829_frozen.csv pythia_12b,HFv1 MMLU,26.76,,hf_open_llm_v1_240829_frozen.csv pythia_12b,HFv1 TruthfulQA,31.85,,hf_open_llm_v1_240829_frozen.csv pythia_12b,HFv1 Winogrande,64.17,,hf_open_llm_v1_240829_frozen.csv pythia_12b_deduped,HF OpenLLM v1,39.7,,hf_open_llm_v1_240829_frozen.csv pythia_12b_deduped,HFv1 ARC,41.38,,hf_open_llm_v1_240829_frozen.csv pythia_12b_deduped,HFv1 GSM8K,1.44,,hf_open_llm_v1_240829_frozen.csv pythia_12b_deduped,HFv1 HellaSwag,70.26,,hf_open_llm_v1_240829_frozen.csv pythia_12b_deduped,HFv1 MMLU,25.63,,hf_open_llm_v1_240829_frozen.csv pythia_12b_deduped,HFv1 TruthfulQA,33.0,,hf_open_llm_v1_240829_frozen.csv pythia_12b_deduped,HFv1 Winogrande,66.46,,hf_open_llm_v1_240829_frozen.csv pythia_160m,HF OpenLLM v1,29.02,,hf_open_llm_v1_240829_frozen.csv pythia_160m,HFv1 ARC,22.78,,hf_open_llm_v1_240829_frozen.csv pythia_160m,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv pythia_160m,HFv1 HellaSwag,30.34,,hf_open_llm_v1_240829_frozen.csv pythia_160m,HFv1 MMLU,24.95,,hf_open_llm_v1_240829_frozen.csv pythia_160m,HFv1 TruthfulQA,44.26,,hf_open_llm_v1_240829_frozen.csv pythia_160m,HFv1 Winogrande,51.54,,hf_open_llm_v1_240829_frozen.csv pythia_160m_deduped,HF OpenLLM v1,29.38,,hf_open_llm_v1_240829_frozen.csv pythia_160m_deduped,HFv1 ARC,24.06,,hf_open_llm_v1_240829_frozen.csv pythia_160m_deduped,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv pythia_160m_deduped,HFv1 HellaSwag,31.39,,hf_open_llm_v1_240829_frozen.csv pythia_160m_deduped,HFv1 MMLU,24.86,,hf_open_llm_v1_240829_frozen.csv pythia_160m_deduped,HFv1 TruthfulQA,44.34,,hf_open_llm_v1_240829_frozen.csv pythia_160m_deduped,HFv1 Winogrande,51.38,,hf_open_llm_v1_240829_frozen.csv pythia_1_3b,HF OpenLLM v1,34.46,,hf_open_llm_v1_240829_frozen.csv pythia_1_3b,HFv1 ARC,31.14,,hf_open_llm_v1_240829_frozen.csv pythia_1_3b,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv pythia_1_3b,HFv1 
HellaSwag,51.43,,hf_open_llm_v1_240829_frozen.csv pythia_1_3b,HFv1 MMLU,26.55,,hf_open_llm_v1_240829_frozen.csv pythia_1_3b,HFv1 TruthfulQA,39.24,,hf_open_llm_v1_240829_frozen.csv pythia_1_3b,HFv1 Winogrande,57.38,,hf_open_llm_v1_240829_frozen.csv pythia_1_4b,HF OpenLLM v1,34.75,,hf_open_llm_v1_240829_frozen.csv pythia_1_4b,HFv1 ARC,31.48,,hf_open_llm_v1_240829_frozen.csv pythia_1_4b,HFv1 GSM8K,1.52,,hf_open_llm_v1_240829_frozen.csv pythia_1_4b,HFv1 HellaSwag,52.86,,hf_open_llm_v1_240829_frozen.csv pythia_1_4b,HFv1 MMLU,25.8,,hf_open_llm_v1_240829_frozen.csv pythia_1_4b,HFv1 TruthfulQA,38.85,,hf_open_llm_v1_240829_frozen.csv pythia_1_4b,HFv1 Winogrande,58.01,,hf_open_llm_v1_240829_frozen.csv pythia_1_4b_deduped,HF OpenLLM v1,35.0,,hf_open_llm_v1_240829_frozen.csv pythia_1_4b_deduped,HFv1 ARC,32.68,,hf_open_llm_v1_240829_frozen.csv pythia_1_4b_deduped,HFv1 GSM8K,0.83,,hf_open_llm_v1_240829_frozen.csv pythia_1_4b_deduped,HFv1 HellaSwag,54.96,,hf_open_llm_v1_240829_frozen.csv pythia_1_4b_deduped,HFv1 MMLU,25.56,,hf_open_llm_v1_240829_frozen.csv pythia_1_4b_deduped,HFv1 TruthfulQA,38.66,,hf_open_llm_v1_240829_frozen.csv pythia_1_4b_deduped,HFv1 Winogrande,57.3,,hf_open_llm_v1_240829_frozen.csv pythia_1b_deduped,HF OpenLLM v1,32.78,,hf_open_llm_v1_240829_frozen.csv pythia_1b_deduped,HFv1 ARC,29.1,,hf_open_llm_v1_240829_frozen.csv pythia_1b_deduped,HFv1 GSM8K,1.14,,hf_open_llm_v1_240829_frozen.csv pythia_1b_deduped,HFv1 HellaSwag,49.65,,hf_open_llm_v1_240829_frozen.csv pythia_1b_deduped,HFv1 MMLU,24.27,,hf_open_llm_v1_240829_frozen.csv pythia_1b_deduped,HFv1 TruthfulQA,38.94,,hf_open_llm_v1_240829_frozen.csv pythia_1b_deduped,HFv1 Winogrande,53.59,,hf_open_llm_v1_240829_frozen.csv pythia_1b_dpo,HF OpenLLM v1,32.76,,hf_open_llm_v1_240829_frozen.csv pythia_1b_dpo,HFv1 ARC,30.12,,hf_open_llm_v1_240829_frozen.csv pythia_1b_dpo,HFv1 GSM8K,1.67,,hf_open_llm_v1_240829_frozen.csv pythia_1b_dpo,HFv1 HellaSwag,49.24,,hf_open_llm_v1_240829_frozen.csv pythia_1b_dpo,HFv1 MMLU,24.24,,hf_open_llm_v1_240829_frozen.csv pythia_1b_dpo,HFv1 TruthfulQA,37.2,,hf_open_llm_v1_240829_frozen.csv pythia_1b_dpo,HFv1 Winogrande,54.06,,hf_open_llm_v1_240829_frozen.csv pythia_1b_dpo_full,HF OpenLLM v1,32.55,,hf_open_llm_v1_240829_frozen.csv pythia_1b_dpo_full,HFv1 ARC,29.44,,hf_open_llm_v1_240829_frozen.csv pythia_1b_dpo_full,HFv1 GSM8K,1.97,,hf_open_llm_v1_240829_frozen.csv pythia_1b_dpo_full,HFv1 HellaSwag,49.03,,hf_open_llm_v1_240829_frozen.csv pythia_1b_dpo_full,HFv1 MMLU,24.13,,hf_open_llm_v1_240829_frozen.csv pythia_1b_dpo_full,HFv1 TruthfulQA,37.27,,hf_open_llm_v1_240829_frozen.csv pythia_1b_dpo_full,HFv1 Winogrande,53.43,,hf_open_llm_v1_240829_frozen.csv pythia_1b_spin_iter1,HF OpenLLM v1,32.85,,hf_open_llm_v1_240829_frozen.csv pythia_1b_spin_iter1,HFv1 ARC,30.55,,hf_open_llm_v1_240829_frozen.csv pythia_1b_spin_iter1,HFv1 GSM8K,2.35,,hf_open_llm_v1_240829_frozen.csv pythia_1b_spin_iter1,HFv1 HellaSwag,49.26,,hf_open_llm_v1_240829_frozen.csv pythia_1b_spin_iter1,HFv1 MMLU,24.46,,hf_open_llm_v1_240829_frozen.csv pythia_1b_spin_iter1,HFv1 TruthfulQA,36.89,,hf_open_llm_v1_240829_frozen.csv pythia_1b_spin_iter1,HFv1 Winogrande,53.59,,hf_open_llm_v1_240829_frozen.csv pythia_2_7b,HF OpenLLM v1,37.09,,hf_open_llm_v1_240829_frozen.csv pythia_2_7b,HFv1 ARC,37.37,,hf_open_llm_v1_240829_frozen.csv pythia_2_7b,HFv1 GSM8K,1.06,,hf_open_llm_v1_240829_frozen.csv pythia_2_7b,HFv1 HellaSwag,60.74,,hf_open_llm_v1_240829_frozen.csv pythia_2_7b,HFv1 MMLU,25.86,,hf_open_llm_v1_240829_frozen.csv pythia_2_7b,HFv1 
TruthfulQA,35.4,,hf_open_llm_v1_240829_frozen.csv pythia_2_7b,HFv1 Winogrande,62.12,,hf_open_llm_v1_240829_frozen.csv pythia_2_8b_deduped,HF OpenLLM v1,36.72,,hf_open_llm_v1_240829_frozen.csv pythia_2_8b_deduped,HFv1 ARC,36.26,,hf_open_llm_v1_240829_frozen.csv pythia_2_8b_deduped,HFv1 GSM8K,0.83,,hf_open_llm_v1_240829_frozen.csv pythia_2_8b_deduped,HFv1 HellaSwag,60.66,,hf_open_llm_v1_240829_frozen.csv pythia_2_8b_deduped,HFv1 MMLU,26.78,,hf_open_llm_v1_240829_frozen.csv pythia_2_8b_deduped,HFv1 TruthfulQA,35.56,,hf_open_llm_v1_240829_frozen.csv pythia_2_8b_deduped,HFv1 Winogrande,60.22,,hf_open_llm_v1_240829_frozen.csv pythia_31m,HF OpenLLM v1,28.81,,hf_open_llm_v1_240829_frozen.csv pythia_31m,HFv1 ARC,21.84,,hf_open_llm_v1_240829_frozen.csv pythia_31m,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv pythia_31m,HFv1 HellaSwag,27.0,,hf_open_llm_v1_240829_frozen.csv pythia_31m,HFv1 MMLU,24.97,,hf_open_llm_v1_240829_frozen.csv pythia_31m,HFv1 TruthfulQA,50.12,,hf_open_llm_v1_240829_frozen.csv pythia_31m,HFv1 Winogrande,49.72,,hf_open_llm_v1_240829_frozen.csv pythia_31m_chat_v1,HF OpenLLM v1,28.59,,hf_open_llm_v1_240829_frozen.csv pythia_31m_chat_v1,HFv1 ARC,21.84,,hf_open_llm_v1_240829_frozen.csv pythia_31m_chat_v1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv pythia_31m_chat_v1,HFv1 HellaSwag,26.81,,hf_open_llm_v1_240829_frozen.csv pythia_31m_chat_v1,HFv1 MMLU,24.55,,hf_open_llm_v1_240829_frozen.csv pythia_31m_chat_v1,HFv1 TruthfulQA,48.04,,hf_open_llm_v1_240829_frozen.csv pythia_31m_chat_v1,HFv1 Winogrande,50.28,,hf_open_llm_v1_240829_frozen.csv pythia_31m_goodwiki_deduped_2048_scratch,HF OpenLLM v1,28.85,,hf_open_llm_v1_240829_frozen.csv pythia_31m_goodwiki_deduped_2048_scratch,HFv1 ARC,23.12,,hf_open_llm_v1_240829_frozen.csv pythia_31m_goodwiki_deduped_2048_scratch,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv pythia_31m_goodwiki_deduped_2048_scratch,HFv1 HellaSwag,25.66,,hf_open_llm_v1_240829_frozen.csv pythia_31m_goodwiki_deduped_2048_scratch,HFv1 MMLU,23.11,,hf_open_llm_v1_240829_frozen.csv pythia_31m_goodwiki_deduped_2048_scratch,HFv1 TruthfulQA,51.32,,hf_open_llm_v1_240829_frozen.csv pythia_31m_goodwiki_deduped_2048_scratch,HFv1 Winogrande,49.88,,hf_open_llm_v1_240829_frozen.csv pythia_31m_ki_v1_2048_scratch,HF OpenLLM v1,29.15,,hf_open_llm_v1_240829_frozen.csv pythia_31m_ki_v1_2048_scratch,HFv1 ARC,23.12,,hf_open_llm_v1_240829_frozen.csv pythia_31m_ki_v1_2048_scratch,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv pythia_31m_ki_v1_2048_scratch,HFv1 HellaSwag,25.23,,hf_open_llm_v1_240829_frozen.csv pythia_31m_ki_v1_2048_scratch,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv pythia_31m_ki_v1_2048_scratch,HFv1 TruthfulQA,51.67,,hf_open_llm_v1_240829_frozen.csv pythia_31m_ki_v1_2048_scratch,HFv1 Winogrande,51.78,,hf_open_llm_v1_240829_frozen.csv pythia_31m_simplepile_lite_2048_scratch_2e,HF OpenLLM v1,28.6,,hf_open_llm_v1_240829_frozen.csv pythia_31m_simplepile_lite_2048_scratch_2e,HFv1 ARC,21.59,,hf_open_llm_v1_240829_frozen.csv pythia_31m_simplepile_lite_2048_scratch_2e,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv pythia_31m_simplepile_lite_2048_scratch_2e,HFv1 HellaSwag,25.79,,hf_open_llm_v1_240829_frozen.csv pythia_31m_simplepile_lite_2048_scratch_2e,HFv1 MMLU,24.99,,hf_open_llm_v1_240829_frozen.csv pythia_31m_simplepile_lite_2048_scratch_2e,HFv1 TruthfulQA,50.62,,hf_open_llm_v1_240829_frozen.csv pythia_31m_simplepile_lite_2048_scratch_2e,HFv1 Winogrande,48.62,,hf_open_llm_v1_240829_frozen.csv pythia_31m_simplewiki_2048,HF OpenLLM 
v1,28.27,,hf_open_llm_v1_240829_frozen.csv pythia_31m_simplewiki_2048,HFv1 ARC,22.18,,hf_open_llm_v1_240829_frozen.csv pythia_31m_simplewiki_2048,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv pythia_31m_simplewiki_2048,HFv1 HellaSwag,25.55,,hf_open_llm_v1_240829_frozen.csv pythia_31m_simplewiki_2048,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv pythia_31m_simplewiki_2048,HFv1 TruthfulQA,49.37,,hf_open_llm_v1_240829_frozen.csv pythia_31m_simplewiki_2048,HFv1 Winogrande,49.41,,hf_open_llm_v1_240829_frozen.csv pythia_31m_simplewiki_scratch_bf16,HF OpenLLM v1,28.61,,hf_open_llm_v1_240829_frozen.csv pythia_31m_simplewiki_scratch_bf16,HFv1 ARC,22.78,,hf_open_llm_v1_240829_frozen.csv pythia_31m_simplewiki_scratch_bf16,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv pythia_31m_simplewiki_scratch_bf16,HFv1 HellaSwag,25.61,,hf_open_llm_v1_240829_frozen.csv pythia_31m_simplewiki_scratch_bf16,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv pythia_31m_simplewiki_scratch_bf16,HFv1 TruthfulQA,49.65,,hf_open_llm_v1_240829_frozen.csv pythia_31m_simplewiki_scratch_bf16,HFv1 Winogrande,50.51,,hf_open_llm_v1_240829_frozen.csv pythia_410m_deduped,HF OpenLLM v1,31.29,,hf_open_llm_v1_240829_frozen.csv pythia_410m_deduped,HFv1 ARC,24.83,,hf_open_llm_v1_240829_frozen.csv pythia_410m_deduped,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv pythia_410m_deduped,HFv1 HellaSwag,41.29,,hf_open_llm_v1_240829_frozen.csv pythia_410m_deduped,HFv1 MMLU,25.99,,hf_open_llm_v1_240829_frozen.csv pythia_410m_deduped,HFv1 TruthfulQA,40.95,,hf_open_llm_v1_240829_frozen.csv pythia_410m_deduped,HFv1 Winogrande,54.38,,hf_open_llm_v1_240829_frozen.csv pythia_6_7b,HF OpenLLM v1,38.06,,hf_open_llm_v1_240829_frozen.csv pythia_6_7b,HFv1 ARC,40.1,,hf_open_llm_v1_240829_frozen.csv pythia_6_7b,HFv1 GSM8K,1.06,,hf_open_llm_v1_240829_frozen.csv pythia_6_7b,HFv1 HellaSwag,65.0,,hf_open_llm_v1_240829_frozen.csv pythia_6_7b,HFv1 MMLU,24.64,,hf_open_llm_v1_240829_frozen.csv pythia_6_7b,HFv1 TruthfulQA,32.85,,hf_open_llm_v1_240829_frozen.csv pythia_6_7b,HFv1 Winogrande,64.72,,hf_open_llm_v1_240829_frozen.csv pythia_6_9b_deduped,HF OpenLLM v1,39.3,,hf_open_llm_v1_240829_frozen.csv pythia_6_9b_deduped,HFv1 ARC,41.3,,hf_open_llm_v1_240829_frozen.csv pythia_6_9b_deduped,HFv1 GSM8K,1.67,,hf_open_llm_v1_240829_frozen.csv pythia_6_9b_deduped,HFv1 HellaSwag,67.05,,hf_open_llm_v1_240829_frozen.csv pythia_6_9b_deduped,HFv1 MMLU,26.48,,hf_open_llm_v1_240829_frozen.csv pythia_6_9b_deduped,HFv1 TruthfulQA,35.19,,hf_open_llm_v1_240829_frozen.csv pythia_6_9b_deduped,HFv1 Winogrande,64.09,,hf_open_llm_v1_240829_frozen.csv pythia_70m,HF OpenLLM v1,28.93,,hf_open_llm_v1_240829_frozen.csv pythia_70m,HFv1 ARC,21.59,,hf_open_llm_v1_240829_frozen.csv pythia_70m,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv pythia_70m,HFv1 HellaSwag,27.29,,hf_open_llm_v1_240829_frozen.csv pythia_70m,HFv1 MMLU,25.9,,hf_open_llm_v1_240829_frozen.csv pythia_70m,HFv1 TruthfulQA,47.06,,hf_open_llm_v1_240829_frozen.csv pythia_70m,HFv1 Winogrande,51.46,,hf_open_llm_v1_240829_frozen.csv pythia_70m_deduped,HF OpenLLM v1,28.44,,hf_open_llm_v1_240829_frozen.csv pythia_70m_deduped,HFv1 ARC,21.08,,hf_open_llm_v1_240829_frozen.csv pythia_70m_deduped,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv pythia_70m_deduped,HFv1 HellaSwag,27.17,,hf_open_llm_v1_240829_frozen.csv pythia_70m_deduped,HFv1 MMLU,25.26,,hf_open_llm_v1_240829_frozen.csv pythia_70m_deduped,HFv1 TruthfulQA,47.51,,hf_open_llm_v1_240829_frozen.csv pythia_70m_deduped,HFv1 Winogrande,49.64,,hf_open_llm_v1_240829_frozen.csv 
quan_1_8b_base,HF OpenLLM v1,43.35,,hf_open_llm_v1_240829_frozen.csv quan_1_8b_base,HFv1 ARC,36.95,,hf_open_llm_v1_240829_frozen.csv quan_1_8b_base,HFv1 GSM8K,19.71,,hf_open_llm_v1_240829_frozen.csv quan_1_8b_base,HFv1 HellaSwag,58.46,,hf_open_llm_v1_240829_frozen.csv quan_1_8b_base,HFv1 MMLU,45.44,,hf_open_llm_v1_240829_frozen.csv quan_1_8b_base,HFv1 TruthfulQA,41.6,,hf_open_llm_v1_240829_frozen.csv quan_1_8b_base,HFv1 Winogrande,57.93,,hf_open_llm_v1_240829_frozen.csv quan_1_8b_chat,HF OpenLLM v1,45.91,,hf_open_llm_v1_240829_frozen.csv quan_1_8b_chat,HFv1 ARC,39.08,,hf_open_llm_v1_240829_frozen.csv quan_1_8b_chat,HFv1 GSM8K,27.52,,hf_open_llm_v1_240829_frozen.csv quan_1_8b_chat,HFv1 HellaSwag,62.37,,hf_open_llm_v1_240829_frozen.csv quan_1_8b_chat,HFv1 MMLU,44.09,,hf_open_llm_v1_240829_frozen.csv quan_1_8b_chat,HFv1 TruthfulQA,43.15,,hf_open_llm_v1_240829_frozen.csv quan_1_8b_chat,HFv1 Winogrande,59.27,,hf_open_llm_v1_240829_frozen.csv quark_464m_v0_2,HF OpenLLM v1,35.12,,hf_open_llm_v1_240829_frozen.csv quark_464m_v0_2,HFv1 ARC,30.46,,hf_open_llm_v1_240829_frozen.csv quark_464m_v0_2,HFv1 GSM8K,4.47,,hf_open_llm_v1_240829_frozen.csv quark_464m_v0_2,HFv1 HellaSwag,44.96,,hf_open_llm_v1_240829_frozen.csv quark_464m_v0_2,HFv1 MMLU,31.29,,hf_open_llm_v1_240829_frozen.csv quark_464m_v0_2,HFv1 TruthfulQA,43.89,,hf_open_llm_v1_240829_frozen.csv quark_464m_v0_2,HFv1 Winogrande,55.64,,hf_open_llm_v1_240829_frozen.csv quyen_mini_v0_1,HF OpenLLM v1,46.14,,hf_open_llm_v1_240829_frozen.csv quyen_mini_v0_1,HFv1 ARC,39.33,,hf_open_llm_v1_240829_frozen.csv quyen_mini_v0_1,HFv1 GSM8K,27.45,,hf_open_llm_v1_240829_frozen.csv quyen_mini_v0_1,HFv1 HellaSwag,60.57,,hf_open_llm_v1_240829_frozen.csv quyen_mini_v0_1,HFv1 MMLU,43.93,,hf_open_llm_v1_240829_frozen.csv quyen_mini_v0_1,HFv1 TruthfulQA,46.44,,hf_open_llm_v1_240829_frozen.csv quyen_mini_v0_1,HFv1 Winogrande,59.12,,hf_open_llm_v1_240829_frozen.csv quyen_plus_v0_1,HF OpenLLM v1,63.27,,hf_open_llm_v1_240829_frozen.csv quyen_plus_v0_1,HFv1 ARC,55.72,,hf_open_llm_v1_240829_frozen.csv quyen_plus_v0_1,HFv1 GSM8K,60.05,,hf_open_llm_v1_240829_frozen.csv quyen_plus_v0_1,HFv1 HellaSwag,78.52,,hf_open_llm_v1_240829_frozen.csv quyen_plus_v0_1,HFv1 MMLU,60.45,,hf_open_llm_v1_240829_frozen.csv quyen_plus_v0_1,HFv1 TruthfulQA,53.6,,hf_open_llm_v1_240829_frozen.csv quyen_plus_v0_1,HFv1 Winogrande,71.27,,hf_open_llm_v1_240829_frozen.csv quyen_pro_v0_1,HF OpenLLM v1,68.6,,hf_open_llm_v1_240829_frozen.csv quyen_pro_v0_1,HFv1 ARC,59.3,,hf_open_llm_v1_240829_frozen.csv quyen_pro_v0_1,HFv1 GSM8K,71.04,,hf_open_llm_v1_240829_frozen.csv quyen_pro_v0_1,HFv1 HellaSwag,81.07,,hf_open_llm_v1_240829_frozen.csv quyen_pro_v0_1,HFv1 MMLU,68.44,,hf_open_llm_v1_240829_frozen.csv quyen_pro_v0_1,HFv1 TruthfulQA,55.85,,hf_open_llm_v1_240829_frozen.csv quyen_pro_v0_1,HFv1 Winogrande,75.93,,hf_open_llm_v1_240829_frozen.csv quyen_v0_1,HF OpenLLM v1,56.02,,hf_open_llm_v1_240829_frozen.csv quyen_v0_1,HFv1 ARC,48.21,,hf_open_llm_v1_240829_frozen.csv quyen_v0_1,HFv1 GSM8K,45.87,,hf_open_llm_v1_240829_frozen.csv quyen_v0_1,HFv1 HellaSwag,72.49,,hf_open_llm_v1_240829_frozen.csv quyen_v0_1,HFv1 MMLU,52.88,,hf_open_llm_v1_240829_frozen.csv quyen_v0_1,HFv1 TruthfulQA,51.53,,hf_open_llm_v1_240829_frozen.csv quyen_v0_1,HFv1 Winogrande,65.11,,hf_open_llm_v1_240829_frozen.csv qwen1_5_0_5b,HF OpenLLM v1,38.62,,hf_open_llm_v1_240829_frozen.csv qwen1_5_0_5b,HFv1 ARC,31.48,,hf_open_llm_v1_240829_frozen.csv qwen1_5_0_5b,HFv1 GSM8K,16.3,,hf_open_llm_v1_240829_frozen.csv qwen1_5_0_5b,HFv1 
HellaSwag,49.05,,hf_open_llm_v1_240829_frozen.csv qwen1_5_0_5b,HFv1 MMLU,39.35,,hf_open_llm_v1_240829_frozen.csv qwen1_5_0_5b,HFv1 TruthfulQA,38.3,,hf_open_llm_v1_240829_frozen.csv qwen1_5_0_5b,HFv1 Winogrande,57.22,,hf_open_llm_v1_240829_frozen.csv qwen1_5_0_5b_chat,HF OpenLLM v1,35.61,,hf_open_llm_v1_240829_frozen.csv qwen1_5_0_5b_chat,HFv1 ARC,30.55,,hf_open_llm_v1_240829_frozen.csv qwen1_5_0_5b_chat,HFv1 GSM8K,7.66,,hf_open_llm_v1_240829_frozen.csv qwen1_5_0_5b_chat,HFv1 HellaSwag,44.07,,hf_open_llm_v1_240829_frozen.csv qwen1_5_0_5b_chat,HFv1 MMLU,33.82,,hf_open_llm_v1_240829_frozen.csv qwen1_5_0_5b_chat,HFv1 TruthfulQA,42.95,,hf_open_llm_v1_240829_frozen.csv qwen1_5_0_5b_chat,HFv1 Winogrande,54.62,,hf_open_llm_v1_240829_frozen.csv qwen1_5_110b,HF OpenLLM v1,75.42,,hf_open_llm_v1_240829_frozen.csv qwen1_5_110b,HFv1 ARC,69.97,,hf_open_llm_v1_240829_frozen.csv qwen1_5_110b,HFv1 GSM8K,81.05,,hf_open_llm_v1_240829_frozen.csv qwen1_5_110b,HFv1 HellaSwag,87.48,,hf_open_llm_v1_240829_frozen.csv qwen1_5_110b,HFv1 MMLU,80.2,,hf_open_llm_v1_240829_frozen.csv qwen1_5_110b,HFv1 TruthfulQA,49.66,,hf_open_llm_v1_240829_frozen.csv qwen1_5_110b,HFv1 Winogrande,84.14,,hf_open_llm_v1_240829_frozen.csv qwen1_5_110b_chat,HF OpenLLM v1,68.01,,hf_open_llm_v1_240829_frozen.csv qwen1_5_110b_chat,HFv1 ARC,72.01,,hf_open_llm_v1_240829_frozen.csv qwen1_5_110b_chat,HFv1 GSM8K,30.1,,hf_open_llm_v1_240829_frozen.csv qwen1_5_110b_chat,HFv1 HellaSwag,84.67,,hf_open_llm_v1_240829_frozen.csv qwen1_5_110b_chat,HFv1 MMLU,78.04,,hf_open_llm_v1_240829_frozen.csv qwen1_5_110b_chat,HFv1 TruthfulQA,65.86,,hf_open_llm_v1_240829_frozen.csv qwen1_5_110b_chat,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv qwen1_5_14b,HF OpenLLM v1,66.7,,hf_open_llm_v1_240829_frozen.csv qwen1_5_14b,HFv1 ARC,56.57,,hf_open_llm_v1_240829_frozen.csv qwen1_5_14b,HFv1 GSM8K,67.63,,hf_open_llm_v1_240829_frozen.csv qwen1_5_14b,HFv1 HellaSwag,81.08,,hf_open_llm_v1_240829_frozen.csv qwen1_5_14b,HFv1 MMLU,69.36,,hf_open_llm_v1_240829_frozen.csv qwen1_5_14b,HFv1 TruthfulQA,52.06,,hf_open_llm_v1_240829_frozen.csv qwen1_5_14b,HFv1 Winogrande,73.48,,hf_open_llm_v1_240829_frozen.csv qwen1_5_14b_chat,HF OpenLLM v1,62.27,,hf_open_llm_v1_240829_frozen.csv qwen1_5_14b_chat,HFv1 ARC,58.7,,hf_open_llm_v1_240829_frozen.csv qwen1_5_14b_chat,HFv1 GSM8K,30.63,,hf_open_llm_v1_240829_frozen.csv qwen1_5_14b_chat,HFv1 HellaSwag,82.27,,hf_open_llm_v1_240829_frozen.csv qwen1_5_14b_chat,HFv1 MMLU,68.57,,hf_open_llm_v1_240829_frozen.csv qwen1_5_14b_chat,HFv1 TruthfulQA,60.36,,hf_open_llm_v1_240829_frozen.csv qwen1_5_14b_chat,HFv1 Winogrande,73.09,,hf_open_llm_v1_240829_frozen.csv qwen1_5_1_8b,HF OpenLLM v1,46.55,,hf_open_llm_v1_240829_frozen.csv qwen1_5_1_8b,HFv1 ARC,37.88,,hf_open_llm_v1_240829_frozen.csv qwen1_5_1_8b,HFv1 GSM8K,33.59,,hf_open_llm_v1_240829_frozen.csv qwen1_5_1_8b,HFv1 HellaSwag,61.42,,hf_open_llm_v1_240829_frozen.csv qwen1_5_1_8b,HFv1 MMLU,46.71,,hf_open_llm_v1_240829_frozen.csv qwen1_5_1_8b,HFv1 TruthfulQA,39.43,,hf_open_llm_v1_240829_frozen.csv qwen1_5_1_8b,HFv1 Winogrande,60.3,,hf_open_llm_v1_240829_frozen.csv qwen1_5_1_8b_chat,HF OpenLLM v1,43.99,,hf_open_llm_v1_240829_frozen.csv qwen1_5_1_8b_chat,HFv1 ARC,38.74,,hf_open_llm_v1_240829_frozen.csv qwen1_5_1_8b_chat,HFv1 GSM8K,19.03,,hf_open_llm_v1_240829_frozen.csv qwen1_5_1_8b_chat,HFv1 HellaSwag,60.02,,hf_open_llm_v1_240829_frozen.csv qwen1_5_1_8b_chat,HFv1 MMLU,45.87,,hf_open_llm_v1_240829_frozen.csv qwen1_5_1_8b_chat,HFv1 TruthfulQA,40.62,,hf_open_llm_v1_240829_frozen.csv qwen1_5_1_8b_chat,HFv1 
Winogrande,59.67,,hf_open_llm_v1_240829_frozen.csv qwen1_5_32b,HF OpenLLM v1,70.47,,hf_open_llm_v1_240829_frozen.csv qwen1_5_32b,HFv1 ARC,63.57,,hf_open_llm_v1_240829_frozen.csv qwen1_5_32b,HFv1 GSM8K,61.11,,hf_open_llm_v1_240829_frozen.csv qwen1_5_32b,HFv1 HellaSwag,85.0,,hf_open_llm_v1_240829_frozen.csv qwen1_5_32b,HFv1 MMLU,74.31,,hf_open_llm_v1_240829_frozen.csv qwen1_5_32b,HFv1 TruthfulQA,57.43,,hf_open_llm_v1_240829_frozen.csv qwen1_5_32b,HFv1 Winogrande,81.45,,hf_open_llm_v1_240829_frozen.csv qwen1_5_32b_chat,HF OpenLLM v1,62.95,,hf_open_llm_v1_240829_frozen.csv qwen1_5_32b_chat,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv qwen1_5_32b_chat,HFv1 GSM8K,7.05,,hf_open_llm_v1_240829_frozen.csv qwen1_5_32b_chat,HFv1 HellaSwag,85.49,,hf_open_llm_v1_240829_frozen.csv qwen1_5_32b_chat,HFv1 MMLU,74.99,,hf_open_llm_v1_240829_frozen.csv qwen1_5_32b_chat,HFv1 TruthfulQA,66.95,,hf_open_llm_v1_240829_frozen.csv qwen1_5_32b_chat,HFv1 Winogrande,77.19,,hf_open_llm_v1_240829_frozen.csv qwen1_5_4b,HF OpenLLM v1,57.05,,hf_open_llm_v1_240829_frozen.csv qwen1_5_4b,HFv1 ARC,48.46,,hf_open_llm_v1_240829_frozen.csv qwen1_5_4b,HFv1 GSM8K,52.24,,hf_open_llm_v1_240829_frozen.csv qwen1_5_4b,HFv1 HellaSwag,71.58,,hf_open_llm_v1_240829_frozen.csv qwen1_5_4b,HFv1 MMLU,56.52,,hf_open_llm_v1_240829_frozen.csv qwen1_5_4b,HFv1 TruthfulQA,47.27,,hf_open_llm_v1_240829_frozen.csv qwen1_5_4b,HFv1 Winogrande,66.22,,hf_open_llm_v1_240829_frozen.csv qwen1_5_4b_chat,HF OpenLLM v1,46.79,,hf_open_llm_v1_240829_frozen.csv qwen1_5_4b_chat,HFv1 ARC,43.26,,hf_open_llm_v1_240829_frozen.csv qwen1_5_4b_chat,HFv1 GSM8K,2.43,,hf_open_llm_v1_240829_frozen.csv qwen1_5_4b_chat,HFv1 HellaSwag,69.73,,hf_open_llm_v1_240829_frozen.csv qwen1_5_4b_chat,HFv1 MMLU,55.55,,hf_open_llm_v1_240829_frozen.csv qwen1_5_4b_chat,HFv1 TruthfulQA,44.79,,hf_open_llm_v1_240829_frozen.csv qwen1_5_4b_chat,HFv1 Winogrande,64.96,,hf_open_llm_v1_240829_frozen.csv qwen1_5_72b,HF OpenLLM v1,72.91,,hf_open_llm_v1_240829_frozen.csv qwen1_5_72b,HFv1 ARC,65.87,,hf_open_llm_v1_240829_frozen.csv qwen1_5_72b,HFv1 GSM8K,65.73,,hf_open_llm_v1_240829_frozen.csv qwen1_5_72b,HFv1 HellaSwag,85.99,,hf_open_llm_v1_240829_frozen.csv qwen1_5_72b,HFv1 MMLU,77.2,,hf_open_llm_v1_240829_frozen.csv qwen1_5_72b,HFv1 TruthfulQA,59.61,,hf_open_llm_v1_240829_frozen.csv qwen1_5_72b,HFv1 Winogrande,83.03,,hf_open_llm_v1_240829_frozen.csv qwen1_5_72b_chat,HF OpenLLM v1,65.98,,hf_open_llm_v1_240829_frozen.csv qwen1_5_72b_chat,HFv1 ARC,68.52,,hf_open_llm_v1_240829_frozen.csv qwen1_5_72b_chat,HFv1 GSM8K,20.92,,hf_open_llm_v1_240829_frozen.csv qwen1_5_72b_chat,HFv1 HellaSwag,86.47,,hf_open_llm_v1_240829_frozen.csv qwen1_5_72b_chat,HFv1 MMLU,77.46,,hf_open_llm_v1_240829_frozen.csv qwen1_5_72b_chat,HFv1 TruthfulQA,63.9,,hf_open_llm_v1_240829_frozen.csv qwen1_5_72b_chat,HFv1 Winogrande,79.08,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_chat,HF OpenLLM v1,55.15,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_chat,HFv1 ARC,55.89,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_chat,HFv1 GSM8K,13.57,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_chat,HFv1 HellaSwag,78.56,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_chat,HFv1 MMLU,61.7,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_chat,HFv1 TruthfulQA,53.65,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_chat,HFv1 Winogrande,67.8,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_chat_llamafy,HF OpenLLM v1,56.0,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_chat_llamafy,HFv1 ARC,57.59,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_chat_llamafy,HFv1 
GSM8K,14.63,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_chat_llamafy,HFv1 HellaSwag,78.52,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_chat_llamafy,HFv1 MMLU,61.18,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_chat_llamafy,HFv1 TruthfulQA,57.59,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_chat_llamafy,HFv1 Winogrande,66.46,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat,HF OpenLLM v1,53.66,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat,HFv1 ARC,53.92,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat,HFv1 GSM8K,15.47,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat,HFv1 HellaSwag,76.03,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat,HFv1 MMLU,62.38,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat,HFv1 TruthfulQA,45.34,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat,HFv1 Winogrande,68.82,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat_dpo,HF OpenLLM v1,53.94,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat_dpo,HFv1 ARC,50.77,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat_dpo,HFv1 GSM8K,27.45,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat_dpo,HFv1 HellaSwag,74.24,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat_dpo,HFv1 MMLU,60.7,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat_dpo,HFv1 TruthfulQA,42.37,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat_dpo,HFv1 Winogrande,68.11,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat_sft,HF OpenLLM v1,54.44,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat_sft,HFv1 ARC,50.68,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat_sft,HFv1 GSM8K,29.34,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat_sft,HFv1 HellaSwag,73.49,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat_sft,HFv1 MMLU,60.47,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat_sft,HFv1 TruthfulQA,43.89,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat_sft,HFv1 Winogrande,68.75,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat_sft_bf16,HF OpenLLM v1,54.91,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat_sft_bf16,HFv1 ARC,54.27,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat_sft_bf16,HFv1 GSM8K,21.76,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat_sft_bf16,HFv1 HellaSwag,75.53,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat_sft_bf16,HFv1 MMLU,61.98,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat_sft_bf16,HFv1 TruthfulQA,47.26,,hf_open_llm_v1_240829_frozen.csv qwen1_5_7b_dutch_chat_sft_bf16,HFv1 Winogrande,68.67,,hf_open_llm_v1_240829_frozen.csv qwen1_5_moe_a2_7b,HF OpenLLM v1,56.03,,hf_open_llm_v1_240829_frozen.csv qwen1_5_moe_a2_7b,HFv1 ARC,54.86,,hf_open_llm_v1_240829_frozen.csv qwen1_5_moe_a2_7b,HFv1 GSM8K,16.98,,hf_open_llm_v1_240829_frozen.csv qwen1_5_moe_a2_7b,HFv1 HellaSwag,79.39,,hf_open_llm_v1_240829_frozen.csv qwen1_5_moe_a2_7b,HFv1 MMLU,62.54,,hf_open_llm_v1_240829_frozen.csv qwen1_5_moe_a2_7b,HFv1 TruthfulQA,50.09,,hf_open_llm_v1_240829_frozen.csv qwen1_5_moe_a2_7b,HFv1 Winogrande,72.3,,hf_open_llm_v1_240829_frozen.csv qwen1_5_moe_a2_7b_chat,HF OpenLLM v1,57.22,,hf_open_llm_v1_240829_frozen.csv qwen1_5_moe_a2_7b_chat,HFv1 ARC,53.67,,hf_open_llm_v1_240829_frozen.csv qwen1_5_moe_a2_7b_chat,HFv1 GSM8K,28.2,,hf_open_llm_v1_240829_frozen.csv qwen1_5_moe_a2_7b_chat,HFv1 HellaSwag,80.54,,hf_open_llm_v1_240829_frozen.csv qwen1_5_moe_a2_7b_chat,HFv1 MMLU,60.97,,hf_open_llm_v1_240829_frozen.csv qwen1_5_moe_a2_7b_chat,HFv1 TruthfulQA,50.56,,hf_open_llm_v1_240829_frozen.csv qwen1_5_moe_a2_7b_chat,HFv1 
Winogrande,69.38,,hf_open_llm_v1_240829_frozen.csv qwen2_beta_14b,HF OpenLLM v1,66.7,,hf_open_llm_v1_240829_frozen.csv qwen2_beta_14b,HFv1 ARC,56.57,,hf_open_llm_v1_240829_frozen.csv qwen2_beta_14b,HFv1 GSM8K,67.63,,hf_open_llm_v1_240829_frozen.csv qwen2_beta_14b,HFv1 HellaSwag,81.08,,hf_open_llm_v1_240829_frozen.csv qwen2_beta_14b,HFv1 MMLU,69.36,,hf_open_llm_v1_240829_frozen.csv qwen2_beta_14b,HFv1 TruthfulQA,52.06,,hf_open_llm_v1_240829_frozen.csv qwen2_beta_14b,HFv1 Winogrande,73.48,,hf_open_llm_v1_240829_frozen.csv qwen2_beta_72b,HF OpenLLM v1,72.91,,hf_open_llm_v1_240829_frozen.csv qwen2_beta_72b,HFv1 ARC,65.87,,hf_open_llm_v1_240829_frozen.csv qwen2_beta_72b,HFv1 GSM8K,65.73,,hf_open_llm_v1_240829_frozen.csv qwen2_beta_72b,HFv1 HellaSwag,85.99,,hf_open_llm_v1_240829_frozen.csv qwen2_beta_72b,HFv1 MMLU,77.2,,hf_open_llm_v1_240829_frozen.csv qwen2_beta_72b,HFv1 TruthfulQA,59.61,,hf_open_llm_v1_240829_frozen.csv qwen2_beta_72b,HFv1 Winogrande,83.03,,hf_open_llm_v1_240829_frozen.csv qwen_14b,HF OpenLLM v1,65.86,,hf_open_llm_v1_240829_frozen.csv qwen_14b,HFv1 ARC,58.28,,hf_open_llm_v1_240829_frozen.csv qwen_14b,HFv1 GSM8K,58.98,,hf_open_llm_v1_240829_frozen.csv qwen_14b,HFv1 HellaSwag,83.99,,hf_open_llm_v1_240829_frozen.csv qwen_14b,HFv1 MMLU,67.7,,hf_open_llm_v1_240829_frozen.csv qwen_14b,HFv1 TruthfulQA,49.43,,hf_open_llm_v1_240829_frozen.csv qwen_14b,HFv1 Winogrande,76.8,,hf_open_llm_v1_240829_frozen.csv qwen_14b_llamafied,HF OpenLLM v1,63.09,,hf_open_llm_v1_240829_frozen.csv qwen_14b_llamafied,HFv1 ARC,55.2,,hf_open_llm_v1_240829_frozen.csv qwen_14b_llamafied,HFv1 GSM8K,52.77,,hf_open_llm_v1_240829_frozen.csv qwen_14b_llamafied,HFv1 HellaSwag,82.31,,hf_open_llm_v1_240829_frozen.csv qwen_14b_llamafied,HFv1 MMLU,66.11,,hf_open_llm_v1_240829_frozen.csv qwen_14b_llamafied,HFv1 TruthfulQA,45.6,,hf_open_llm_v1_240829_frozen.csv qwen_14b_llamafied,HFv1 Winogrande,76.56,,hf_open_llm_v1_240829_frozen.csv qwen_1_8b_chat_llama,HF OpenLLM v1,42.94,,hf_open_llm_v1_240829_frozen.csv qwen_1_8b_chat_llama,HFv1 ARC,36.95,,hf_open_llm_v1_240829_frozen.csv qwen_1_8b_chat_llama,HFv1 GSM8K,19.26,,hf_open_llm_v1_240829_frozen.csv qwen_1_8b_chat_llama,HFv1 HellaSwag,54.34,,hf_open_llm_v1_240829_frozen.csv qwen_1_8b_chat_llama,HFv1 MMLU,44.55,,hf_open_llm_v1_240829_frozen.csv qwen_1_8b_chat_llama,HFv1 TruthfulQA,43.7,,hf_open_llm_v1_240829_frozen.csv qwen_1_8b_chat_llama,HFv1 Winogrande,58.88,,hf_open_llm_v1_240829_frozen.csv qwen_1_8b_everythinglm,HF OpenLLM v1,42.77,,hf_open_llm_v1_240829_frozen.csv qwen_1_8b_everythinglm,HFv1 ARC,38.65,,hf_open_llm_v1_240829_frozen.csv qwen_1_8b_everythinglm,HFv1 GSM8K,12.74,,hf_open_llm_v1_240829_frozen.csv qwen_1_8b_everythinglm,HFv1 HellaSwag,62.66,,hf_open_llm_v1_240829_frozen.csv qwen_1_8b_everythinglm,HFv1 MMLU,44.94,,hf_open_llm_v1_240829_frozen.csv qwen_1_8b_everythinglm,HFv1 TruthfulQA,38.7,,hf_open_llm_v1_240829_frozen.csv qwen_1_8b_everythinglm,HFv1 Winogrande,58.96,,hf_open_llm_v1_240829_frozen.csv qwen_1_8b_llamafied,HF OpenLLM v1,44.75,,hf_open_llm_v1_240829_frozen.csv qwen_1_8b_llamafied,HFv1 ARC,37.71,,hf_open_llm_v1_240829_frozen.csv qwen_1_8b_llamafied,HFv1 GSM8K,24.41,,hf_open_llm_v1_240829_frozen.csv qwen_1_8b_llamafied,HFv1 HellaSwag,58.87,,hf_open_llm_v1_240829_frozen.csv qwen_1_8b_llamafied,HFv1 MMLU,46.37,,hf_open_llm_v1_240829_frozen.csv qwen_1_8b_llamafied,HFv1 TruthfulQA,39.41,,hf_open_llm_v1_240829_frozen.csv qwen_1_8b_llamafied,HFv1 Winogrande,61.72,,hf_open_llm_v1_240829_frozen.csv qwen_72b,HF OpenLLM 
v1,73.6,,hf_open_llm_v1_240829_frozen.csv qwen_72b,HFv1 ARC,65.19,,hf_open_llm_v1_240829_frozen.csv qwen_72b,HFv1 GSM8K,70.43,,hf_open_llm_v1_240829_frozen.csv qwen_72b,HFv1 HellaSwag,85.94,,hf_open_llm_v1_240829_frozen.csv qwen_72b,HFv1 MMLU,77.37,,hf_open_llm_v1_240829_frozen.csv qwen_72b,HFv1 TruthfulQA,60.19,,hf_open_llm_v1_240829_frozen.csv qwen_72b,HFv1 Winogrande,82.48,,hf_open_llm_v1_240829_frozen.csv qwen_72b_llama,HF OpenLLM v1,69.53,,hf_open_llm_v1_240829_frozen.csv qwen_72b_llama,HFv1 ARC,64.85,,hf_open_llm_v1_240829_frozen.csv qwen_72b_llama,HFv1 GSM8K,56.25,,hf_open_llm_v1_240829_frozen.csv qwen_72b_llama,HFv1 HellaSwag,83.27,,hf_open_llm_v1_240829_frozen.csv qwen_72b_llama,HFv1 MMLU,73.66,,hf_open_llm_v1_240829_frozen.csv qwen_72b_llama,HFv1 TruthfulQA,57.6,,hf_open_llm_v1_240829_frozen.csv qwen_72b_llama,HFv1 Winogrande,81.53,,hf_open_llm_v1_240829_frozen.csv qwen_7b,HF OpenLLM v1,59.19,,hf_open_llm_v1_240829_frozen.csv qwen_7b,HFv1 ARC,51.37,,hf_open_llm_v1_240829_frozen.csv qwen_7b,HFv1 GSM8K,44.96,,hf_open_llm_v1_240829_frozen.csv qwen_7b,HFv1 HellaSwag,78.47,,hf_open_llm_v1_240829_frozen.csv qwen_7b,HFv1 MMLU,59.84,,hf_open_llm_v1_240829_frozen.csv qwen_7b,HFv1 TruthfulQA,47.79,,hf_open_llm_v1_240829_frozen.csv qwen_7b,HFv1 Winogrande,72.69,,hf_open_llm_v1_240829_frozen.csv qwen_orpo_v1,HF OpenLLM v1,36.28,,hf_open_llm_v1_240829_frozen.csv qwen_orpo_v1,HFv1 ARC,31.14,,hf_open_llm_v1_240829_frozen.csv qwen_orpo_v1,HFv1 GSM8K,8.57,,hf_open_llm_v1_240829_frozen.csv qwen_orpo_v1,HFv1 HellaSwag,44.58,,hf_open_llm_v1_240829_frozen.csv qwen_orpo_v1,HFv1 MMLU,33.73,,hf_open_llm_v1_240829_frozen.csv qwen_orpo_v1,HFv1 TruthfulQA,42.59,,hf_open_llm_v1_240829_frozen.csv qwen_orpo_v1,HFv1 Winogrande,57.06,,hf_open_llm_v1_240829_frozen.csv rabbit_7b_dpo_chat,HF OpenLLM v1,69.69,,hf_open_llm_v1_240829_frozen.csv rabbit_7b_dpo_chat,HFv1 ARC,70.31,,hf_open_llm_v1_240829_frozen.csv rabbit_7b_dpo_chat,HFv1 GSM8K,58.53,,hf_open_llm_v1_240829_frozen.csv rabbit_7b_dpo_chat,HFv1 HellaSwag,87.43,,hf_open_llm_v1_240829_frozen.csv rabbit_7b_dpo_chat,HFv1 MMLU,60.5,,hf_open_llm_v1_240829_frozen.csv rabbit_7b_dpo_chat,HFv1 TruthfulQA,62.18,,hf_open_llm_v1_240829_frozen.csv rabbit_7b_dpo_chat,HFv1 Winogrande,79.16,,hf_open_llm_v1_240829_frozen.csv rabbit_7b_v2_dpo_chat,HF OpenLLM v1,69.36,,hf_open_llm_v1_240829_frozen.csv rabbit_7b_v2_dpo_chat,HFv1 ARC,66.13,,hf_open_llm_v1_240829_frozen.csv rabbit_7b_v2_dpo_chat,HFv1 GSM8K,55.65,,hf_open_llm_v1_240829_frozen.csv rabbit_7b_v2_dpo_chat,HFv1 HellaSwag,85.18,,hf_open_llm_v1_240829_frozen.csv rabbit_7b_v2_dpo_chat,HFv1 MMLU,62.92,,hf_open_llm_v1_240829_frozen.csv rabbit_7b_v2_dpo_chat,HFv1 TruthfulQA,67.06,,hf_open_llm_v1_240829_frozen.csv rabbit_7b_v2_dpo_chat,HFv1 Winogrande,79.24,,hf_open_llm_v1_240829_frozen.csv raccoon_small,HF OpenLLM v1,74.78,,hf_open_llm_v1_240829_frozen.csv raccoon_small,HFv1 ARC,74.4,,hf_open_llm_v1_240829_frozen.csv raccoon_small,HFv1 GSM8K,56.86,,hf_open_llm_v1_240829_frozen.csv raccoon_small,HFv1 HellaSwag,88.73,,hf_open_llm_v1_240829_frozen.csv raccoon_small,HFv1 MMLU,64.55,,hf_open_llm_v1_240829_frozen.csv raccoon_small,HFv1 TruthfulQA,76.74,,hf_open_llm_v1_240829_frozen.csv raccoon_small,HFv1 Winogrande,87.37,,hf_open_llm_v1_240829_frozen.csv radiantloom_mixtral_8x7b_fusion,HF OpenLLM v1,65.24,,hf_open_llm_v1_240829_frozen.csv radiantloom_mixtral_8x7b_fusion,HFv1 ARC,63.48,,hf_open_llm_v1_240829_frozen.csv radiantloom_mixtral_8x7b_fusion,HFv1 GSM8K,53.45,,hf_open_llm_v1_240829_frozen.csv 
radiantloom_mixtral_8x7b_fusion,HFv1 HellaSwag,83.65,,hf_open_llm_v1_240829_frozen.csv radiantloom_mixtral_8x7b_fusion,HFv1 MMLU,60.03,,hf_open_llm_v1_240829_frozen.csv radiantloom_mixtral_8x7b_fusion,HFv1 TruthfulQA,54.76,,hf_open_llm_v1_240829_frozen.csv radiantloom_mixtral_8x7b_fusion,HFv1 Winogrande,76.09,,hf_open_llm_v1_240829_frozen.csv radintloom_mistral_7b_fusion,HF OpenLLM v1,55.86,,hf_open_llm_v1_240829_frozen.csv radintloom_mistral_7b_fusion,HFv1 ARC,62.03,,hf_open_llm_v1_240829_frozen.csv radintloom_mistral_7b_fusion,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv radintloom_mistral_7b_fusion,HFv1 HellaSwag,82.26,,hf_open_llm_v1_240829_frozen.csv radintloom_mistral_7b_fusion,HFv1 MMLU,63.82,,hf_open_llm_v1_240829_frozen.csv radintloom_mistral_7b_fusion,HFv1 TruthfulQA,47.19,,hf_open_llm_v1_240829_frozen.csv radintloom_mistral_7b_fusion,HFv1 Winogrande,79.87,,hf_open_llm_v1_240829_frozen.csv rain_7b_v0_2,HF OpenLLM v1,59.01,,hf_open_llm_v1_240829_frozen.csv rain_7b_v0_2,HFv1 ARC,51.54,,hf_open_llm_v1_240829_frozen.csv rain_7b_v0_2,HFv1 GSM8K,48.75,,hf_open_llm_v1_240829_frozen.csv rain_7b_v0_2,HFv1 HellaSwag,75.11,,hf_open_llm_v1_240829_frozen.csv rain_7b_v0_2,HFv1 MMLU,61.51,,hf_open_llm_v1_240829_frozen.csv rain_7b_v0_2,HFv1 TruthfulQA,46.44,,hf_open_llm_v1_240829_frozen.csv rain_7b_v0_2,HFv1 Winogrande,70.72,,hf_open_llm_v1_240829_frozen.csv rainbowfish_7b_v10,HF OpenLLM v1,61.88,,hf_open_llm_v1_240829_frozen.csv rainbowfish_7b_v10,HFv1 ARC,61.18,,hf_open_llm_v1_240829_frozen.csv rainbowfish_7b_v10,HFv1 GSM8K,37.0,,hf_open_llm_v1_240829_frozen.csv rainbowfish_7b_v10,HFv1 HellaSwag,82.33,,hf_open_llm_v1_240829_frozen.csv rainbowfish_7b_v10,HFv1 MMLU,63.26,,hf_open_llm_v1_240829_frozen.csv rainbowfish_7b_v10,HFv1 TruthfulQA,49.45,,hf_open_llm_v1_240829_frozen.csv rainbowfish_7b_v10,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv rainbowfish_7b_v9,HF OpenLLM v1,61.42,,hf_open_llm_v1_240829_frozen.csv rainbowfish_7b_v9,HFv1 ARC,61.77,,hf_open_llm_v1_240829_frozen.csv rainbowfish_7b_v9,HFv1 GSM8K,34.8,,hf_open_llm_v1_240829_frozen.csv rainbowfish_7b_v9,HFv1 HellaSwag,82.43,,hf_open_llm_v1_240829_frozen.csv rainbowfish_7b_v9,HFv1 MMLU,63.0,,hf_open_llm_v1_240829_frozen.csv rainbowfish_7b_v9,HFv1 TruthfulQA,48.82,,hf_open_llm_v1_240829_frozen.csv rainbowfish_7b_v9,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv rainbowfish_v6,HF OpenLLM v1,61.64,,hf_open_llm_v1_240829_frozen.csv rainbowfish_v6,HFv1 ARC,61.95,,hf_open_llm_v1_240829_frozen.csv rainbowfish_v6,HFv1 GSM8K,36.32,,hf_open_llm_v1_240829_frozen.csv rainbowfish_v6,HFv1 HellaSwag,82.51,,hf_open_llm_v1_240829_frozen.csv rainbowfish_v6,HFv1 MMLU,62.79,,hf_open_llm_v1_240829_frozen.csv rainbowfish_v6,HFv1 TruthfulQA,48.37,,hf_open_llm_v1_240829_frozen.csv rainbowfish_v6,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv rainbowfish_v7,HF OpenLLM v1,62.18,,hf_open_llm_v1_240829_frozen.csv rainbowfish_v7,HFv1 ARC,61.95,,hf_open_llm_v1_240829_frozen.csv rainbowfish_v7,HFv1 GSM8K,37.45,,hf_open_llm_v1_240829_frozen.csv rainbowfish_v7,HFv1 HellaSwag,82.52,,hf_open_llm_v1_240829_frozen.csv rainbowfish_v7,HFv1 MMLU,63.26,,hf_open_llm_v1_240829_frozen.csv rainbowfish_v7,HFv1 TruthfulQA,49.78,,hf_open_llm_v1_240829_frozen.csv rainbowfish_v7,HFv1 Winogrande,78.14,,hf_open_llm_v1_240829_frozen.csv rasgulla1_7b,HF OpenLLM v1,73.0,,hf_open_llm_v1_240829_frozen.csv rasgulla1_7b,HFv1 ARC,69.71,,hf_open_llm_v1_240829_frozen.csv rasgulla1_7b,HFv1 GSM8K,71.72,,hf_open_llm_v1_240829_frozen.csv rasgulla1_7b,HFv1 
HellaSwag,87.4,,hf_open_llm_v1_240829_frozen.csv rasgulla1_7b,HFv1 MMLU,64.94,,hf_open_llm_v1_240829_frozen.csv rasgulla1_7b,HFv1 TruthfulQA,63.31,,hf_open_llm_v1_240829_frozen.csv rasgulla1_7b,HFv1 Winogrande,80.9,,hf_open_llm_v1_240829_frozen.csv recurrentgemma_2b,HF OpenLLM v1,45.46,,hf_open_llm_v1_240829_frozen.csv recurrentgemma_2b,HFv1 ARC,46.93,,hf_open_llm_v1_240829_frozen.csv recurrentgemma_2b,HFv1 GSM8K,16.15,,hf_open_llm_v1_240829_frozen.csv recurrentgemma_2b,HFv1 HellaSwag,72.48,,hf_open_llm_v1_240829_frozen.csv recurrentgemma_2b,HFv1 MMLU,34.61,,hf_open_llm_v1_240829_frozen.csv recurrentgemma_2b,HFv1 TruthfulQA,35.1,,hf_open_llm_v1_240829_frozen.csv recurrentgemma_2b,HFv1 Winogrande,68.51,,hf_open_llm_v1_240829_frozen.csv recurrentgemma_2b_it,HF OpenLLM v1,40.86,,hf_open_llm_v1_240829_frozen.csv recurrentgemma_2b_it,HFv1 ARC,30.97,,hf_open_llm_v1_240829_frozen.csv recurrentgemma_2b_it,HFv1 GSM8K,10.08,,hf_open_llm_v1_240829_frozen.csv recurrentgemma_2b_it,HFv1 HellaSwag,56.26,,hf_open_llm_v1_240829_frozen.csv recurrentgemma_2b_it,HFv1 MMLU,40.87,,hf_open_llm_v1_240829_frozen.csv recurrentgemma_2b_it,HFv1 TruthfulQA,42.81,,hf_open_llm_v1_240829_frozen.csv recurrentgemma_2b_it,HFv1 Winogrande,64.17,,hf_open_llm_v1_240829_frozen.csv redmond_puffin_13b_instruct_pl_lora_unload,HF OpenLLM v1,55.0,,hf_open_llm_v1_240829_frozen.csv redmond_puffin_13b_instruct_pl_lora_unload,HFv1 ARC,60.92,,hf_open_llm_v1_240829_frozen.csv redmond_puffin_13b_instruct_pl_lora_unload,HFv1 GSM8K,11.07,,hf_open_llm_v1_240829_frozen.csv redmond_puffin_13b_instruct_pl_lora_unload,HFv1 HellaSwag,82.43,,hf_open_llm_v1_240829_frozen.csv redmond_puffin_13b_instruct_pl_lora_unload,HFv1 MMLU,55.61,,hf_open_llm_v1_240829_frozen.csv redmond_puffin_13b_instruct_pl_lora_unload,HFv1 TruthfulQA,44.26,,hf_open_llm_v1_240829_frozen.csv redmond_puffin_13b_instruct_pl_lora_unload,HFv1 Winogrande,75.69,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_7b_base,HF OpenLLM v1,41.49,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_7b_base,HFv1 ARC,46.25,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_7b_base,HFv1 GSM8K,3.03,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_7b_base,HFv1 HellaSwag,71.63,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_7b_base,HFv1 MMLU,27.68,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_7b_base,HFv1 TruthfulQA,33.03,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_7b_base,HFv1 Winogrande,67.32,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_base_3b_v1,HF OpenLLM v1,38.54,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_base_3b_v1,HFv1 ARC,40.19,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_base_3b_v1,HFv1 GSM8K,1.29,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_base_3b_v1,HFv1 HellaSwag,64.77,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_base_3b_v1,HFv1 MMLU,27.03,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_base_3b_v1,HFv1 TruthfulQA,33.23,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_base_3b_v1,HFv1 Winogrande,64.72,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_base_7b_v0_1,HF OpenLLM v1,41.25,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_base_7b_v0_1,HFv1 ARC,46.25,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_base_7b_v0_1,HFv1 GSM8K,1.59,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_base_7b_v0_1,HFv1 HellaSwag,71.63,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_base_7b_v0_1,HFv1 MMLU,27.68,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_base_7b_v0_1,HFv1 TruthfulQA,33.03,,hf_open_llm_v1_240829_frozen.csv 
redpajama_incite_base_7b_v0_1,HFv1 Winogrande,67.32,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_chat_3b_v1_rl_lora_8bit_test1,HF OpenLLM v1,39.16,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_chat_3b_v1_rl_lora_8bit_test1,HFv1 ARC,41.3,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_chat_3b_v1_rl_lora_8bit_test1,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_chat_3b_v1_rl_lora_8bit_test1,HFv1 HellaSwag,66.82,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_chat_3b_v1_rl_lora_8bit_test1,HFv1 MMLU,26.1,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_chat_3b_v1_rl_lora_8bit_test1,HFv1 TruthfulQA,35.04,,hf_open_llm_v1_240829_frozen.csv redpajama_incite_chat_3b_v1_rl_lora_8bit_test1,HFv1 Winogrande,65.43,,hf_open_llm_v1_240829_frozen.csv remask_3b,HF OpenLLM v1,49.49,,hf_open_llm_v1_240829_frozen.csv remask_3b,HFv1 ARC,43.77,,hf_open_llm_v1_240829_frozen.csv remask_3b,HFv1 GSM8K,27.14,,hf_open_llm_v1_240829_frozen.csv remask_3b,HFv1 HellaSwag,75.7,,hf_open_llm_v1_240829_frozen.csv remask_3b,HFv1 MMLU,41.82,,hf_open_llm_v1_240829_frozen.csv remask_3b,HFv1 TruthfulQA,42.13,,hf_open_llm_v1_240829_frozen.csv remask_3b,HFv1 Winogrande,66.38,,hf_open_llm_v1_240829_frozen.csv rho_math_1b_v0_1,HF OpenLLM v1,34.99,,hf_open_llm_v1_240829_frozen.csv rho_math_1b_v0_1,HFv1 ARC,34.3,,hf_open_llm_v1_240829_frozen.csv rho_math_1b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv rho_math_1b_v0_1,HFv1 HellaSwag,53.34,,hf_open_llm_v1_240829_frozen.csv rho_math_1b_v0_1,HFv1 MMLU,27.05,,hf_open_llm_v1_240829_frozen.csv rho_math_1b_v0_1,HFv1 TruthfulQA,35.48,,hf_open_llm_v1_240829_frozen.csv rho_math_1b_v0_1,HFv1 Winogrande,59.75,,hf_open_llm_v1_240829_frozen.csv rizla55b,HF OpenLLM v1,60.93,,hf_open_llm_v1_240829_frozen.csv rizla55b,HFv1 ARC,60.32,,hf_open_llm_v1_240829_frozen.csv rizla55b,HFv1 GSM8K,26.84,,hf_open_llm_v1_240829_frozen.csv rizla55b,HFv1 HellaSwag,80.42,,hf_open_llm_v1_240829_frozen.csv rizla55b,HFv1 MMLU,63.54,,hf_open_llm_v1_240829_frozen.csv rizla55b,HFv1 TruthfulQA,55.59,,hf_open_llm_v1_240829_frozen.csv rizla55b,HFv1 Winogrande,78.85,,hf_open_llm_v1_240829_frozen.csv rizla_17,HF OpenLLM v1,75.67,,hf_open_llm_v1_240829_frozen.csv rizla_17,HFv1 ARC,73.63,,hf_open_llm_v1_240829_frozen.csv rizla_17,HFv1 GSM8K,61.49,,hf_open_llm_v1_240829_frozen.csv rizla_17,HFv1 HellaSwag,89.72,,hf_open_llm_v1_240829_frozen.csv rizla_17,HFv1 MMLU,64.4,,hf_open_llm_v1_240829_frozen.csv rizla_17,HFv1 TruthfulQA,76.93,,hf_open_llm_v1_240829_frozen.csv rizla_17,HFv1 Winogrande,87.85,,hf_open_llm_v1_240829_frozen.csv rocket_3b,HF OpenLLM v1,55.77,,hf_open_llm_v1_240829_frozen.csv rocket_3b,HFv1 ARC,50.6,,hf_open_llm_v1_240829_frozen.csv rocket_3b,HFv1 GSM8K,36.47,,hf_open_llm_v1_240829_frozen.csv rocket_3b,HFv1 HellaSwag,76.69,,hf_open_llm_v1_240829_frozen.csv rocket_3b,HFv1 MMLU,47.1,,hf_open_llm_v1_240829_frozen.csv rocket_3b,HFv1 TruthfulQA,55.82,,hf_open_llm_v1_240829_frozen.csv rocket_3b,HFv1 Winogrande,67.96,,hf_open_llm_v1_240829_frozen.csv rolebeagle_11b,HF OpenLLM v1,76.06,,hf_open_llm_v1_240829_frozen.csv rolebeagle_11b,HFv1 ARC,72.35,,hf_open_llm_v1_240829_frozen.csv rolebeagle_11b,HFv1 GSM8K,65.88,,hf_open_llm_v1_240829_frozen.csv rolebeagle_11b,HFv1 HellaSwag,89.77,,hf_open_llm_v1_240829_frozen.csv rolebeagle_11b,HFv1 MMLU,66.35,,hf_open_llm_v1_240829_frozen.csv rolebeagle_11b,HFv1 TruthfulQA,77.92,,hf_open_llm_v1_240829_frozen.csv rolebeagle_11b,HFv1 Winogrande,84.06,,hf_open_llm_v1_240829_frozen.csv royalmaid_7b_slerp,HF OpenLLM 
v1,72.75,,hf_open_llm_v1_240829_frozen.csv royalmaid_7b_slerp,HFv1 ARC,70.39,,hf_open_llm_v1_240829_frozen.csv royalmaid_7b_slerp,HFv1 GSM8K,67.55,,hf_open_llm_v1_240829_frozen.csv royalmaid_7b_slerp,HFv1 HellaSwag,87.25,,hf_open_llm_v1_240829_frozen.csv royalmaid_7b_slerp,HFv1 MMLU,64.72,,hf_open_llm_v1_240829_frozen.csv royalmaid_7b_slerp,HFv1 TruthfulQA,64.18,,hf_open_llm_v1_240829_frozen.csv royalmaid_7b_slerp,HFv1 Winogrande,82.4,,hf_open_llm_v1_240829_frozen.csv royalnoroichi_7b_slerp,HF OpenLLM v1,72.98,,hf_open_llm_v1_240829_frozen.csv royalnoroichi_7b_slerp,HFv1 ARC,70.48,,hf_open_llm_v1_240829_frozen.csv royalnoroichi_7b_slerp,HFv1 GSM8K,66.72,,hf_open_llm_v1_240829_frozen.csv royalnoroichi_7b_slerp,HFv1 HellaSwag,87.38,,hf_open_llm_v1_240829_frozen.csv royalnoroichi_7b_slerp,HFv1 MMLU,64.78,,hf_open_llm_v1_240829_frozen.csv royalnoroichi_7b_slerp,HFv1 TruthfulQA,66.28,,hf_open_llm_v1_240829_frozen.csv royalnoroichi_7b_slerp,HFv1 Winogrande,82.24,,hf_open_llm_v1_240829_frozen.csv rwkv_4_14b_pile,HF OpenLLM v1,39.92,,hf_open_llm_v1_240829_frozen.csv rwkv_4_14b_pile,HFv1 ARC,44.45,,hf_open_llm_v1_240829_frozen.csv rwkv_4_14b_pile,HFv1 GSM8K,0.38,,hf_open_llm_v1_240829_frozen.csv rwkv_4_14b_pile,HFv1 HellaSwag,71.07,,hf_open_llm_v1_240829_frozen.csv rwkv_4_14b_pile,HFv1 MMLU,26.12,,hf_open_llm_v1_240829_frozen.csv rwkv_4_14b_pile,HFv1 TruthfulQA,32.04,,hf_open_llm_v1_240829_frozen.csv rwkv_4_14b_pile,HFv1 Winogrande,65.43,,hf_open_llm_v1_240829_frozen.csv rwkv_4_169m_pile,HF OpenLLM v1,28.64,,hf_open_llm_v1_240829_frozen.csv rwkv_4_169m_pile,HFv1 ARC,23.63,,hf_open_llm_v1_240829_frozen.csv rwkv_4_169m_pile,HFv1 GSM8K,0.45,,hf_open_llm_v1_240829_frozen.csv rwkv_4_169m_pile,HFv1 HellaSwag,31.74,,hf_open_llm_v1_240829_frozen.csv rwkv_4_169m_pile,HFv1 MMLU,23.18,,hf_open_llm_v1_240829_frozen.csv rwkv_4_169m_pile,HFv1 TruthfulQA,41.92,,hf_open_llm_v1_240829_frozen.csv rwkv_4_169m_pile,HFv1 Winogrande,50.91,,hf_open_llm_v1_240829_frozen.csv rwkv_4_1b5_pile,HF OpenLLM v1,33.25,,hf_open_llm_v1_240829_frozen.csv rwkv_4_1b5_pile,HFv1 ARC,31.83,,hf_open_llm_v1_240829_frozen.csv rwkv_4_1b5_pile,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv rwkv_4_1b5_pile,HFv1 HellaSwag,52.25,,hf_open_llm_v1_240829_frozen.csv rwkv_4_1b5_pile,HFv1 MMLU,25.77,,hf_open_llm_v1_240829_frozen.csv rwkv_4_1b5_pile,HFv1 TruthfulQA,35.8,,hf_open_llm_v1_240829_frozen.csv rwkv_4_1b5_pile,HFv1 Winogrande,53.83,,hf_open_llm_v1_240829_frozen.csv rwkv_4_3b_pile,HF OpenLLM v1,35.25,,hf_open_llm_v1_240829_frozen.csv rwkv_4_3b_pile,HFv1 ARC,36.01,,hf_open_llm_v1_240829_frozen.csv rwkv_4_3b_pile,HFv1 GSM8K,0.68,,hf_open_llm_v1_240829_frozen.csv rwkv_4_3b_pile,HFv1 HellaSwag,59.66,,hf_open_llm_v1_240829_frozen.csv rwkv_4_3b_pile,HFv1 MMLU,24.67,,hf_open_llm_v1_240829_frozen.csv rwkv_4_3b_pile,HFv1 TruthfulQA,32.14,,hf_open_llm_v1_240829_frozen.csv rwkv_4_3b_pile,HFv1 Winogrande,58.33,,hf_open_llm_v1_240829_frozen.csv rwkv_4_430m_pile,HF OpenLLM v1,30.45,,hf_open_llm_v1_240829_frozen.csv rwkv_4_430m_pile,HFv1 ARC,26.71,,hf_open_llm_v1_240829_frozen.csv rwkv_4_430m_pile,HFv1 GSM8K,0.38,,hf_open_llm_v1_240829_frozen.csv rwkv_4_430m_pile,HFv1 HellaSwag,40.01,,hf_open_llm_v1_240829_frozen.csv rwkv_4_430m_pile,HFv1 MMLU,24.85,,hf_open_llm_v1_240829_frozen.csv rwkv_4_430m_pile,HFv1 TruthfulQA,39.58,,hf_open_llm_v1_240829_frozen.csv rwkv_4_430m_pile,HFv1 Winogrande,51.14,,hf_open_llm_v1_240829_frozen.csv rwkv_4_7b_pile,HF OpenLLM v1,37.95,,hf_open_llm_v1_240829_frozen.csv rwkv_4_7b_pile,HFv1 ARC,39.68,,hf_open_llm_v1_240829_frozen.csv 
rwkv_4_7b_pile,HFv1 GSM8K,0.76,,hf_open_llm_v1_240829_frozen.csv rwkv_4_7b_pile,HFv1 HellaSwag,66.31,,hf_open_llm_v1_240829_frozen.csv rwkv_4_7b_pile,HFv1 MMLU,24.96,,hf_open_llm_v1_240829_frozen.csv rwkv_4_7b_pile,HFv1 TruthfulQA,33.65,,hf_open_llm_v1_240829_frozen.csv rwkv_4_7b_pile,HFv1 Winogrande,62.35,,hf_open_llm_v1_240829_frozen.csv rwkv_raven_1b5,HF OpenLLM v1,33.56,,hf_open_llm_v1_240829_frozen.csv rwkv_raven_1b5,HFv1 ARC,31.83,,hf_open_llm_v1_240829_frozen.csv rwkv_raven_1b5,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv rwkv_raven_1b5,HFv1 HellaSwag,52.6,,hf_open_llm_v1_240829_frozen.csv rwkv_raven_1b5,HFv1 MMLU,25.96,,hf_open_llm_v1_240829_frozen.csv rwkv_raven_1b5,HFv1 TruthfulQA,37.09,,hf_open_llm_v1_240829_frozen.csv rwkv_raven_1b5,HFv1 Winogrande,53.91,,hf_open_llm_v1_240829_frozen.csv rwkv_raven_3b,HF OpenLLM v1,35.81,,hf_open_llm_v1_240829_frozen.csv rwkv_raven_3b,HFv1 ARC,36.69,,hf_open_llm_v1_240829_frozen.csv rwkv_raven_3b,HFv1 GSM8K,0.45,,hf_open_llm_v1_240829_frozen.csv rwkv_raven_3b,HFv1 HellaSwag,59.78,,hf_open_llm_v1_240829_frozen.csv rwkv_raven_3b,HFv1 MMLU,24.87,,hf_open_llm_v1_240829_frozen.csv rwkv_raven_3b,HFv1 TruthfulQA,35.6,,hf_open_llm_v1_240829_frozen.csv rwkv_raven_3b,HFv1 Winogrande,57.46,,hf_open_llm_v1_240829_frozen.csv rwkv_raven_7b,HF OpenLLM v1,38.55,,hf_open_llm_v1_240829_frozen.csv rwkv_raven_7b,HFv1 ARC,39.42,,hf_open_llm_v1_240829_frozen.csv rwkv_raven_7b,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv rwkv_raven_7b,HFv1 HellaSwag,66.48,,hf_open_llm_v1_240829_frozen.csv rwkv_raven_7b,HFv1 MMLU,23.64,,hf_open_llm_v1_240829_frozen.csv rwkv_raven_7b,HFv1 TruthfulQA,38.56,,hf_open_llm_v1_240829_frozen.csv rwkv_raven_7b,HFv1 Winogrande,62.9,,hf_open_llm_v1_240829_frozen.csv sailor_0_5b,HF OpenLLM v1,33.05,,hf_open_llm_v1_240829_frozen.csv sailor_0_5b,HFv1 ARC,29.69,,hf_open_llm_v1_240829_frozen.csv sailor_0_5b,HFv1 GSM8K,1.06,,hf_open_llm_v1_240829_frozen.csv sailor_0_5b,HFv1 HellaSwag,45.82,,hf_open_llm_v1_240829_frozen.csv sailor_0_5b,HFv1 MMLU,25.62,,hf_open_llm_v1_240829_frozen.csv sailor_0_5b,HFv1 TruthfulQA,40.76,,hf_open_llm_v1_240829_frozen.csv sailor_0_5b,HFv1 Winogrande,55.33,,hf_open_llm_v1_240829_frozen.csv sailor_0_5b_chat,HF OpenLLM v1,33.47,,hf_open_llm_v1_240829_frozen.csv sailor_0_5b_chat,HFv1 ARC,30.38,,hf_open_llm_v1_240829_frozen.csv sailor_0_5b_chat,HFv1 GSM8K,1.82,,hf_open_llm_v1_240829_frozen.csv sailor_0_5b_chat,HFv1 HellaSwag,45.51,,hf_open_llm_v1_240829_frozen.csv sailor_0_5b_chat,HFv1 MMLU,26.73,,hf_open_llm_v1_240829_frozen.csv sailor_0_5b_chat,HFv1 TruthfulQA,39.85,,hf_open_llm_v1_240829_frozen.csv sailor_0_5b_chat,HFv1 Winogrande,56.51,,hf_open_llm_v1_240829_frozen.csv sailor_1_8b,HF OpenLLM v1,36.59,,hf_open_llm_v1_240829_frozen.csv sailor_1_8b,HFv1 ARC,33.11,,hf_open_llm_v1_240829_frozen.csv sailor_1_8b,HFv1 GSM8K,2.73,,hf_open_llm_v1_240829_frozen.csv sailor_1_8b,HFv1 HellaSwag,57.06,,hf_open_llm_v1_240829_frozen.csv sailor_1_8b,HFv1 MMLU,30.44,,hf_open_llm_v1_240829_frozen.csv sailor_1_8b,HFv1 TruthfulQA,37.81,,hf_open_llm_v1_240829_frozen.csv sailor_1_8b,HFv1 Winogrande,58.41,,hf_open_llm_v1_240829_frozen.csv sailor_1_8b_chat,HF OpenLLM v1,38.76,,hf_open_llm_v1_240829_frozen.csv sailor_1_8b_chat,HFv1 ARC,35.75,,hf_open_llm_v1_240829_frozen.csv sailor_1_8b_chat,HFv1 GSM8K,3.56,,hf_open_llm_v1_240829_frozen.csv sailor_1_8b_chat,HFv1 HellaSwag,57.12,,hf_open_llm_v1_240829_frozen.csv sailor_1_8b_chat,HFv1 MMLU,38.31,,hf_open_llm_v1_240829_frozen.csv sailor_1_8b_chat,HFv1 
TruthfulQA,38.71,,hf_open_llm_v1_240829_frozen.csv sailor_1_8b_chat,HFv1 Winogrande,59.12,,hf_open_llm_v1_240829_frozen.csv sailor_4b,HF OpenLLM v1,44.19,,hf_open_llm_v1_240829_frozen.csv sailor_4b,HFv1 ARC,44.45,,hf_open_llm_v1_240829_frozen.csv sailor_4b,HFv1 GSM8K,9.1,,hf_open_llm_v1_240829_frozen.csv sailor_4b,HFv1 HellaSwag,69.53,,hf_open_llm_v1_240829_frozen.csv sailor_4b,HFv1 MMLU,38.99,,hf_open_llm_v1_240829_frozen.csv sailor_4b,HFv1 TruthfulQA,37.02,,hf_open_llm_v1_240829_frozen.csv sailor_4b,HFv1 Winogrande,66.06,,hf_open_llm_v1_240829_frozen.csv sailor_4b_chat,HF OpenLLM v1,45.8,,hf_open_llm_v1_240829_frozen.csv sailor_4b_chat,HFv1 ARC,45.05,,hf_open_llm_v1_240829_frozen.csv sailor_4b_chat,HFv1 GSM8K,9.1,,hf_open_llm_v1_240829_frozen.csv sailor_4b_chat,HFv1 HellaSwag,68.36,,hf_open_llm_v1_240829_frozen.csv sailor_4b_chat,HFv1 MMLU,43.96,,hf_open_llm_v1_240829_frozen.csv sailor_4b_chat,HFv1 TruthfulQA,42.09,,hf_open_llm_v1_240829_frozen.csv sailor_4b_chat,HFv1 Winogrande,66.22,,hf_open_llm_v1_240829_frozen.csv sailor_7b,HF OpenLLM v1,53.82,,hf_open_llm_v1_240829_frozen.csv sailor_7b,HFv1 ARC,49.83,,hf_open_llm_v1_240829_frozen.csv sailor_7b,HFv1 GSM8K,32.52,,hf_open_llm_v1_240829_frozen.csv sailor_7b,HFv1 HellaSwag,76.21,,hf_open_llm_v1_240829_frozen.csv sailor_7b,HFv1 MMLU,54.84,,hf_open_llm_v1_240829_frozen.csv sailor_7b,HFv1 TruthfulQA,40.12,,hf_open_llm_v1_240829_frozen.csv sailor_7b,HFv1 Winogrande,69.38,,hf_open_llm_v1_240829_frozen.csv sailor_7b_chat,HF OpenLLM v1,54.81,,hf_open_llm_v1_240829_frozen.csv sailor_7b_chat,HFv1 ARC,52.3,,hf_open_llm_v1_240829_frozen.csv sailor_7b_chat,HFv1 GSM8K,30.4,,hf_open_llm_v1_240829_frozen.csv sailor_7b_chat,HFv1 HellaSwag,75.01,,hf_open_llm_v1_240829_frozen.csv sailor_7b_chat,HFv1 MMLU,56.24,,hf_open_llm_v1_240829_frozen.csv sailor_7b_chat,HFv1 TruthfulQA,44.09,,hf_open_llm_v1_240829_frozen.csv sailor_7b_chat,HFv1 Winogrande,70.8,,hf_open_llm_v1_240829_frozen.csv sakura_solar_instruct_dpo_v2,HF OpenLLM v1,74.14,,hf_open_llm_v1_240829_frozen.csv sakura_solar_instruct_dpo_v2,HFv1 ARC,70.9,,hf_open_llm_v1_240829_frozen.csv sakura_solar_instruct_dpo_v2,HFv1 GSM8K,63.76,,hf_open_llm_v1_240829_frozen.csv sakura_solar_instruct_dpo_v2,HFv1 HellaSwag,88.41,,hf_open_llm_v1_240829_frozen.csv sakura_solar_instruct_dpo_v2,HFv1 MMLU,66.48,,hf_open_llm_v1_240829_frozen.csv sakura_solar_instruct_dpo_v2,HFv1 TruthfulQA,71.86,,hf_open_llm_v1_240829_frozen.csv sakura_solar_instruct_dpo_v2,HFv1 Winogrande,83.43,,hf_open_llm_v1_240829_frozen.csv sakura_solrca_instruct_dpo,HF OpenLLM v1,74.05,,hf_open_llm_v1_240829_frozen.csv sakura_solrca_instruct_dpo,HFv1 ARC,71.16,,hf_open_llm_v1_240829_frozen.csv sakura_solrca_instruct_dpo,HFv1 GSM8K,63.46,,hf_open_llm_v1_240829_frozen.csv sakura_solrca_instruct_dpo,HFv1 HellaSwag,88.49,,hf_open_llm_v1_240829_frozen.csv sakura_solrca_instruct_dpo,HFv1 MMLU,66.17,,hf_open_llm_v1_240829_frozen.csv sakura_solrca_instruct_dpo,HFv1 TruthfulQA,72.1,,hf_open_llm_v1_240829_frozen.csv sakura_solrca_instruct_dpo,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv sakura_solrca_math_instruct_dpo_v1,HF OpenLLM v1,74.13,,hf_open_llm_v1_240829_frozen.csv sakura_solrca_math_instruct_dpo_v1,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv sakura_solrca_math_instruct_dpo_v1,HFv1 GSM8K,63.84,,hf_open_llm_v1_240829_frozen.csv sakura_solrca_math_instruct_dpo_v1,HFv1 HellaSwag,88.48,,hf_open_llm_v1_240829_frozen.csv sakura_solrca_math_instruct_dpo_v1,HFv1 MMLU,66.21,,hf_open_llm_v1_240829_frozen.csv 
sakura_solrca_math_instruct_dpo_v1,HFv1 TruthfulQA,72.12,,hf_open_llm_v1_240829_frozen.csv sakura_solrca_math_instruct_dpo_v1,HFv1 Winogrande,82.87,,hf_open_llm_v1_240829_frozen.csv sakura_solrca_math_instruct_dpo_v2,HF OpenLLM v1,74.17,,hf_open_llm_v1_240829_frozen.csv sakura_solrca_math_instruct_dpo_v2,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv sakura_solrca_math_instruct_dpo_v2,HFv1 GSM8K,63.91,,hf_open_llm_v1_240829_frozen.csv sakura_solrca_math_instruct_dpo_v2,HFv1 HellaSwag,88.52,,hf_open_llm_v1_240829_frozen.csv sakura_solrca_math_instruct_dpo_v2,HFv1 MMLU,66.13,,hf_open_llm_v1_240829_frozen.csv sakura_solrca_math_instruct_dpo_v2,HFv1 TruthfulQA,72.16,,hf_open_llm_v1_240829_frozen.csv sakura_solrca_math_instruct_dpo_v2,HFv1 Winogrande,83.03,,hf_open_llm_v1_240829_frozen.csv samantha_1_11_70b,HF OpenLLM v1,67.28,,hf_open_llm_v1_240829_frozen.csv samantha_1_11_70b,HFv1 ARC,70.05,,hf_open_llm_v1_240829_frozen.csv samantha_1_11_70b,HFv1 GSM8K,29.95,,hf_open_llm_v1_240829_frozen.csv samantha_1_11_70b,HFv1 HellaSwag,87.55,,hf_open_llm_v1_240829_frozen.csv samantha_1_11_70b,HFv1 MMLU,67.82,,hf_open_llm_v1_240829_frozen.csv samantha_1_11_70b,HFv1 TruthfulQA,65.02,,hf_open_llm_v1_240829_frozen.csv samantha_1_11_70b,HFv1 Winogrande,83.27,,hf_open_llm_v1_240829_frozen.csv samantha_1_11_7b,HF OpenLLM v1,51.07,,hf_open_llm_v1_240829_frozen.csv samantha_1_11_7b,HFv1 ARC,55.03,,hf_open_llm_v1_240829_frozen.csv samantha_1_11_7b,HFv1 GSM8K,7.2,,hf_open_llm_v1_240829_frozen.csv samantha_1_11_7b,HFv1 HellaSwag,79.12,,hf_open_llm_v1_240829_frozen.csv samantha_1_11_7b,HFv1 MMLU,40.51,,hf_open_llm_v1_240829_frozen.csv samantha_1_11_7b,HFv1 TruthfulQA,50.37,,hf_open_llm_v1_240829_frozen.csv samantha_1_11_7b,HFv1 Winogrande,74.19,,hf_open_llm_v1_240829_frozen.csv samantha_1_1_70b,HF OpenLLM v1,67.43,,hf_open_llm_v1_240829_frozen.csv samantha_1_1_70b,HFv1 ARC,68.77,,hf_open_llm_v1_240829_frozen.csv samantha_1_1_70b,HFv1 GSM8K,31.61,,hf_open_llm_v1_240829_frozen.csv samantha_1_1_70b,HFv1 HellaSwag,87.46,,hf_open_llm_v1_240829_frozen.csv samantha_1_1_70b,HFv1 MMLU,68.6,,hf_open_llm_v1_240829_frozen.csv samantha_1_1_70b,HFv1 TruthfulQA,64.85,,hf_open_llm_v1_240829_frozen.csv samantha_1_1_70b,HFv1 Winogrande,83.27,,hf_open_llm_v1_240829_frozen.csv samantha_1_2_mistral_7b,HF OpenLLM v1,59.83,,hf_open_llm_v1_240829_frozen.csv samantha_1_2_mistral_7b,HFv1 ARC,64.08,,hf_open_llm_v1_240829_frozen.csv samantha_1_2_mistral_7b,HFv1 GSM8K,16.98,,hf_open_llm_v1_240829_frozen.csv samantha_1_2_mistral_7b,HFv1 HellaSwag,85.08,,hf_open_llm_v1_240829_frozen.csv samantha_1_2_mistral_7b,HFv1 MMLU,63.91,,hf_open_llm_v1_240829_frozen.csv samantha_1_2_mistral_7b,HFv1 TruthfulQA,50.4,,hf_open_llm_v1_240829_frozen.csv samantha_1_2_mistral_7b,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv samantha_mistral_7b,HF OpenLLM v1,57.96,,hf_open_llm_v1_240829_frozen.csv samantha_mistral_7b,HFv1 ARC,63.4,,hf_open_llm_v1_240829_frozen.csv samantha_mistral_7b,HFv1 GSM8K,16.0,,hf_open_llm_v1_240829_frozen.csv samantha_mistral_7b,HFv1 HellaSwag,84.1,,hf_open_llm_v1_240829_frozen.csv samantha_mistral_7b,HFv1 MMLU,61.36,,hf_open_llm_v1_240829_frozen.csv samantha_mistral_7b,HFv1 TruthfulQA,46.08,,hf_open_llm_v1_240829_frozen.csv samantha_mistral_7b,HFv1 Winogrande,76.8,,hf_open_llm_v1_240829_frozen.csv samantha_mistral_instruct_7b,HF OpenLLM v1,53.4,,hf_open_llm_v1_240829_frozen.csv samantha_mistral_instruct_7b,HFv1 ARC,53.5,,hf_open_llm_v1_240829_frozen.csv samantha_mistral_instruct_7b,HFv1 
GSM8K,10.84,,hf_open_llm_v1_240829_frozen.csv samantha_mistral_instruct_7b,HFv1 HellaSwag,75.14,,hf_open_llm_v1_240829_frozen.csv samantha_mistral_instruct_7b,HFv1 MMLU,51.72,,hf_open_llm_v1_240829_frozen.csv samantha_mistral_instruct_7b,HFv1 TruthfulQA,58.81,,hf_open_llm_v1_240829_frozen.csv samantha_mistral_instruct_7b,HFv1 Winogrande,70.4,,hf_open_llm_v1_240829_frozen.csv samantha_nebula_7b,HF OpenLLM v1,54.58,,hf_open_llm_v1_240829_frozen.csv samantha_nebula_7b,HFv1 ARC,57.0,,hf_open_llm_v1_240829_frozen.csv samantha_nebula_7b,HFv1 GSM8K,11.37,,hf_open_llm_v1_240829_frozen.csv samantha_nebula_7b,HFv1 HellaSwag,82.25,,hf_open_llm_v1_240829_frozen.csv samantha_nebula_7b,HFv1 MMLU,54.21,,hf_open_llm_v1_240829_frozen.csv samantha_nebula_7b,HFv1 TruthfulQA,49.58,,hf_open_llm_v1_240829_frozen.csv samantha_nebula_7b,HFv1 Winogrande,73.09,,hf_open_llm_v1_240829_frozen.csv sambalingo_thai_chat,HF OpenLLM v1,49.45,,hf_open_llm_v1_240829_frozen.csv sambalingo_thai_chat,HFv1 ARC,52.73,,hf_open_llm_v1_240829_frozen.csv sambalingo_thai_chat,HFv1 GSM8K,8.57,,hf_open_llm_v1_240829_frozen.csv sambalingo_thai_chat,HFv1 HellaSwag,78.42,,hf_open_llm_v1_240829_frozen.csv sambalingo_thai_chat,HFv1 MMLU,43.95,,hf_open_llm_v1_240829_frozen.csv sambalingo_thai_chat,HFv1 TruthfulQA,40.84,,hf_open_llm_v1_240829_frozen.csv sambalingo_thai_chat,HFv1 Winogrande,72.22,,hf_open_llm_v1_240829_frozen.csv sappha_2b_v3,HF OpenLLM v1,43.53,,hf_open_llm_v1_240829_frozen.csv sappha_2b_v3,HFv1 ARC,46.16,,hf_open_llm_v1_240829_frozen.csv sappha_2b_v3,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv sappha_2b_v3,HFv1 HellaSwag,70.73,,hf_open_llm_v1_240829_frozen.csv sappha_2b_v3,HFv1 MMLU,38.63,,hf_open_llm_v1_240829_frozen.csv sappha_2b_v3,HFv1 TruthfulQA,39.94,,hf_open_llm_v1_240829_frozen.csv sappha_2b_v3,HFv1 Winogrande,65.51,,hf_open_llm_v1_240829_frozen.csv satoshinv5,HF OpenLLM v1,60.34,,hf_open_llm_v1_240829_frozen.csv satoshinv5,HFv1 ARC,60.49,,hf_open_llm_v1_240829_frozen.csv satoshinv5,HFv1 GSM8K,34.72,,hf_open_llm_v1_240829_frozen.csv satoshinv5,HFv1 HellaSwag,82.94,,hf_open_llm_v1_240829_frozen.csv satoshinv5,HFv1 MMLU,63.42,,hf_open_llm_v1_240829_frozen.csv satoshinv5,HFv1 TruthfulQA,41.8,,hf_open_llm_v1_240829_frozen.csv satoshinv5,HFv1 Winogrande,78.69,,hf_open_llm_v1_240829_frozen.csv satyr_7b_model_stock,HF OpenLLM v1,71.74,,hf_open_llm_v1_240829_frozen.csv satyr_7b_model_stock,HFv1 ARC,68.6,,hf_open_llm_v1_240829_frozen.csv satyr_7b_model_stock,HFv1 GSM8K,65.66,,hf_open_llm_v1_240829_frozen.csv satyr_7b_model_stock,HFv1 HellaSwag,86.96,,hf_open_llm_v1_240829_frozen.csv satyr_7b_model_stock,HFv1 MMLU,65.02,,hf_open_llm_v1_240829_frozen.csv satyr_7b_model_stock,HFv1 TruthfulQA,63.76,,hf_open_llm_v1_240829_frozen.csv satyr_7b_model_stock,HFv1 Winogrande,80.43,,hf_open_llm_v1_240829_frozen.csv sauerkrautlm_7b_laserchat,HF OpenLLM v1,70.32,,hf_open_llm_v1_240829_frozen.csv sauerkrautlm_7b_laserchat,HFv1 ARC,67.58,,hf_open_llm_v1_240829_frozen.csv sauerkrautlm_7b_laserchat,HFv1 GSM8K,68.84,,hf_open_llm_v1_240829_frozen.csv sauerkrautlm_7b_laserchat,HFv1 HellaSwag,83.58,,hf_open_llm_v1_240829_frozen.csv sauerkrautlm_7b_laserchat,HFv1 MMLU,64.93,,hf_open_llm_v1_240829_frozen.csv sauerkrautlm_7b_laserchat,HFv1 TruthfulQA,56.08,,hf_open_llm_v1_240829_frozen.csv sauerkrautlm_7b_laserchat,HFv1 Winogrande,80.9,,hf_open_llm_v1_240829_frozen.csv sauerkrautlm_una_solar_instruct,HF OpenLLM v1,74.26,,hf_open_llm_v1_240829_frozen.csv sauerkrautlm_una_solar_instruct,HFv1 ARC,70.9,,hf_open_llm_v1_240829_frozen.csv 
sauerkrautlm_una_solar_instruct,HFv1 GSM8K,64.67,,hf_open_llm_v1_240829_frozen.csv sauerkrautlm_una_solar_instruct,HFv1 HellaSwag,88.3,,hf_open_llm_v1_240829_frozen.csv sauerkrautlm_una_solar_instruct,HFv1 MMLU,66.15,,hf_open_llm_v1_240829_frozen.csv sauerkrautlm_una_solar_instruct,HFv1 TruthfulQA,71.8,,hf_open_llm_v1_240829_frozen.csv sauerkrautlm_una_solar_instruct,HFv1 Winogrande,83.74,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_1,HF OpenLLM v1,64.25,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_1,HFv1 ARC,58.62,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_1,HFv1 GSM8K,46.55,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_1,HFv1 HellaSwag,81.75,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_1,HFv1 MMLU,65.38,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_1,HFv1 TruthfulQA,54.89,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_1,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_3,HF OpenLLM v1,64.0,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_3,HFv1 ARC,58.28,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_3,HFv1 GSM8K,46.32,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_3,HFv1 HellaSwag,81.35,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_3,HFv1 MMLU,65.44,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_3,HFv1 TruthfulQA,54.3,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_3,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_4,HF OpenLLM v1,63.65,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_4,HFv1 ARC,58.02,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_4,HFv1 GSM8K,45.11,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_4,HFv1 HellaSwag,81.77,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_4,HFv1 MMLU,65.59,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_4,HFv1 TruthfulQA,53.74,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_4,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_5,HF OpenLLM v1,64.09,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_5,HFv1 ARC,58.11,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_5,HFv1 GSM8K,46.25,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_5,HFv1 HellaSwag,81.6,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_5,HFv1 MMLU,65.17,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_5,HFv1 TruthfulQA,54.7,,hf_open_llm_v1_240829_frozen.csv seagull_llama3_8b_orpo_v0_5,HFv1 Winogrande,78.69,,hf_open_llm_v1_240829_frozen.csv senzu_7b_v0_1,HF OpenLLM v1,56.4,,hf_open_llm_v1_240829_frozen.csv senzu_7b_v0_1,HFv1 ARC,58.19,,hf_open_llm_v1_240829_frozen.csv senzu_7b_v0_1,HFv1 GSM8K,18.2,,hf_open_llm_v1_240829_frozen.csv senzu_7b_v0_1,HFv1 HellaSwag,81.98,,hf_open_llm_v1_240829_frozen.csv senzu_7b_v0_1,HFv1 MMLU,63.2,,hf_open_llm_v1_240829_frozen.csv senzu_7b_v0_1,HFv1 TruthfulQA,40.2,,hf_open_llm_v1_240829_frozen.csv senzu_7b_v0_1,HFv1 Winogrande,76.64,,hf_open_llm_v1_240829_frozen.csv senzu_7b_v0_1_dpo,HF OpenLLM v1,61.9,,hf_open_llm_v1_240829_frozen.csv senzu_7b_v0_1_dpo,HFv1 ARC,66.72,,hf_open_llm_v1_240829_frozen.csv senzu_7b_v0_1_dpo,HFv1 GSM8K,32.98,,hf_open_llm_v1_240829_frozen.csv senzu_7b_v0_1_dpo,HFv1 HellaSwag,84.34,,hf_open_llm_v1_240829_frozen.csv senzu_7b_v0_1_dpo,HFv1 MMLU,62.12,,hf_open_llm_v1_240829_frozen.csv senzu_7b_v0_1_dpo,HFv1 
TruthfulQA,45.29,,hf_open_llm_v1_240829_frozen.csv senzu_7b_v0_1_dpo,HFv1 Winogrande,79.95,,hf_open_llm_v1_240829_frozen.csv servile_harpsichord_cdpo,HF OpenLLM v1,68.98,,hf_open_llm_v1_240829_frozen.csv servile_harpsichord_cdpo,HFv1 ARC,67.32,,hf_open_llm_v1_240829_frozen.csv servile_harpsichord_cdpo,HFv1 GSM8K,57.09,,hf_open_llm_v1_240829_frozen.csv servile_harpsichord_cdpo,HFv1 HellaSwag,85.18,,hf_open_llm_v1_240829_frozen.csv servile_harpsichord_cdpo,HFv1 MMLU,64.54,,hf_open_llm_v1_240829_frozen.csv servile_harpsichord_cdpo,HFv1 TruthfulQA,60.61,,hf_open_llm_v1_240829_frozen.csv servile_harpsichord_cdpo,HFv1 Winogrande,79.16,,hf_open_llm_v1_240829_frozen.csv sf_72b_v1,HF OpenLLM v1,28.75,,hf_open_llm_v1_240829_frozen.csv sf_72b_v1,HFv1 ARC,26.28,,hf_open_llm_v1_240829_frozen.csv sf_72b_v1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv sf_72b_v1,HFv1 HellaSwag,24.87,,hf_open_llm_v1_240829_frozen.csv sf_72b_v1,HFv1 MMLU,23.03,,hf_open_llm_v1_240829_frozen.csv sf_72b_v1,HFv1 TruthfulQA,48.78,,hf_open_llm_v1_240829_frozen.csv sf_72b_v1,HFv1 Winogrande,49.57,,hf_open_llm_v1_240829_frozen.csv sg_raccoon_yi_200k_2_0,HF OpenLLM v1,62.72,,hf_open_llm_v1_240829_frozen.csv sg_raccoon_yi_200k_2_0,HFv1 ARC,62.54,,hf_open_llm_v1_240829_frozen.csv sg_raccoon_yi_200k_2_0,HFv1 GSM8K,30.71,,hf_open_llm_v1_240829_frozen.csv sg_raccoon_yi_200k_2_0,HFv1 HellaSwag,80.26,,hf_open_llm_v1_240829_frozen.csv sg_raccoon_yi_200k_2_0,HFv1 MMLU,73.29,,hf_open_llm_v1_240829_frozen.csv sg_raccoon_yi_200k_2_0,HFv1 TruthfulQA,53.21,,hf_open_llm_v1_240829_frozen.csv sg_raccoon_yi_200k_2_0,HFv1 Winogrande,76.32,,hf_open_llm_v1_240829_frozen.csv shark_tank_ai_7_b,HF OpenLLM v1,71.1,,hf_open_llm_v1_240829_frozen.csv shark_tank_ai_7_b,HFv1 ARC,66.89,,hf_open_llm_v1_240829_frozen.csv shark_tank_ai_7_b,HFv1 GSM8K,65.73,,hf_open_llm_v1_240829_frozen.csv shark_tank_ai_7_b,HFv1 HellaSwag,86.61,,hf_open_llm_v1_240829_frozen.csv shark_tank_ai_7_b,HFv1 MMLU,65.27,,hf_open_llm_v1_240829_frozen.csv shark_tank_ai_7_b,HFv1 TruthfulQA,60.19,,hf_open_llm_v1_240829_frozen.csv shark_tank_ai_7_b,HFv1 Winogrande,81.93,,hf_open_llm_v1_240829_frozen.csv shark_tank_ai_7b_v2,HF OpenLLM v1,66.55,,hf_open_llm_v1_240829_frozen.csv shark_tank_ai_7b_v2,HFv1 ARC,67.75,,hf_open_llm_v1_240829_frozen.csv shark_tank_ai_7b_v2,HFv1 GSM8K,45.11,,hf_open_llm_v1_240829_frozen.csv shark_tank_ai_7b_v2,HFv1 HellaSwag,87.06,,hf_open_llm_v1_240829_frozen.csv shark_tank_ai_7b_v2,HFv1 MMLU,58.79,,hf_open_llm_v1_240829_frozen.csv shark_tank_ai_7b_v2,HFv1 TruthfulQA,62.15,,hf_open_llm_v1_240829_frozen.csv shark_tank_ai_7b_v2,HFv1 Winogrande,78.45,,hf_open_llm_v1_240829_frozen.csv sheared_pythia_160m,HF OpenLLM v1,29.41,,hf_open_llm_v1_240829_frozen.csv sheared_pythia_160m,HFv1 ARC,22.44,,hf_open_llm_v1_240829_frozen.csv sheared_pythia_160m,HFv1 GSM8K,0.38,,hf_open_llm_v1_240829_frozen.csv sheared_pythia_160m,HFv1 HellaSwag,32.07,,hf_open_llm_v1_240829_frozen.csv sheared_pythia_160m,HFv1 MMLU,26.65,,hf_open_llm_v1_240829_frozen.csv sheared_pythia_160m,HFv1 TruthfulQA,43.22,,hf_open_llm_v1_240829_frozen.csv sheared_pythia_160m,HFv1 Winogrande,51.7,,hf_open_llm_v1_240829_frozen.csv shearedllama_1_3b_fft_test1,HF OpenLLM v1,35.71,,hf_open_llm_v1_240829_frozen.csv shearedllama_1_3b_fft_test1,HFv1 ARC,32.68,,hf_open_llm_v1_240829_frozen.csv shearedllama_1_3b_fft_test1,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv shearedllama_1_3b_fft_test1,HFv1 HellaSwag,59.99,,hf_open_llm_v1_240829_frozen.csv shearedllama_1_3b_fft_test1,HFv1 
MMLU,25.69,,hf_open_llm_v1_240829_frozen.csv shearedllama_1_3b_fft_test1,HFv1 TruthfulQA,36.97,,hf_open_llm_v1_240829_frozen.csv shearedllama_1_3b_fft_test1,HFv1 Winogrande,58.72,,hf_open_llm_v1_240829_frozen.csv shearedplats_1_3b_v1,HF OpenLLM v1,35.97,,hf_open_llm_v1_240829_frozen.csv shearedplats_1_3b_v1,HFv1 ARC,35.41,,hf_open_llm_v1_240829_frozen.csv shearedplats_1_3b_v1,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv shearedplats_1_3b_v1,HFv1 HellaSwag,62.75,,hf_open_llm_v1_240829_frozen.csv shearedplats_1_3b_v1,HFv1 MMLU,24.75,,hf_open_llm_v1_240829_frozen.csv shearedplats_1_3b_v1,HFv1 TruthfulQA,33.93,,hf_open_llm_v1_240829_frozen.csv shearedplats_1_3b_v1,HFv1 Winogrande,58.48,,hf_open_llm_v1_240829_frozen.csv shearedplats_2_7b_v2,HF OpenLLM v1,41.61,,hf_open_llm_v1_240829_frozen.csv shearedplats_2_7b_v2,HFv1 ARC,42.41,,hf_open_llm_v1_240829_frozen.csv shearedplats_2_7b_v2,HFv1 GSM8K,1.52,,hf_open_llm_v1_240829_frozen.csv shearedplats_2_7b_v2,HFv1 HellaSwag,72.58,,hf_open_llm_v1_240829_frozen.csv shearedplats_2_7b_v2,HFv1 MMLU,27.52,,hf_open_llm_v1_240829_frozen.csv shearedplats_2_7b_v2,HFv1 TruthfulQA,39.76,,hf_open_llm_v1_240829_frozen.csv shearedplats_2_7b_v2,HFv1 Winogrande,65.9,,hf_open_llm_v1_240829_frozen.csv shearedplats_2_7b_v2_instruct_v0_1,HF OpenLLM v1,41.13,,hf_open_llm_v1_240829_frozen.csv shearedplats_2_7b_v2_instruct_v0_1,HFv1 ARC,40.19,,hf_open_llm_v1_240829_frozen.csv shearedplats_2_7b_v2_instruct_v0_1,HFv1 GSM8K,2.12,,hf_open_llm_v1_240829_frozen.csv shearedplats_2_7b_v2_instruct_v0_1,HFv1 HellaSwag,70.08,,hf_open_llm_v1_240829_frozen.csv shearedplats_2_7b_v2_instruct_v0_1,HFv1 MMLU,28.12,,hf_open_llm_v1_240829_frozen.csv shearedplats_2_7b_v2_instruct_v0_1,HFv1 TruthfulQA,41.23,,hf_open_llm_v1_240829_frozen.csv shearedplats_2_7b_v2_instruct_v0_1,HFv1 Winogrande,65.04,,hf_open_llm_v1_240829_frozen.csv sheep_duck_llama_2_70b_v1_1,HF OpenLLM v1,71.22,,hf_open_llm_v1_240829_frozen.csv sheep_duck_llama_2_70b_v1_1,HFv1 ARC,73.12,,hf_open_llm_v1_240829_frozen.csv sheep_duck_llama_2_70b_v1_1,HFv1 GSM8K,47.99,,hf_open_llm_v1_240829_frozen.csv sheep_duck_llama_2_70b_v1_1,HFv1 HellaSwag,87.77,,hf_open_llm_v1_240829_frozen.csv sheep_duck_llama_2_70b_v1_1,HFv1 MMLU,70.77,,hf_open_llm_v1_240829_frozen.csv sheep_duck_llama_2_70b_v1_1,HFv1 TruthfulQA,64.55,,hf_open_llm_v1_240829_frozen.csv sheep_duck_llama_2_70b_v1_1,HFv1 Winogrande,83.11,,hf_open_llm_v1_240829_frozen.csv shisa_base_7b_v1,HF OpenLLM v1,51.64,,hf_open_llm_v1_240829_frozen.csv shisa_base_7b_v1,HFv1 ARC,52.3,,hf_open_llm_v1_240829_frozen.csv shisa_base_7b_v1,HFv1 GSM8K,35.86,,hf_open_llm_v1_240829_frozen.csv shisa_base_7b_v1,HFv1 HellaSwag,77.63,,hf_open_llm_v1_240829_frozen.csv shisa_base_7b_v1,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv shisa_base_7b_v1,HFv1 TruthfulQA,42.4,,hf_open_llm_v1_240829_frozen.csv shisa_base_7b_v1,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv shqiponja_59b_v1,HF OpenLLM v1,65.97,,hf_open_llm_v1_240829_frozen.csv shqiponja_59b_v1,HFv1 ARC,70.05,,hf_open_llm_v1_240829_frozen.csv shqiponja_59b_v1,HFv1 GSM8K,15.47,,hf_open_llm_v1_240829_frozen.csv shqiponja_59b_v1,HFv1 HellaSwag,84.06,,hf_open_llm_v1_240829_frozen.csv shqiponja_59b_v1,HFv1 MMLU,75.54,,hf_open_llm_v1_240829_frozen.csv shqiponja_59b_v1,HFv1 TruthfulQA,70.43,,hf_open_llm_v1_240829_frozen.csv shqiponja_59b_v1,HFv1 Winogrande,80.27,,hf_open_llm_v1_240829_frozen.csv silicon_medley,HF OpenLLM v1,69.49,,hf_open_llm_v1_240829_frozen.csv silicon_medley,HFv1 ARC,67.24,,hf_open_llm_v1_240829_frozen.csv 
silicon_medley,HFv1 GSM8K,58.38,,hf_open_llm_v1_240829_frozen.csv silicon_medley,HFv1 HellaSwag,86.21,,hf_open_llm_v1_240829_frozen.csv silicon_medley,HFv1 MMLU,64.51,,hf_open_llm_v1_240829_frozen.csv silicon_medley,HFv1 TruthfulQA,61.34,,hf_open_llm_v1_240829_frozen.csv silicon_medley,HFv1 Winogrande,79.24,,hf_open_llm_v1_240829_frozen.csv silvermaiden_7b_slerp,HF OpenLLM v1,74.74,,hf_open_llm_v1_240829_frozen.csv silvermaiden_7b_slerp,HFv1 ARC,71.93,,hf_open_llm_v1_240829_frozen.csv silvermaiden_7b_slerp,HFv1 GSM8K,70.36,,hf_open_llm_v1_240829_frozen.csv silvermaiden_7b_slerp,HFv1 HellaSwag,88.12,,hf_open_llm_v1_240829_frozen.csv silvermaiden_7b_slerp,HFv1 MMLU,65.14,,hf_open_llm_v1_240829_frozen.csv silvermaiden_7b_slerp,HFv1 TruthfulQA,69.91,,hf_open_llm_v1_240829_frozen.csv silvermaiden_7b_slerp,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv siren_7b_slerp,HF OpenLLM v1,72.5,,hf_open_llm_v1_240829_frozen.csv siren_7b_slerp,HFv1 ARC,69.97,,hf_open_llm_v1_240829_frozen.csv siren_7b_slerp,HFv1 GSM8K,66.03,,hf_open_llm_v1_240829_frozen.csv siren_7b_slerp,HFv1 HellaSwag,87.14,,hf_open_llm_v1_240829_frozen.csv siren_7b_slerp,HFv1 MMLU,65.03,,hf_open_llm_v1_240829_frozen.csv siren_7b_slerp,HFv1 TruthfulQA,64.57,,hf_open_llm_v1_240829_frozen.csv siren_7b_slerp,HFv1 Winogrande,82.24,,hf_open_llm_v1_240829_frozen.csv sixtyoneeighty_7b_dpo,HF OpenLLM v1,71.74,,hf_open_llm_v1_240829_frozen.csv sixtyoneeighty_7b_dpo,HFv1 ARC,68.69,,hf_open_llm_v1_240829_frozen.csv sixtyoneeighty_7b_dpo,HFv1 GSM8K,62.24,,hf_open_llm_v1_240829_frozen.csv sixtyoneeighty_7b_dpo,HFv1 HellaSwag,86.41,,hf_open_llm_v1_240829_frozen.csv sixtyoneeighty_7b_dpo,HFv1 MMLU,64.93,,hf_open_llm_v1_240829_frozen.csv sixtyoneeighty_7b_dpo,HFv1 TruthfulQA,67.64,,hf_open_llm_v1_240829_frozen.csv sixtyoneeighty_7b_dpo,HFv1 Winogrande,80.51,,hf_open_llm_v1_240829_frozen.csv sj_solar_10_7b_dpo,HF OpenLLM v1,72.67,,hf_open_llm_v1_240829_frozen.csv sj_solar_10_7b_dpo,HFv1 ARC,68.26,,hf_open_llm_v1_240829_frozen.csv sj_solar_10_7b_dpo,HFv1 GSM8K,62.09,,hf_open_llm_v1_240829_frozen.csv sj_solar_10_7b_dpo,HFv1 HellaSwag,86.95,,hf_open_llm_v1_240829_frozen.csv sj_solar_10_7b_dpo,HFv1 MMLU,66.73,,hf_open_llm_v1_240829_frozen.csv sj_solar_10_7b_dpo,HFv1 TruthfulQA,67.74,,hf_open_llm_v1_240829_frozen.csv sj_solar_10_7b_dpo,HFv1 Winogrande,84.21,,hf_open_llm_v1_240829_frozen.csv skkudatascienceglobal_10_7b,HF OpenLLM v1,74.5,,hf_open_llm_v1_240829_frozen.csv skkudatascienceglobal_10_7b,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv skkudatascienceglobal_10_7b,HFv1 GSM8K,65.73,,hf_open_llm_v1_240829_frozen.csv skkudatascienceglobal_10_7b,HFv1 HellaSwag,88.41,,hf_open_llm_v1_240829_frozen.csv skkudatascienceglobal_10_7b,HFv1 MMLU,66.31,,hf_open_llm_v1_240829_frozen.csv skkudatascienceglobal_10_7b,HFv1 TruthfulQA,71.92,,hf_open_llm_v1_240829_frozen.csv skkudatascienceglobal_10_7b,HFv1 Winogrande,83.35,,hf_open_llm_v1_240829_frozen.csv skkuds_dpo_72b_v1,HF OpenLLM v1,72.89,,hf_open_llm_v1_240829_frozen.csv skkuds_dpo_72b_v1,HFv1 ARC,65.96,,hf_open_llm_v1_240829_frozen.csv skkuds_dpo_72b_v1,HFv1 GSM8K,65.88,,hf_open_llm_v1_240829_frozen.csv skkuds_dpo_72b_v1,HFv1 HellaSwag,86.0,,hf_open_llm_v1_240829_frozen.csv skkuds_dpo_72b_v1,HFv1 MMLU,77.33,,hf_open_llm_v1_240829_frozen.csv skkuds_dpo_72b_v1,HFv1 TruthfulQA,59.54,,hf_open_llm_v1_240829_frozen.csv skkuds_dpo_72b_v1,HFv1 Winogrande,82.64,,hf_open_llm_v1_240829_frozen.csv skkuds_dpo_72b_v3,HF OpenLLM v1,72.8,,hf_open_llm_v1_240829_frozen.csv skkuds_dpo_72b_v3,HFv1 
ARC,66.04,,hf_open_llm_v1_240829_frozen.csv skkuds_dpo_72b_v3,HFv1 GSM8K,64.97,,hf_open_llm_v1_240829_frozen.csv skkuds_dpo_72b_v3,HFv1 HellaSwag,86.11,,hf_open_llm_v1_240829_frozen.csv skkuds_dpo_72b_v3,HFv1 MMLU,77.34,,hf_open_llm_v1_240829_frozen.csv skkuds_dpo_72b_v3,HFv1 TruthfulQA,59.73,,hf_open_llm_v1_240829_frozen.csv skkuds_dpo_72b_v3,HFv1 Winogrande,82.64,,hf_open_llm_v1_240829_frozen.csv slal_0_1,HF OpenLLM v1,67.83,,hf_open_llm_v1_240829_frozen.csv slal_0_1,HFv1 ARC,57.94,,hf_open_llm_v1_240829_frozen.csv slal_0_1,HFv1 GSM8K,63.15,,hf_open_llm_v1_240829_frozen.csv slal_0_1,HFv1 HellaSwag,80.14,,hf_open_llm_v1_240829_frozen.csv slal_0_1,HFv1 MMLU,65.99,,hf_open_llm_v1_240829_frozen.csv slal_0_1,HFv1 TruthfulQA,54.22,,hf_open_llm_v1_240829_frozen.csv slal_0_1,HFv1 Winogrande,85.56,,hf_open_llm_v1_240829_frozen.csv slerp_test_turdus_beagle,HF OpenLLM v1,75.11,,hf_open_llm_v1_240829_frozen.csv slerp_test_turdus_beagle,HFv1 ARC,73.55,,hf_open_llm_v1_240829_frozen.csv slerp_test_turdus_beagle,HFv1 GSM8K,70.05,,hf_open_llm_v1_240829_frozen.csv slerp_test_turdus_beagle,HFv1 HellaSwag,88.85,,hf_open_llm_v1_240829_frozen.csv slerp_test_turdus_beagle,HFv1 MMLU,64.62,,hf_open_llm_v1_240829_frozen.csv slerp_test_turdus_beagle,HFv1 TruthfulQA,69.69,,hf_open_llm_v1_240829_frozen.csv slerp_test_turdus_beagle,HFv1 Winogrande,83.9,,hf_open_llm_v1_240829_frozen.csv slimhercules_4_0_mistral_7b_v0_2,HF OpenLLM v1,62.75,,hf_open_llm_v1_240829_frozen.csv slimhercules_4_0_mistral_7b_v0_2,HFv1 ARC,60.07,,hf_open_llm_v1_240829_frozen.csv slimhercules_4_0_mistral_7b_v0_2,HFv1 GSM8K,45.34,,hf_open_llm_v1_240829_frozen.csv slimhercules_4_0_mistral_7b_v0_2,HFv1 HellaSwag,83.54,,hf_open_llm_v1_240829_frozen.csv slimhercules_4_0_mistral_7b_v0_2,HFv1 MMLU,62.67,,hf_open_llm_v1_240829_frozen.csv slimhercules_4_0_mistral_7b_v0_2,HFv1 TruthfulQA,45.33,,hf_open_llm_v1_240829_frozen.csv slimhercules_4_0_mistral_7b_v0_2,HFv1 Winogrande,79.56,,hf_open_llm_v1_240829_frozen.csv smartllama3_8b_ms_v0_1,HF OpenLLM v1,69.49,,hf_open_llm_v1_240829_frozen.csv smartllama3_8b_ms_v0_1,HFv1 ARC,62.63,,hf_open_llm_v1_240829_frozen.csv smartllama3_8b_ms_v0_1,HFv1 GSM8K,71.19,,hf_open_llm_v1_240829_frozen.csv smartllama3_8b_ms_v0_1,HFv1 HellaSwag,82.34,,hf_open_llm_v1_240829_frozen.csv smartllama3_8b_ms_v0_1,HFv1 MMLU,67.7,,hf_open_llm_v1_240829_frozen.csv smartllama3_8b_ms_v0_1,HFv1 TruthfulQA,55.56,,hf_open_llm_v1_240829_frozen.csv smartllama3_8b_ms_v0_1,HFv1 Winogrande,77.51,,hf_open_llm_v1_240829_frozen.csv smartqwen1_5_1_8b_orpo_v1,HF OpenLLM v1,41.8,,hf_open_llm_v1_240829_frozen.csv smartqwen1_5_1_8b_orpo_v1,HFv1 ARC,36.09,,hf_open_llm_v1_240829_frozen.csv smartqwen1_5_1_8b_orpo_v1,HFv1 GSM8K,12.13,,hf_open_llm_v1_240829_frozen.csv smartqwen1_5_1_8b_orpo_v1,HFv1 HellaSwag,62.3,,hf_open_llm_v1_240829_frozen.csv smartqwen1_5_1_8b_orpo_v1,HFv1 MMLU,43.06,,hf_open_llm_v1_240829_frozen.csv smartqwen1_5_1_8b_orpo_v1,HFv1 TruthfulQA,39.82,,hf_open_llm_v1_240829_frozen.csv smartqwen1_5_1_8b_orpo_v1,HFv1 Winogrande,57.38,,hf_open_llm_v1_240829_frozen.csv smartyplats_3b_v1,HF OpenLLM v1,40.0,,hf_open_llm_v1_240829_frozen.csv smartyplats_3b_v1,HFv1 ARC,40.53,,hf_open_llm_v1_240829_frozen.csv smartyplats_3b_v1,HFv1 GSM8K,1.06,,hf_open_llm_v1_240829_frozen.csv smartyplats_3b_v1,HFv1 HellaSwag,70.85,,hf_open_llm_v1_240829_frozen.csv smartyplats_3b_v1,HFv1 MMLU,25.31,,hf_open_llm_v1_240829_frozen.csv smartyplats_3b_v1,HFv1 TruthfulQA,36.53,,hf_open_llm_v1_240829_frozen.csv smartyplats_3b_v1,HFv1 
Winogrande,65.75,,hf_open_llm_v1_240829_frozen.csv smartyplats_3b_v2,HF OpenLLM v1,40.29,,hf_open_llm_v1_240829_frozen.csv smartyplats_3b_v2,HFv1 ARC,41.04,,hf_open_llm_v1_240829_frozen.csv smartyplats_3b_v2,HFv1 GSM8K,1.59,,hf_open_llm_v1_240829_frozen.csv smartyplats_3b_v2,HFv1 HellaSwag,71.19,,hf_open_llm_v1_240829_frozen.csv smartyplats_3b_v2,HFv1 MMLU,24.32,,hf_open_llm_v1_240829_frozen.csv smartyplats_3b_v2,HFv1 TruthfulQA,36.66,,hf_open_llm_v1_240829_frozen.csv smartyplats_3b_v2,HFv1 Winogrande,66.93,,hf_open_llm_v1_240829_frozen.csv smartyplats_7b_v2,HF OpenLLM v1,60.24,,hf_open_llm_v1_240829_frozen.csv smartyplats_7b_v2,HFv1 ARC,57.94,,hf_open_llm_v1_240829_frozen.csv smartyplats_7b_v2,HFv1 GSM8K,38.82,,hf_open_llm_v1_240829_frozen.csv smartyplats_7b_v2,HFv1 HellaSwag,80.76,,hf_open_llm_v1_240829_frozen.csv smartyplats_7b_v2,HFv1 MMLU,58.16,,hf_open_llm_v1_240829_frozen.csv smartyplats_7b_v2,HFv1 TruthfulQA,50.26,,hf_open_llm_v1_240829_frozen.csv smartyplats_7b_v2,HFv1 Winogrande,75.53,,hf_open_llm_v1_240829_frozen.csv smol_llama_101m_chat_v1,HF OpenLLM v1,28.73,,hf_open_llm_v1_240829_frozen.csv smol_llama_101m_chat_v1,HFv1 ARC,22.87,,hf_open_llm_v1_240829_frozen.csv smol_llama_101m_chat_v1,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv smol_llama_101m_chat_v1,HFv1 HellaSwag,28.71,,hf_open_llm_v1_240829_frozen.csv smol_llama_101m_chat_v1,HFv1 MMLU,24.93,,hf_open_llm_v1_240829_frozen.csv smol_llama_101m_chat_v1,HFv1 TruthfulQA,45.76,,hf_open_llm_v1_240829_frozen.csv smol_llama_101m_chat_v1,HFv1 Winogrande,50.04,,hf_open_llm_v1_240829_frozen.csv smol_llama_101m_gqa,HF OpenLLM v1,28.97,,hf_open_llm_v1_240829_frozen.csv smol_llama_101m_gqa,HFv1 ARC,23.55,,hf_open_llm_v1_240829_frozen.csv smol_llama_101m_gqa,HFv1 GSM8K,0.83,,hf_open_llm_v1_240829_frozen.csv smol_llama_101m_gqa,HFv1 HellaSwag,28.77,,hf_open_llm_v1_240829_frozen.csv smol_llama_101m_gqa,HFv1 MMLU,24.24,,hf_open_llm_v1_240829_frozen.csv smol_llama_101m_gqa,HFv1 TruthfulQA,45.76,,hf_open_llm_v1_240829_frozen.csv smol_llama_101m_gqa,HFv1 Winogrande,50.67,,hf_open_llm_v1_240829_frozen.csv smol_llama_220m_gqa,HF OpenLLM v1,29.44,,hf_open_llm_v1_240829_frozen.csv smol_llama_220m_gqa,HFv1 ARC,24.83,,hf_open_llm_v1_240829_frozen.csv smol_llama_220m_gqa,HFv1 GSM8K,0.68,,hf_open_llm_v1_240829_frozen.csv smol_llama_220m_gqa,HFv1 HellaSwag,29.76,,hf_open_llm_v1_240829_frozen.csv smol_llama_220m_gqa,HFv1 MMLU,25.85,,hf_open_llm_v1_240829_frozen.csv smol_llama_220m_gqa,HFv1 TruthfulQA,44.55,,hf_open_llm_v1_240829_frozen.csv smol_llama_220m_gqa,HFv1 Winogrande,50.99,,hf_open_llm_v1_240829_frozen.csv smol_llama_220m_open_instruct,HF OpenLLM v1,29.19,,hf_open_llm_v1_240829_frozen.csv smol_llama_220m_open_instruct,HFv1 ARC,25.0,,hf_open_llm_v1_240829_frozen.csv smol_llama_220m_open_instruct,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv smol_llama_220m_open_instruct,HFv1 HellaSwag,29.71,,hf_open_llm_v1_240829_frozen.csv smol_llama_220m_open_instruct,HFv1 MMLU,26.11,,hf_open_llm_v1_240829_frozen.csv smol_llama_220m_open_instruct,HFv1 TruthfulQA,44.06,,hf_open_llm_v1_240829_frozen.csv smol_llama_220m_open_instruct,HFv1 Winogrande,50.28,,hf_open_llm_v1_240829_frozen.csv smol_llama_220m_openhermes,HF OpenLLM v1,29.34,,hf_open_llm_v1_240829_frozen.csv smol_llama_220m_openhermes,HFv1 ARC,25.17,,hf_open_llm_v1_240829_frozen.csv smol_llama_220m_openhermes,HFv1 GSM8K,0.61,,hf_open_llm_v1_240829_frozen.csv smol_llama_220m_openhermes,HFv1 HellaSwag,28.98,,hf_open_llm_v1_240829_frozen.csv smol_llama_220m_openhermes,HFv1 
MMLU,26.17,,hf_open_llm_v1_240829_frozen.csv smol_llama_220m_openhermes,HFv1 TruthfulQA,43.08,,hf_open_llm_v1_240829_frozen.csv smol_llama_220m_openhermes,HFv1 Winogrande,52.01,,hf_open_llm_v1_240829_frozen.csv smol_llama_4x220m_moe,HF OpenLLM v1,29.25,,hf_open_llm_v1_240829_frozen.csv smol_llama_4x220m_moe,HFv1 ARC,25.09,,hf_open_llm_v1_240829_frozen.csv smol_llama_4x220m_moe,HFv1 GSM8K,0.15,,hf_open_llm_v1_240829_frozen.csv smol_llama_4x220m_moe,HFv1 HellaSwag,29.24,,hf_open_llm_v1_240829_frozen.csv smol_llama_4x220m_moe,HFv1 MMLU,25.88,,hf_open_llm_v1_240829_frozen.csv smol_llama_4x220m_moe,HFv1 TruthfulQA,43.92,,hf_open_llm_v1_240829_frozen.csv smol_llama_4x220m_moe,HFv1 Winogrande,51.22,,hf_open_llm_v1_240829_frozen.csv smol_llama_81m_tied,HF OpenLLM v1,28.17,,hf_open_llm_v1_240829_frozen.csv smol_llama_81m_tied,HFv1 ARC,22.18,,hf_open_llm_v1_240829_frozen.csv smol_llama_81m_tied,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv smol_llama_81m_tied,HFv1 HellaSwag,29.33,,hf_open_llm_v1_240829_frozen.csv smol_llama_81m_tied,HFv1 MMLU,24.06,,hf_open_llm_v1_240829_frozen.csv smol_llama_81m_tied,HFv1 TruthfulQA,43.97,,hf_open_llm_v1_240829_frozen.csv smol_llama_81m_tied,HFv1 Winogrande,49.25,,hf_open_llm_v1_240829_frozen.csv smolllamix_8x101m,HF OpenLLM v1,28.98,,hf_open_llm_v1_240829_frozen.csv smolllamix_8x101m,HFv1 ARC,22.7,,hf_open_llm_v1_240829_frozen.csv smolllamix_8x101m,HFv1 GSM8K,0.61,,hf_open_llm_v1_240829_frozen.csv smolllamix_8x101m,HFv1 HellaSwag,28.5,,hf_open_llm_v1_240829_frozen.csv smolllamix_8x101m,HFv1 MMLU,24.69,,hf_open_llm_v1_240829_frozen.csv smolllamix_8x101m,HFv1 TruthfulQA,46.09,,hf_open_llm_v1_240829_frozen.csv smolllamix_8x101m,HFv1 Winogrande,51.3,,hf_open_llm_v1_240829_frozen.csv smolllamix_8x101m_take2,HF OpenLLM v1,29.35,,hf_open_llm_v1_240829_frozen.csv smolllamix_8x101m_take2,HFv1 ARC,23.98,,hf_open_llm_v1_240829_frozen.csv smolllamix_8x101m_take2,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv smolllamix_8x101m_take2,HFv1 HellaSwag,28.43,,hf_open_llm_v1_240829_frozen.csv smolllamix_8x101m_take2,HFv1 MMLU,25.07,,hf_open_llm_v1_240829_frozen.csv smolllamix_8x101m_take2,HFv1 TruthfulQA,45.87,,hf_open_llm_v1_240829_frozen.csv smolllamix_8x101m_take2,HFv1 Winogrande,52.25,,hf_open_llm_v1_240829_frozen.csv snorkel_mistral_pairrm_dpo,HF OpenLLM v1,66.31,,hf_open_llm_v1_240829_frozen.csv snorkel_mistral_pairrm_dpo,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv snorkel_mistral_pairrm_dpo,HFv1 GSM8K,36.77,,hf_open_llm_v1_240829_frozen.csv snorkel_mistral_pairrm_dpo,HFv1 HellaSwag,85.64,,hf_open_llm_v1_240829_frozen.csv snorkel_mistral_pairrm_dpo,HFv1 MMLU,60.85,,hf_open_llm_v1_240829_frozen.csv snorkel_mistral_pairrm_dpo,HFv1 TruthfulQA,70.91,,hf_open_llm_v1_240829_frozen.csv snorkel_mistral_pairrm_dpo,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv solar_0_70b_16bit,HF OpenLLM v1,70.11,,hf_open_llm_v1_240829_frozen.csv solar_0_70b_16bit,HFv1 ARC,71.08,,hf_open_llm_v1_240829_frozen.csv solar_0_70b_16bit,HFv1 GSM8K,45.26,,hf_open_llm_v1_240829_frozen.csv solar_0_70b_16bit,HFv1 HellaSwag,87.89,,hf_open_llm_v1_240829_frozen.csv solar_0_70b_16bit,HFv1 MMLU,70.58,,hf_open_llm_v1_240829_frozen.csv solar_0_70b_16bit,HFv1 TruthfulQA,62.25,,hf_open_llm_v1_240829_frozen.csv solar_0_70b_16bit,HFv1 Winogrande,83.58,,hf_open_llm_v1_240829_frozen.csv solar_10_7b_dpo_instruct_tuned_v0_1,HF OpenLLM v1,68.68,,hf_open_llm_v1_240829_frozen.csv solar_10_7b_dpo_instruct_tuned_v0_1,HFv1 ARC,65.19,,hf_open_llm_v1_240829_frozen.csv solar_10_7b_dpo_instruct_tuned_v0_1,HFv1 
GSM8K,58.76,,hf_open_llm_v1_240829_frozen.csv solar_10_7b_dpo_instruct_tuned_v0_1,HFv1 HellaSwag,86.09,,hf_open_llm_v1_240829_frozen.csv solar_10_7b_dpo_instruct_tuned_v0_1,HFv1 MMLU,66.25,,hf_open_llm_v1_240829_frozen.csv solar_10_7b_dpo_instruct_tuned_v0_1,HFv1 TruthfulQA,51.81,,hf_open_llm_v1_240829_frozen.csv solar_10_7b_dpo_instruct_tuned_v0_1,HFv1 Winogrande,83.98,,hf_open_llm_v1_240829_frozen.csv solar_10_7b_instruct_forest_dpo_v1,HF OpenLLM v1,74.8,,hf_open_llm_v1_240829_frozen.csv solar_10_7b_instruct_forest_dpo_v1,HFv1 ARC,71.93,,hf_open_llm_v1_240829_frozen.csv solar_10_7b_instruct_forest_dpo_v1,HFv1 GSM8K,64.52,,hf_open_llm_v1_240829_frozen.csv solar_10_7b_instruct_forest_dpo_v1,HFv1 HellaSwag,88.44,,hf_open_llm_v1_240829_frozen.csv solar_10_7b_instruct_forest_dpo_v1,HFv1 MMLU,65.63,,hf_open_llm_v1_240829_frozen.csv solar_10_7b_instruct_forest_dpo_v1,HFv1 TruthfulQA,76.13,,hf_open_llm_v1_240829_frozen.csv solar_10_7b_instruct_forest_dpo_v1,HFv1 Winogrande,82.16,,hf_open_llm_v1_240829_frozen.csv solar_10b_nector_dpo_jawade,HF OpenLLM v1,74.19,,hf_open_llm_v1_240829_frozen.csv solar_10b_nector_dpo_jawade,HFv1 ARC,71.33,,hf_open_llm_v1_240829_frozen.csv solar_10b_nector_dpo_jawade,HFv1 GSM8K,64.59,,hf_open_llm_v1_240829_frozen.csv solar_10b_nector_dpo_jawade,HFv1 HellaSwag,88.62,,hf_open_llm_v1_240829_frozen.csv solar_10b_nector_dpo_jawade,HFv1 MMLU,66.22,,hf_open_llm_v1_240829_frozen.csv solar_10b_nector_dpo_jawade,HFv1 TruthfulQA,70.92,,hf_open_llm_v1_240829_frozen.csv solar_10b_nector_dpo_jawade,HFv1 Winogrande,83.43,,hf_open_llm_v1_240829_frozen.csv solar_10b_orcadpo_jawade,HF OpenLLM v1,74.27,,hf_open_llm_v1_240829_frozen.csv solar_10b_orcadpo_jawade,HFv1 ARC,71.16,,hf_open_llm_v1_240829_frozen.csv solar_10b_orcadpo_jawade,HFv1 GSM8K,64.82,,hf_open_llm_v1_240829_frozen.csv solar_10b_orcadpo_jawade,HFv1 HellaSwag,88.27,,hf_open_llm_v1_240829_frozen.csv solar_10b_orcadpo_jawade,HFv1 MMLU,66.12,,hf_open_llm_v1_240829_frozen.csv solar_10b_orcadpo_jawade,HFv1 TruthfulQA,71.57,,hf_open_llm_v1_240829_frozen.csv solar_10b_orcadpo_jawade,HFv1 Winogrande,83.66,,hf_open_llm_v1_240829_frozen.csv solar_13b_instruct_v1_0,HF OpenLLM v1,56.65,,hf_open_llm_v1_240829_frozen.csv solar_13b_instruct_v1_0,HFv1 ARC,57.25,,hf_open_llm_v1_240829_frozen.csv solar_13b_instruct_v1_0,HFv1 GSM8K,16.6,,hf_open_llm_v1_240829_frozen.csv solar_13b_instruct_v1_0,HFv1 HellaSwag,78.03,,hf_open_llm_v1_240829_frozen.csv solar_13b_instruct_v1_0,HFv1 MMLU,55.75,,hf_open_llm_v1_240829_frozen.csv solar_13b_instruct_v1_0,HFv1 TruthfulQA,61.99,,hf_open_llm_v1_240829_frozen.csv solar_13b_instruct_v1_0,HFv1 Winogrande,70.24,,hf_open_llm_v1_240829_frozen.csv solar_dus_implement,HF OpenLLM v1,58.1,,hf_open_llm_v1_240829_frozen.csv solar_dus_implement,HFv1 ARC,59.56,,hf_open_llm_v1_240829_frozen.csv solar_dus_implement,HFv1 GSM8K,26.99,,hf_open_llm_v1_240829_frozen.csv solar_dus_implement,HFv1 HellaSwag,81.18,,hf_open_llm_v1_240829_frozen.csv solar_dus_implement,HFv1 MMLU,63.68,,hf_open_llm_v1_240829_frozen.csv solar_dus_implement,HFv1 TruthfulQA,40.72,,hf_open_llm_v1_240829_frozen.csv solar_dus_implement,HFv1 Winogrande,76.48,,hf_open_llm_v1_240829_frozen.csv solar_instruct_ko_adapter_attach,HF OpenLLM v1,74.11,,hf_open_llm_v1_240829_frozen.csv solar_instruct_ko_adapter_attach,HFv1 ARC,71.08,,hf_open_llm_v1_240829_frozen.csv solar_instruct_ko_adapter_attach,HFv1 GSM8K,64.29,,hf_open_llm_v1_240829_frozen.csv solar_instruct_ko_adapter_attach,HFv1 HellaSwag,88.2,,hf_open_llm_v1_240829_frozen.csv 
solar_instruct_ko_adapter_attach,HFv1 MMLU,66.09,,hf_open_llm_v1_240829_frozen.csv solar_instruct_ko_adapter_attach,HFv1 TruthfulQA,71.51,,hf_open_llm_v1_240829_frozen.csv solar_instruct_ko_adapter_attach,HFv1 Winogrande,83.5,,hf_open_llm_v1_240829_frozen.csv solar_ko_1_3_deup,HF OpenLLM v1,56.47,,hf_open_llm_v1_240829_frozen.csv solar_ko_1_3_deup,HFv1 ARC,55.97,,hf_open_llm_v1_240829_frozen.csv solar_ko_1_3_deup,HFv1 GSM8K,22.59,,hf_open_llm_v1_240829_frozen.csv solar_ko_1_3_deup,HFv1 HellaSwag,79.97,,hf_open_llm_v1_240829_frozen.csv solar_ko_1_3_deup,HFv1 MMLU,55.88,,hf_open_llm_v1_240829_frozen.csv solar_ko_1_3_deup,HFv1 TruthfulQA,47.55,,hf_open_llm_v1_240829_frozen.csv solar_ko_1_3_deup,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv solar_math_2x10_7b,HF OpenLLM v1,73.37,,hf_open_llm_v1_240829_frozen.csv solar_math_2x10_7b,HFv1 ARC,68.43,,hf_open_llm_v1_240829_frozen.csv solar_math_2x10_7b,HFv1 GSM8K,71.04,,hf_open_llm_v1_240829_frozen.csv solar_math_2x10_7b,HFv1 HellaSwag,86.31,,hf_open_llm_v1_240829_frozen.csv solar_math_2x10_7b,HFv1 MMLU,66.9,,hf_open_llm_v1_240829_frozen.csv solar_math_2x10_7b,HFv1 TruthfulQA,64.21,,hf_open_llm_v1_240829_frozen.csv solar_math_2x10_7b,HFv1 Winogrande,83.35,,hf_open_llm_v1_240829_frozen.csv solar_math_2x10_7b_v0_2,HF OpenLLM v1,74.25,,hf_open_llm_v1_240829_frozen.csv solar_math_2x10_7b_v0_2,HFv1 ARC,70.9,,hf_open_llm_v1_240829_frozen.csv solar_math_2x10_7b_v0_2,HFv1 GSM8K,64.9,,hf_open_llm_v1_240829_frozen.csv solar_math_2x10_7b_v0_2,HFv1 HellaSwag,88.29,,hf_open_llm_v1_240829_frozen.csv solar_math_2x10_7b_v0_2,HFv1 MMLU,66.25,,hf_open_llm_v1_240829_frozen.csv solar_math_2x10_7b_v0_2,HFv1 TruthfulQA,71.68,,hf_open_llm_v1_240829_frozen.csv solar_math_2x10_7b_v0_2,HFv1 Winogrande,83.5,,hf_open_llm_v1_240829_frozen.csv solar_megamerge_dare_10_7b_v1,HF OpenLLM v1,68.79,,hf_open_llm_v1_240829_frozen.csv solar_megamerge_dare_10_7b_v1,HFv1 ARC,66.13,,hf_open_llm_v1_240829_frozen.csv solar_megamerge_dare_10_7b_v1,HFv1 GSM8K,58.0,,hf_open_llm_v1_240829_frozen.csv solar_megamerge_dare_10_7b_v1,HFv1 HellaSwag,85.3,,hf_open_llm_v1_240829_frozen.csv solar_megamerge_dare_10_7b_v1,HFv1 MMLU,66.03,,hf_open_llm_v1_240829_frozen.csv solar_megamerge_dare_10_7b_v1,HFv1 TruthfulQA,54.33,,hf_open_llm_v1_240829_frozen.csv solar_megamerge_dare_10_7b_v1,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv solar_merge2_dpo,HF OpenLLM v1,65.6,,hf_open_llm_v1_240829_frozen.csv solar_merge2_dpo,HFv1 ARC,64.42,,hf_open_llm_v1_240829_frozen.csv solar_merge2_dpo,HFv1 GSM8K,48.82,,hf_open_llm_v1_240829_frozen.csv solar_merge2_dpo,HFv1 HellaSwag,82.73,,hf_open_llm_v1_240829_frozen.csv solar_merge2_dpo,HFv1 MMLU,64.57,,hf_open_llm_v1_240829_frozen.csv solar_merge2_dpo,HFv1 TruthfulQA,51.28,,hf_open_llm_v1_240829_frozen.csv solar_merge2_dpo,HFv1 Winogrande,81.77,,hf_open_llm_v1_240829_frozen.csv solar_merge_adapter_dpo_orca,HF OpenLLM v1,65.96,,hf_open_llm_v1_240829_frozen.csv solar_merge_adapter_dpo_orca,HFv1 ARC,63.91,,hf_open_llm_v1_240829_frozen.csv solar_merge_adapter_dpo_orca,HFv1 GSM8K,50.57,,hf_open_llm_v1_240829_frozen.csv solar_merge_adapter_dpo_orca,HFv1 HellaSwag,84.58,,hf_open_llm_v1_240829_frozen.csv solar_merge_adapter_dpo_orca,HFv1 MMLU,63.18,,hf_open_llm_v1_240829_frozen.csv solar_merge_adapter_dpo_orca,HFv1 TruthfulQA,51.49,,hf_open_llm_v1_240829_frozen.csv solar_merge_adapter_dpo_orca,HFv1 Winogrande,82.0,,hf_open_llm_v1_240829_frozen.csv solar_platypus_10_7b_v1,HF OpenLLM v1,58.62,,hf_open_llm_v1_240829_frozen.csv solar_platypus_10_7b_v1,HFv1 
ARC,61.69,,hf_open_llm_v1_240829_frozen.csv solar_platypus_10_7b_v1,HFv1 GSM8K,11.07,,hf_open_llm_v1_240829_frozen.csv solar_platypus_10_7b_v1,HFv1 HellaSwag,84.23,,hf_open_llm_v1_240829_frozen.csv solar_platypus_10_7b_v1,HFv1 MMLU,60.37,,hf_open_llm_v1_240829_frozen.csv solar_platypus_10_7b_v1,HFv1 TruthfulQA,51.58,,hf_open_llm_v1_240829_frozen.csv solar_platypus_10_7b_v1,HFv1 Winogrande,82.79,,hf_open_llm_v1_240829_frozen.csv solar_platypus_10_7b_v2,HF OpenLLM v1,55.25,,hf_open_llm_v1_240829_frozen.csv solar_platypus_10_7b_v2,HFv1 ARC,59.39,,hf_open_llm_v1_240829_frozen.csv solar_platypus_10_7b_v2,HFv1 GSM8K,4.02,,hf_open_llm_v1_240829_frozen.csv solar_platypus_10_7b_v2,HFv1 HellaSwag,83.57,,hf_open_llm_v1_240829_frozen.csv solar_platypus_10_7b_v2,HFv1 MMLU,59.93,,hf_open_llm_v1_240829_frozen.csv solar_platypus_10_7b_v2,HFv1 TruthfulQA,43.15,,hf_open_llm_v1_240829_frozen.csv solar_platypus_10_7b_v2,HFv1 Winogrande,81.45,,hf_open_llm_v1_240829_frozen.csv solarized_13b_dpo,HF OpenLLM v1,62.05,,hf_open_llm_v1_240829_frozen.csv solarized_13b_dpo,HFv1 ARC,62.71,,hf_open_llm_v1_240829_frozen.csv solarized_13b_dpo,HFv1 GSM8K,26.38,,hf_open_llm_v1_240829_frozen.csv solarized_13b_dpo,HFv1 HellaSwag,81.82,,hf_open_llm_v1_240829_frozen.csv solarized_13b_dpo,HFv1 MMLU,59.12,,hf_open_llm_v1_240829_frozen.csv solarized_13b_dpo,HFv1 TruthfulQA,66.25,,hf_open_llm_v1_240829_frozen.csv solarized_13b_dpo,HFv1 Winogrande,76.01,,hf_open_llm_v1_240829_frozen.csv solarized_18b_dpo,HF OpenLLM v1,67.88,,hf_open_llm_v1_240829_frozen.csv solarized_18b_dpo,HFv1 ARC,68.34,,hf_open_llm_v1_240829_frozen.csv solarized_18b_dpo,HFv1 GSM8K,40.26,,hf_open_llm_v1_240829_frozen.csv solarized_18b_dpo,HFv1 HellaSwag,87.79,,hf_open_llm_v1_240829_frozen.csv solarized_18b_dpo,HFv1 MMLU,63.89,,hf_open_llm_v1_240829_frozen.csv solarized_18b_dpo,HFv1 TruthfulQA,66.49,,hf_open_llm_v1_240829_frozen.csv solarized_18b_dpo,HFv1 Winogrande,80.51,,hf_open_llm_v1_240829_frozen.csv spaetzle_v44_7b,HF OpenLLM v1,66.34,,hf_open_llm_v1_240829_frozen.csv spaetzle_v44_7b,HFv1 ARC,64.59,,hf_open_llm_v1_240829_frozen.csv spaetzle_v44_7b,HFv1 GSM8K,53.68,,hf_open_llm_v1_240829_frozen.csv spaetzle_v44_7b,HFv1 HellaSwag,84.76,,hf_open_llm_v1_240829_frozen.csv spaetzle_v44_7b,HFv1 MMLU,61.76,,hf_open_llm_v1_240829_frozen.csv spaetzle_v44_7b,HFv1 TruthfulQA,54.45,,hf_open_llm_v1_240829_frozen.csv spaetzle_v44_7b,HFv1 Winogrande,78.77,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_orca_13b,HF OpenLLM v1,44.83,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_orca_13b,HFv1 ARC,44.37,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_orca_13b,HFv1 GSM8K,5.99,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_orca_13b,HFv1 HellaSwag,65.2,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_orca_13b,HFv1 MMLU,43.46,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_orca_13b,HFv1 TruthfulQA,45.94,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_orca_13b,HFv1 Winogrande,64.01,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_orca_airoboros_13b_0_10e,HF OpenLLM v1,30.36,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_orca_airoboros_13b_0_10e,HFv1 ARC,29.44,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_orca_airoboros_13b_0_10e,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_orca_airoboros_13b_0_10e,HFv1 HellaSwag,25.71,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_orca_airoboros_13b_0_10e,HFv1 MMLU,25.43,,hf_open_llm_v1_240829_frozen.csv 
speechless_codellama_orca_airoboros_13b_0_10e,HFv1 TruthfulQA,49.64,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_orca_airoboros_13b_0_10e,HFv1 Winogrande,51.93,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_orca_platypus_13b_0_10e,HF OpenLLM v1,29.83,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_orca_platypus_13b_0_10e,HFv1 ARC,28.75,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_orca_platypus_13b_0_10e,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_orca_platypus_13b_0_10e,HFv1 HellaSwag,25.88,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_orca_platypus_13b_0_10e,HFv1 MMLU,25.36,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_orca_platypus_13b_0_10e,HFv1 TruthfulQA,49.27,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_orca_platypus_13b_0_10e,HFv1 Winogrande,49.72,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_platypus_13b,HF OpenLLM v1,45.64,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_platypus_13b,HFv1 ARC,45.31,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_platypus_13b,HFv1 GSM8K,9.1,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_platypus_13b,HFv1 HellaSwag,68.63,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_platypus_13b,HFv1 MMLU,42.82,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_platypus_13b,HFv1 TruthfulQA,42.38,,hf_open_llm_v1_240829_frozen.csv speechless_codellama_platypus_13b,HFv1 Winogrande,65.59,,hf_open_llm_v1_240829_frozen.csv speechlessv1_nova_13b,HF OpenLLM v1,56.14,,hf_open_llm_v1_240829_frozen.csv speechlessv1_nova_13b,HFv1 ARC,61.77,,hf_open_llm_v1_240829_frozen.csv speechlessv1_nova_13b,HFv1 GSM8K,5.76,,hf_open_llm_v1_240829_frozen.csv speechlessv1_nova_13b,HFv1 HellaSwag,82.68,,hf_open_llm_v1_240829_frozen.csv speechlessv1_nova_13b,HFv1 MMLU,57.75,,hf_open_llm_v1_240829_frozen.csv speechlessv1_nova_13b,HFv1 TruthfulQA,51.44,,hf_open_llm_v1_240829_frozen.csv speechlessv1_nova_13b,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv sphinx_7b_model_stock,HF OpenLLM v1,73.2,,hf_open_llm_v1_240829_frozen.csv sphinx_7b_model_stock,HFv1 ARC,70.9,,hf_open_llm_v1_240829_frozen.csv sphinx_7b_model_stock,HFv1 GSM8K,68.16,,hf_open_llm_v1_240829_frozen.csv sphinx_7b_model_stock,HFv1 HellaSwag,87.2,,hf_open_llm_v1_240829_frozen.csv sphinx_7b_model_stock,HFv1 MMLU,64.8,,hf_open_llm_v1_240829_frozen.csv sphinx_7b_model_stock,HFv1 TruthfulQA,65.12,,hf_open_llm_v1_240829_frozen.csv sphinx_7b_model_stock,HFv1 Winogrande,83.03,,hf_open_llm_v1_240829_frozen.csv spin_phi2,HF OpenLLM v1,61.67,,hf_open_llm_v1_240829_frozen.csv spin_phi2,HFv1 ARC,63.14,,hf_open_llm_v1_240829_frozen.csv spin_phi2,HFv1 GSM8K,54.28,,hf_open_llm_v1_240829_frozen.csv spin_phi2,HFv1 HellaSwag,75.56,,hf_open_llm_v1_240829_frozen.csv spin_phi2,HFv1 MMLU,57.08,,hf_open_llm_v1_240829_frozen.csv spin_phi2,HFv1 TruthfulQA,45.77,,hf_open_llm_v1_240829_frozen.csv spin_phi2,HFv1 Winogrande,74.19,,hf_open_llm_v1_240829_frozen.csv srbosgpt_7b_slerp,HF OpenLLM v1,50.19,,hf_open_llm_v1_240829_frozen.csv srbosgpt_7b_slerp,HFv1 ARC,49.15,,hf_open_llm_v1_240829_frozen.csv srbosgpt_7b_slerp,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv srbosgpt_7b_slerp,HFv1 HellaSwag,62.28,,hf_open_llm_v1_240829_frozen.csv srbosgpt_7b_slerp,HFv1 MMLU,61.95,,hf_open_llm_v1_240829_frozen.csv srbosgpt_7b_slerp,HFv1 TruthfulQA,60.23,,hf_open_llm_v1_240829_frozen.csv srbosgpt_7b_slerp,HFv1 Winogrande,66.54,,hf_open_llm_v1_240829_frozen.csv stable_platypus2_13b_qlora_0_80_epoch,HF OpenLLM 
v1,55.56,,hf_open_llm_v1_240829_frozen.csv stable_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,62.29,,hf_open_llm_v1_240829_frozen.csv stable_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,3.56,,hf_open_llm_v1_240829_frozen.csv stable_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,82.46,,hf_open_llm_v1_240829_frozen.csv stable_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,57.09,,hf_open_llm_v1_240829_frozen.csv stable_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,51.41,,hf_open_llm_v1_240829_frozen.csv stable_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,76.56,,hf_open_llm_v1_240829_frozen.csv stable_vicuna_13b,HF OpenLLM v1,51.64,,hf_open_llm_v1_240829_frozen.csv stable_vicuna_13b,HFv1 ARC,53.41,,hf_open_llm_v1_240829_frozen.csv stable_vicuna_13b,HFv1 GSM8K,4.09,,hf_open_llm_v1_240829_frozen.csv stable_vicuna_13b,HFv1 HellaSwag,78.57,,hf_open_llm_v1_240829_frozen.csv stable_vicuna_13b,HFv1 MMLU,50.37,,hf_open_llm_v1_240829_frozen.csv stable_vicuna_13b,HFv1 TruthfulQA,48.38,,hf_open_llm_v1_240829_frozen.csv stable_vicuna_13b,HFv1 Winogrande,75.22,,hf_open_llm_v1_240829_frozen.csv stablelm_2_12b,HF OpenLLM v1,63.48,,hf_open_llm_v1_240829_frozen.csv stablelm_2_12b,HFv1 ARC,58.45,,hf_open_llm_v1_240829_frozen.csv stablelm_2_12b,HFv1 GSM8K,56.03,,hf_open_llm_v1_240829_frozen.csv stablelm_2_12b,HFv1 HellaSwag,84.33,,hf_open_llm_v1_240829_frozen.csv stablelm_2_12b,HFv1 MMLU,62.04,,hf_open_llm_v1_240829_frozen.csv stablelm_2_12b,HFv1 TruthfulQA,42.16,,hf_open_llm_v1_240829_frozen.csv stablelm_2_12b,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv stablelm_2_12b_chat,HF OpenLLM v1,68.38,,hf_open_llm_v1_240829_frozen.csv stablelm_2_12b_chat,HFv1 ARC,64.85,,hf_open_llm_v1_240829_frozen.csv stablelm_2_12b_chat,HFv1 GSM8K,57.85,,hf_open_llm_v1_240829_frozen.csv stablelm_2_12b_chat,HFv1 HellaSwag,85.96,,hf_open_llm_v1_240829_frozen.csv stablelm_2_12b_chat,HFv1 MMLU,61.06,,hf_open_llm_v1_240829_frozen.csv stablelm_2_12b_chat,HFv1 TruthfulQA,62.01,,hf_open_llm_v1_240829_frozen.csv stablelm_2_12b_chat,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv stablelm_2_1_6b,HF OpenLLM v1,45.25,,hf_open_llm_v1_240829_frozen.csv stablelm_2_1_6b,HFv1 ARC,43.34,,hf_open_llm_v1_240829_frozen.csv stablelm_2_1_6b,HFv1 GSM8K,17.44,,hf_open_llm_v1_240829_frozen.csv stablelm_2_1_6b,HFv1 HellaSwag,70.45,,hf_open_llm_v1_240829_frozen.csv stablelm_2_1_6b,HFv1 MMLU,38.95,,hf_open_llm_v1_240829_frozen.csv stablelm_2_1_6b,HFv1 TruthfulQA,36.78,,hf_open_llm_v1_240829_frozen.csv stablelm_2_1_6b,HFv1 Winogrande,64.56,,hf_open_llm_v1_240829_frozen.csv stablelm_2_1_6b_chat,HF OpenLLM v1,50.71,,hf_open_llm_v1_240829_frozen.csv stablelm_2_1_6b_chat,HFv1 ARC,43.52,,hf_open_llm_v1_240829_frozen.csv stablelm_2_1_6b_chat,HFv1 GSM8K,38.82,,hf_open_llm_v1_240829_frozen.csv stablelm_2_1_6b_chat,HFv1 HellaSwag,69.24,,hf_open_llm_v1_240829_frozen.csv stablelm_2_1_6b_chat,HFv1 MMLU,41.47,,hf_open_llm_v1_240829_frozen.csv stablelm_2_1_6b_chat,HFv1 TruthfulQA,46.5,,hf_open_llm_v1_240829_frozen.csv stablelm_2_1_6b_chat,HFv1 Winogrande,64.72,,hf_open_llm_v1_240829_frozen.csv stablelm_2_zephyr_1_6b,HF OpenLLM v1,49.99,,hf_open_llm_v1_240829_frozen.csv stablelm_2_zephyr_1_6b,HFv1 ARC,43.69,,hf_open_llm_v1_240829_frozen.csv stablelm_2_zephyr_1_6b,HFv1 GSM8K,35.33,,hf_open_llm_v1_240829_frozen.csv stablelm_2_zephyr_1_6b,HFv1 HellaSwag,69.3,,hf_open_llm_v1_240829_frozen.csv stablelm_2_zephyr_1_6b,HFv1 MMLU,42.03,,hf_open_llm_v1_240829_frozen.csv stablelm_2_zephyr_1_6b,HFv1 TruthfulQA,45.11,,hf_open_llm_v1_240829_frozen.csv stablelm_2_zephyr_1_6b,HFv1 
Winogrande,64.48,,hf_open_llm_v1_240829_frozen.csv stablelm_3b_4e1t,HF OpenLLM v1,46.58,,hf_open_llm_v1_240829_frozen.csv stablelm_3b_4e1t,HFv1 ARC,46.59,,hf_open_llm_v1_240829_frozen.csv stablelm_3b_4e1t,HFv1 GSM8K,3.34,,hf_open_llm_v1_240829_frozen.csv stablelm_3b_4e1t,HFv1 HellaSwag,75.94,,hf_open_llm_v1_240829_frozen.csv stablelm_3b_4e1t,HFv1 MMLU,45.23,,hf_open_llm_v1_240829_frozen.csv stablelm_3b_4e1t,HFv1 TruthfulQA,37.2,,hf_open_llm_v1_240829_frozen.csv stablelm_3b_4e1t,HFv1 Winogrande,71.19,,hf_open_llm_v1_240829_frozen.csv stablelm_base_alpha_3b,HF OpenLLM v1,31.5,,hf_open_llm_v1_240829_frozen.csv stablelm_base_alpha_3b,HFv1 ARC,26.45,,hf_open_llm_v1_240829_frozen.csv stablelm_base_alpha_3b,HFv1 GSM8K,0.45,,hf_open_llm_v1_240829_frozen.csv stablelm_base_alpha_3b,HFv1 HellaSwag,42.24,,hf_open_llm_v1_240829_frozen.csv stablelm_base_alpha_3b,HFv1 MMLU,25.43,,hf_open_llm_v1_240829_frozen.csv stablelm_base_alpha_3b,HFv1 TruthfulQA,40.5,,hf_open_llm_v1_240829_frozen.csv stablelm_base_alpha_3b,HFv1 Winogrande,53.91,,hf_open_llm_v1_240829_frozen.csv stablelm_base_alpha_7b,HF OpenLLM v1,34.37,,hf_open_llm_v1_240829_frozen.csv stablelm_base_alpha_7b,HFv1 ARC,32.0,,hf_open_llm_v1_240829_frozen.csv stablelm_base_alpha_7b,HFv1 GSM8K,0.61,,hf_open_llm_v1_240829_frozen.csv stablelm_base_alpha_7b,HFv1 HellaSwag,51.78,,hf_open_llm_v1_240829_frozen.csv stablelm_base_alpha_7b,HFv1 MMLU,26.21,,hf_open_llm_v1_240829_frozen.csv stablelm_base_alpha_7b,HFv1 TruthfulQA,40.19,,hf_open_llm_v1_240829_frozen.csv stablelm_base_alpha_7b,HFv1 Winogrande,55.41,,hf_open_llm_v1_240829_frozen.csv stablelm_base_alpha_7b_v2,HF OpenLLM v1,46.18,,hf_open_llm_v1_240829_frozen.csv stablelm_base_alpha_7b_v2,HFv1 ARC,47.35,,hf_open_llm_v1_240829_frozen.csv stablelm_base_alpha_7b_v2,HFv1 GSM8K,2.58,,hf_open_llm_v1_240829_frozen.csv stablelm_base_alpha_7b_v2,HFv1 HellaSwag,77.08,,hf_open_llm_v1_240829_frozen.csv stablelm_base_alpha_7b_v2,HFv1 MMLU,45.1,,hf_open_llm_v1_240829_frozen.csv stablelm_base_alpha_7b_v2,HFv1 TruthfulQA,36.46,,hf_open_llm_v1_240829_frozen.csv stablelm_base_alpha_7b_v2,HFv1 Winogrande,68.51,,hf_open_llm_v1_240829_frozen.csv stablelm_zephyr_3b,HF OpenLLM v1,53.43,,hf_open_llm_v1_240829_frozen.csv stablelm_zephyr_3b,HFv1 ARC,46.08,,hf_open_llm_v1_240829_frozen.csv stablelm_zephyr_3b,HFv1 GSM8K,42.15,,hf_open_llm_v1_240829_frozen.csv stablelm_zephyr_3b,HFv1 HellaSwag,74.16,,hf_open_llm_v1_240829_frozen.csv stablelm_zephyr_3b,HFv1 MMLU,46.17,,hf_open_llm_v1_240829_frozen.csv stablelm_zephyr_3b,HFv1 TruthfulQA,46.49,,hf_open_llm_v1_240829_frozen.csv stablelm_zephyr_3b,HFv1 Winogrande,65.51,,hf_open_llm_v1_240829_frozen.csv starcoder,HF OpenLLM v1,35.73,,hf_open_llm_v1_240829_frozen.csv starcoder,HFv1 ARC,30.29,,hf_open_llm_v1_240829_frozen.csv starcoder,HFv1 GSM8K,9.17,,hf_open_llm_v1_240829_frozen.csv starcoder,HFv1 HellaSwag,47.88,,hf_open_llm_v1_240829_frozen.csv starcoder,HFv1 MMLU,29.47,,hf_open_llm_v1_240829_frozen.csv starcoder,HFv1 TruthfulQA,41.3,,hf_open_llm_v1_240829_frozen.csv starcoder,HFv1 Winogrande,56.27,,hf_open_llm_v1_240829_frozen.csv starcoder2_15b,HF OpenLLM v1,52.79,,hf_open_llm_v1_240829_frozen.csv starcoder2_15b,HFv1 ARC,47.35,,hf_open_llm_v1_240829_frozen.csv starcoder2_15b,HFv1 GSM8K,52.24,,hf_open_llm_v1_240829_frozen.csv starcoder2_15b,HFv1 HellaSwag,64.09,,hf_open_llm_v1_240829_frozen.csv starcoder2_15b,HFv1 MMLU,51.35,,hf_open_llm_v1_240829_frozen.csv starcoder2_15b,HFv1 TruthfulQA,37.87,,hf_open_llm_v1_240829_frozen.csv starcoder2_15b,HFv1 
Winogrande,63.85,,hf_open_llm_v1_240829_frozen.csv starcoder2_3b,HF OpenLLM v1,39.25,,hf_open_llm_v1_240829_frozen.csv starcoder2_3b,HFv1 ARC,34.56,,hf_open_llm_v1_240829_frozen.csv starcoder2_3b,HFv1 GSM8K,19.64,,hf_open_llm_v1_240829_frozen.csv starcoder2_3b,HFv1 HellaSwag,47.62,,hf_open_llm_v1_240829_frozen.csv starcoder2_3b,HFv1 MMLU,38.65,,hf_open_llm_v1_240829_frozen.csv starcoder2_3b,HFv1 TruthfulQA,40.49,,hf_open_llm_v1_240829_frozen.csv starcoder2_3b,HFv1 Winogrande,54.54,,hf_open_llm_v1_240829_frozen.csv starcoder2_7b,HF OpenLLM v1,42.95,,hf_open_llm_v1_240829_frozen.csv starcoder2_7b,HFv1 ARC,38.31,,hf_open_llm_v1_240829_frozen.csv starcoder2_7b,HFv1 GSM8K,25.09,,hf_open_llm_v1_240829_frozen.csv starcoder2_7b,HFv1 HellaSwag,51.91,,hf_open_llm_v1_240829_frozen.csv starcoder2_7b,HFv1 MMLU,41.21,,hf_open_llm_v1_240829_frozen.csv starcoder2_7b,HFv1 TruthfulQA,41.99,,hf_open_llm_v1_240829_frozen.csv starcoder2_7b,HFv1 Winogrande,59.19,,hf_open_llm_v1_240829_frozen.csv starcoderbase,HF OpenLLM v1,35.55,,hf_open_llm_v1_240829_frozen.csv starcoderbase,HFv1 ARC,30.29,,hf_open_llm_v1_240829_frozen.csv starcoderbase,HFv1 GSM8K,7.88,,hf_open_llm_v1_240829_frozen.csv starcoderbase,HFv1 HellaSwag,47.21,,hf_open_llm_v1_240829_frozen.csv starcoderbase,HFv1 MMLU,32.12,,hf_open_llm_v1_240829_frozen.csv starcoderbase,HFv1 TruthfulQA,40.02,,hf_open_llm_v1_240829_frozen.csv starcoderbase,HFv1 Winogrande,55.8,,hf_open_llm_v1_240829_frozen.csv starcoderbase_1b,HF OpenLLM v1,30.06,,hf_open_llm_v1_240829_frozen.csv starcoderbase_1b,HFv1 ARC,22.7,,hf_open_llm_v1_240829_frozen.csv starcoderbase_1b,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv starcoderbase_1b,HFv1 HellaSwag,34.31,,hf_open_llm_v1_240829_frozen.csv starcoderbase_1b,HFv1 MMLU,26.67,,hf_open_llm_v1_240829_frozen.csv starcoderbase_1b,HFv1 TruthfulQA,45.79,,hf_open_llm_v1_240829_frozen.csv starcoderbase_1b,HFv1 Winogrande,49.96,,hf_open_llm_v1_240829_frozen.csv starcoderbase_3b,HF OpenLLM v1,31.38,,hf_open_llm_v1_240829_frozen.csv starcoderbase_3b,HFv1 ARC,25.85,,hf_open_llm_v1_240829_frozen.csv starcoderbase_3b,HFv1 GSM8K,1.74,,hf_open_llm_v1_240829_frozen.csv starcoderbase_3b,HFv1 HellaSwag,39.11,,hf_open_llm_v1_240829_frozen.csv starcoderbase_3b,HFv1 MMLU,27.35,,hf_open_llm_v1_240829_frozen.csv starcoderbase_3b,HFv1 TruthfulQA,43.05,,hf_open_llm_v1_240829_frozen.csv starcoderbase_3b,HFv1 Winogrande,51.14,,hf_open_llm_v1_240829_frozen.csv starcoderbase_7b,HF OpenLLM v1,33.75,,hf_open_llm_v1_240829_frozen.csv starcoderbase_7b,HFv1 ARC,29.86,,hf_open_llm_v1_240829_frozen.csv starcoderbase_7b,HFv1 GSM8K,5.46,,hf_open_llm_v1_240829_frozen.csv starcoderbase_7b,HFv1 HellaSwag,43.87,,hf_open_llm_v1_240829_frozen.csv starcoderbase_7b,HFv1 MMLU,28.45,,hf_open_llm_v1_240829_frozen.csv starcoderbase_7b,HFv1 TruthfulQA,40.46,,hf_open_llm_v1_240829_frozen.csv starcoderbase_7b,HFv1 Winogrande,54.38,,hf_open_llm_v1_240829_frozen.csv starling_7b,HF OpenLLM v1,50.73,,hf_open_llm_v1_240829_frozen.csv starling_7b,HFv1 ARC,51.02,,hf_open_llm_v1_240829_frozen.csv starling_7b,HFv1 GSM8K,10.08,,hf_open_llm_v1_240829_frozen.csv starling_7b,HFv1 HellaSwag,76.77,,hf_open_llm_v1_240829_frozen.csv starling_7b,HFv1 MMLU,47.75,,hf_open_llm_v1_240829_frozen.csv starling_7b,HFv1 TruthfulQA,48.18,,hf_open_llm_v1_240829_frozen.csv starling_7b,HFv1 Winogrande,70.56,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_alpha_expo,HF OpenLLM v1,66.94,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_alpha_expo,HFv1 ARC,63.91,,hf_open_llm_v1_240829_frozen.csv 
starling_lm_7b_alpha_expo,HFv1 GSM8K,61.56,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_alpha_expo,HFv1 HellaSwag,84.79,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_alpha_expo,HFv1 MMLU,64.64,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_alpha_expo,HFv1 TruthfulQA,46.38,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_alpha_expo,HFv1 Winogrande,80.35,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_beta,HF OpenLLM v1,69.88,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_beta,HFv1 ARC,67.24,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_beta,HFv1 GSM8K,66.64,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_beta,HFv1 HellaSwag,83.47,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_beta,HFv1 MMLU,65.14,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_beta,HFv1 TruthfulQA,55.47,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_beta,HFv1 Winogrande,81.29,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_beta_expo,HF OpenLLM v1,70.17,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_beta_expo,HFv1 ARC,67.92,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_beta_expo,HFv1 GSM8K,65.66,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_beta_expo,HFv1 HellaSwag,83.62,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_beta_expo,HFv1 MMLU,65.3,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_beta_expo,HFv1 TruthfulQA,57.16,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_beta_expo,HFv1 Winogrande,81.37,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_beta_laser_dpo,HF OpenLLM v1,70.14,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_beta_laser_dpo,HFv1 ARC,67.41,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_beta_laser_dpo,HFv1 GSM8K,67.93,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_beta_laser_dpo,HFv1 HellaSwag,83.38,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_beta_laser_dpo,HFv1 MMLU,65.29,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_beta_laser_dpo,HFv1 TruthfulQA,55.47,,hf_open_llm_v1_240829_frozen.csv starling_lm_7b_beta_laser_dpo,HFv1 Winogrande,81.37,,hf_open_llm_v1_240829_frozen.csv starlinghermes_2_5_mistral_7b_slerp,HF OpenLLM v1,68.53,,hf_open_llm_v1_240829_frozen.csv starlinghermes_2_5_mistral_7b_slerp,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv starlinghermes_2_5_mistral_7b_slerp,HFv1 GSM8K,65.96,,hf_open_llm_v1_240829_frozen.csv starlinghermes_2_5_mistral_7b_slerp,HFv1 HellaSwag,85.18,,hf_open_llm_v1_240829_frozen.csv starlinghermes_2_5_mistral_7b_slerp,HFv1 MMLU,64.72,,hf_open_llm_v1_240829_frozen.csv starlinghermes_2_5_mistral_7b_slerp,HFv1 TruthfulQA,49.56,,hf_open_llm_v1_240829_frozen.csv starlinghermes_2_5_mistral_7b_slerp,HFv1 Winogrande,79.72,,hf_open_llm_v1_240829_frozen.csv stealth_v2,HF OpenLLM v1,76.37,,hf_open_llm_v1_240829_frozen.csv stealth_v2,HFv1 ARC,73.89,,hf_open_llm_v1_240829_frozen.csv stealth_v2,HFv1 GSM8K,69.67,,hf_open_llm_v1_240829_frozen.csv stealth_v2,HFv1 HellaSwag,89.26,,hf_open_llm_v1_240829_frozen.csv stealth_v2,HFv1 MMLU,64.94,,hf_open_llm_v1_240829_frozen.csv stealth_v2,HFv1 TruthfulQA,72.47,,hf_open_llm_v1_240829_frozen.csv stealth_v2,HFv1 Winogrande,88.0,,hf_open_llm_v1_240829_frozen.csv stellarx_4b_v0,HF OpenLLM v1,37.31,,hf_open_llm_v1_240829_frozen.csv stellarx_4b_v0,HFv1 ARC,36.95,,hf_open_llm_v1_240829_frozen.csv stellarx_4b_v0,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv stellarx_4b_v0,HFv1 HellaSwag,61.9,,hf_open_llm_v1_240829_frozen.csv stellarx_4b_v0,HFv1 MMLU,26.85,,hf_open_llm_v1_240829_frozen.csv stellarx_4b_v0,HFv1 TruthfulQA,34.3,,hf_open_llm_v1_240829_frozen.csv stellarx_4b_v0,HFv1 
Winogrande,63.85,,hf_open_llm_v1_240829_frozen.csv stellarx_4b_v0_2,HF OpenLLM v1,36.15,,hf_open_llm_v1_240829_frozen.csv stellarx_4b_v0_2,HFv1 ARC,34.64,,hf_open_llm_v1_240829_frozen.csv stellarx_4b_v0_2,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv stellarx_4b_v0_2,HFv1 HellaSwag,56.74,,hf_open_llm_v1_240829_frozen.csv stellarx_4b_v0_2,HFv1 MMLU,25.55,,hf_open_llm_v1_240829_frozen.csv stellarx_4b_v0_2,HFv1 TruthfulQA,38.55,,hf_open_llm_v1_240829_frozen.csv stellarx_4b_v0_2,HFv1 Winogrande,61.4,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v1,HF OpenLLM v1,74.2,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v1,HFv1 ARC,70.9,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v1,HFv1 GSM8K,64.14,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v1,HFv1 HellaSwag,88.41,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v1,HFv1 MMLU,66.32,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v1,HFv1 TruthfulQA,71.71,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v1,HFv1 Winogrande,83.74,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v2,HF OpenLLM v1,74.21,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v2,HFv1 ARC,71.08,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v2,HFv1 GSM8K,63.84,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v2,HFv1 HellaSwag,88.6,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v2,HFv1 MMLU,66.23,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v2,HFv1 TruthfulQA,72.01,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v2,HFv1 Winogrande,83.5,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v3,HF OpenLLM v1,74.01,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v3,HFv1 ARC,70.99,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v3,HFv1 GSM8K,63.23,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v3,HFv1 HellaSwag,88.57,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v3,HFv1 MMLU,66.13,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v3,HFv1 TruthfulQA,71.94,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v3,HFv1 Winogrande,83.19,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v4,HF OpenLLM v1,74.29,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v4,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v4,HFv1 GSM8K,64.44,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v4,HFv1 HellaSwag,88.5,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v4,HFv1 MMLU,66.24,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v4,HFv1 TruthfulQA,71.89,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v4,HFv1 Winogrande,83.43,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v5,HF OpenLLM v1,74.41,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v5,HFv1 ARC,70.99,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v5,HFv1 GSM8K,65.2,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v5,HFv1 HellaSwag,88.48,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v5,HFv1 MMLU,66.34,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v5,HFv1 TruthfulQA,71.84,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v5,HFv1 Winogrande,83.58,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v6,HF OpenLLM v1,74.31,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v6,HFv1 ARC,71.16,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v6,HFv1 GSM8K,64.52,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v6,HFv1 HellaSwag,88.5,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v6,HFv1 MMLU,66.31,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v6,HFv1 TruthfulQA,71.96,,hf_open_llm_v1_240829_frozen.csv stopcarbon_10_7b_v6,HFv1 
Winogrande,83.43,,hf_open_llm_v1_240829_frozen.csv storytime_13b,HF OpenLLM v1,56.64,,hf_open_llm_v1_240829_frozen.csv storytime_13b,HFv1 ARC,62.03,,hf_open_llm_v1_240829_frozen.csv storytime_13b,HFv1 GSM8K,8.34,,hf_open_llm_v1_240829_frozen.csv storytime_13b,HFv1 HellaSwag,83.96,,hf_open_llm_v1_240829_frozen.csv storytime_13b,HFv1 MMLU,57.48,,hf_open_llm_v1_240829_frozen.csv storytime_13b,HFv1 TruthfulQA,52.5,,hf_open_llm_v1_240829_frozen.csv storytime_13b,HFv1 Winogrande,75.53,,hf_open_llm_v1_240829_frozen.csv superaligned_jawade,HF OpenLLM v1,70.86,,hf_open_llm_v1_240829_frozen.csv superaligned_jawade,HFv1 ARC,71.59,,hf_open_llm_v1_240829_frozen.csv superaligned_jawade,HFv1 GSM8K,49.2,,hf_open_llm_v1_240829_frozen.csv superaligned_jawade,HFv1 HellaSwag,90.58,,hf_open_llm_v1_240829_frozen.csv superaligned_jawade,HFv1 MMLU,60.81,,hf_open_llm_v1_240829_frozen.csv superaligned_jawade,HFv1 TruthfulQA,69.17,,hf_open_llm_v1_240829_frozen.csv superaligned_jawade,HFv1 Winogrande,83.82,,hf_open_llm_v1_240829_frozen.csv sus_chat_34b,HF OpenLLM v1,73.22,,hf_open_llm_v1_240829_frozen.csv sus_chat_34b,HFv1 ARC,66.3,,hf_open_llm_v1_240829_frozen.csv sus_chat_34b,HFv1 GSM8K,72.18,,hf_open_llm_v1_240829_frozen.csv sus_chat_34b,HFv1 HellaSwag,83.91,,hf_open_llm_v1_240829_frozen.csv sus_chat_34b,HFv1 MMLU,76.41,,hf_open_llm_v1_240829_frozen.csv sus_chat_34b,HFv1 TruthfulQA,57.04,,hf_open_llm_v1_240829_frozen.csv sus_chat_34b,HFv1 Winogrande,83.5,,hf_open_llm_v1_240829_frozen.csv sydney_overthinker_13b_hf,HF OpenLLM v1,54.94,,hf_open_llm_v1_240829_frozen.csv sydney_overthinker_13b_hf,HFv1 ARC,58.96,,hf_open_llm_v1_240829_frozen.csv sydney_overthinker_13b_hf,HFv1 GSM8K,18.88,,hf_open_llm_v1_240829_frozen.csv sydney_overthinker_13b_hf,HFv1 HellaSwag,80.85,,hf_open_llm_v1_240829_frozen.csv sydney_overthinker_13b_hf,HFv1 MMLU,51.28,,hf_open_llm_v1_240829_frozen.csv sydney_overthinker_13b_hf,HFv1 TruthfulQA,45.7,,hf_open_llm_v1_240829_frozen.csv sydney_overthinker_13b_hf,HFv1 Winogrande,73.95,,hf_open_llm_v1_240829_frozen.csv synatra_10_7b_v0_4,HF OpenLLM v1,65.48,,hf_open_llm_v1_240829_frozen.csv synatra_10_7b_v0_4,HFv1 ARC,64.93,,hf_open_llm_v1_240829_frozen.csv synatra_10_7b_v0_4,HFv1 GSM8K,50.04,,hf_open_llm_v1_240829_frozen.csv synatra_10_7b_v0_4,HFv1 HellaSwag,82.47,,hf_open_llm_v1_240829_frozen.csv synatra_10_7b_v0_4,HFv1 MMLU,62.5,,hf_open_llm_v1_240829_frozen.csv synatra_10_7b_v0_4,HFv1 TruthfulQA,51.11,,hf_open_llm_v1_240829_frozen.csv synatra_10_7b_v0_4,HFv1 Winogrande,81.85,,hf_open_llm_v1_240829_frozen.csv synatra_11b_testbench,HF OpenLLM v1,56.17,,hf_open_llm_v1_240829_frozen.csv synatra_11b_testbench,HFv1 ARC,57.34,,hf_open_llm_v1_240829_frozen.csv synatra_11b_testbench,HFv1 GSM8K,17.74,,hf_open_llm_v1_240829_frozen.csv synatra_11b_testbench,HFv1 HellaSwag,78.66,,hf_open_llm_v1_240829_frozen.csv synatra_11b_testbench,HFv1 MMLU,55.56,,hf_open_llm_v1_240829_frozen.csv synatra_11b_testbench,HFv1 TruthfulQA,51.97,,hf_open_llm_v1_240829_frozen.csv synatra_11b_testbench,HFv1 Winogrande,75.77,,hf_open_llm_v1_240829_frozen.csv synatra_7b_v0_3_dpo,HF OpenLLM v1,60.55,,hf_open_llm_v1_240829_frozen.csv synatra_7b_v0_3_dpo,HFv1 ARC,62.8,,hf_open_llm_v1_240829_frozen.csv synatra_7b_v0_3_dpo,HFv1 GSM8K,23.73,,hf_open_llm_v1_240829_frozen.csv synatra_7b_v0_3_dpo,HFv1 HellaSwag,82.58,,hf_open_llm_v1_240829_frozen.csv synatra_7b_v0_3_dpo,HFv1 MMLU,61.46,,hf_open_llm_v1_240829_frozen.csv synatra_7b_v0_3_dpo,HFv1 TruthfulQA,56.46,,hf_open_llm_v1_240829_frozen.csv synatra_7b_v0_3_dpo,HFv1 
Winogrande,76.24,,hf_open_llm_v1_240829_frozen.csv synatra_7b_v0_3_rp,HF OpenLLM v1,59.26,,hf_open_llm_v1_240829_frozen.csv synatra_7b_v0_3_rp,HFv1 ARC,62.2,,hf_open_llm_v1_240829_frozen.csv synatra_7b_v0_3_rp,HFv1 GSM8K,21.15,,hf_open_llm_v1_240829_frozen.csv synatra_7b_v0_3_rp,HFv1 HellaSwag,82.29,,hf_open_llm_v1_240829_frozen.csv synatra_7b_v0_3_rp,HFv1 MMLU,60.8,,hf_open_llm_v1_240829_frozen.csv synatra_7b_v0_3_rp,HFv1 TruthfulQA,52.64,,hf_open_llm_v1_240829_frozen.csv synatra_7b_v0_3_rp,HFv1 Winogrande,76.48,,hf_open_llm_v1_240829_frozen.csv synatra_rp_orca_2_7b_v0_1,HF OpenLLM v1,59.65,,hf_open_llm_v1_240829_frozen.csv synatra_rp_orca_2_7b_v0_1,HFv1 ARC,57.68,,hf_open_llm_v1_240829_frozen.csv synatra_rp_orca_2_7b_v0_1,HFv1 GSM8K,39.65,,hf_open_llm_v1_240829_frozen.csv synatra_rp_orca_2_7b_v0_1,HFv1 HellaSwag,77.37,,hf_open_llm_v1_240829_frozen.csv synatra_rp_orca_2_7b_v0_1,HFv1 MMLU,56.1,,hf_open_llm_v1_240829_frozen.csv synatra_rp_orca_2_7b_v0_1,HFv1 TruthfulQA,52.52,,hf_open_llm_v1_240829_frozen.csv synatra_rp_orca_2_7b_v0_1,HFv1 Winogrande,74.59,,hf_open_llm_v1_240829_frozen.csv synatra_v0_1_7b_instruct,HF OpenLLM v1,55.86,,hf_open_llm_v1_240829_frozen.csv synatra_v0_1_7b_instruct,HFv1 ARC,55.29,,hf_open_llm_v1_240829_frozen.csv synatra_v0_1_7b_instruct,HFv1 GSM8K,19.41,,hf_open_llm_v1_240829_frozen.csv synatra_v0_1_7b_instruct,HFv1 HellaSwag,76.63,,hf_open_llm_v1_240829_frozen.csv synatra_v0_1_7b_instruct,HFv1 MMLU,55.29,,hf_open_llm_v1_240829_frozen.csv synatra_v0_1_7b_instruct,HFv1 TruthfulQA,55.76,,hf_open_llm_v1_240829_frozen.csv synatra_v0_1_7b_instruct,HFv1 Winogrande,72.77,,hf_open_llm_v1_240829_frozen.csv systemconfighermes_7b,HF OpenLLM v1,68.47,,hf_open_llm_v1_240829_frozen.csv systemconfighermes_7b,HFv1 ARC,65.19,,hf_open_llm_v1_240829_frozen.csv systemconfighermes_7b,HFv1 GSM8K,61.49,,hf_open_llm_v1_240829_frozen.csv systemconfighermes_7b,HFv1 HellaSwag,84.41,,hf_open_llm_v1_240829_frozen.csv systemconfighermes_7b,HFv1 MMLU,61.89,,hf_open_llm_v1_240829_frozen.csv systemconfighermes_7b,HFv1 TruthfulQA,60.11,,hf_open_llm_v1_240829_frozen.csv systemconfighermes_7b,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv systemhermes_2_7b,HF OpenLLM v1,67.92,,hf_open_llm_v1_240829_frozen.csv systemhermes_2_7b,HFv1 ARC,65.02,,hf_open_llm_v1_240829_frozen.csv systemhermes_2_7b,HFv1 GSM8K,61.56,,hf_open_llm_v1_240829_frozen.csv systemhermes_2_7b,HFv1 HellaSwag,84.05,,hf_open_llm_v1_240829_frozen.csv systemhermes_2_7b,HFv1 MMLU,63.16,,hf_open_llm_v1_240829_frozen.csv systemhermes_2_7b,HFv1 TruthfulQA,56.42,,hf_open_llm_v1_240829_frozen.csv systemhermes_2_7b,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv systemhermes_7b,HF OpenLLM v1,66.86,,hf_open_llm_v1_240829_frozen.csv systemhermes_7b,HFv1 ARC,64.76,,hf_open_llm_v1_240829_frozen.csv systemhermes_7b,HFv1 GSM8K,58.83,,hf_open_llm_v1_240829_frozen.csv systemhermes_7b,HFv1 HellaSwag,83.68,,hf_open_llm_v1_240829_frozen.csv systemhermes_7b,HFv1 MMLU,63.23,,hf_open_llm_v1_240829_frozen.csv systemhermes_7b,HFv1 TruthfulQA,52.81,,hf_open_llm_v1_240829_frozen.csv systemhermes_7b,HFv1 Winogrande,77.82,,hf_open_llm_v1_240829_frozen.csv taiwan_llm_8x7b_dpo,HF OpenLLM v1,73.09,,hf_open_llm_v1_240829_frozen.csv taiwan_llm_8x7b_dpo,HFv1 ARC,70.99,,hf_open_llm_v1_240829_frozen.csv taiwan_llm_8x7b_dpo,HFv1 GSM8K,71.11,,hf_open_llm_v1_240829_frozen.csv taiwan_llm_8x7b_dpo,HFv1 HellaSwag,87.21,,hf_open_llm_v1_240829_frozen.csv taiwan_llm_8x7b_dpo,HFv1 MMLU,72.43,,hf_open_llm_v1_240829_frozen.csv taiwan_llm_8x7b_dpo,HFv1 
TruthfulQA,54.87,,hf_open_llm_v1_240829_frozen.csv taiwan_llm_8x7b_dpo,HFv1 Winogrande,81.93,,hf_open_llm_v1_240829_frozen.csv taketwo,HF OpenLLM v1,38.6,,hf_open_llm_v1_240829_frozen.csv taketwo,HFv1 ARC,37.2,,hf_open_llm_v1_240829_frozen.csv taketwo,HFv1 GSM8K,2.58,,hf_open_llm_v1_240829_frozen.csv taketwo,HFv1 HellaSwag,62.01,,hf_open_llm_v1_240829_frozen.csv taketwo,HFv1 MMLU,23.8,,hf_open_llm_v1_240829_frozen.csv taketwo,HFv1 TruthfulQA,36.02,,hf_open_llm_v1_240829_frozen.csv taketwo,HFv1 Winogrande,70.01,,hf_open_llm_v1_240829_frozen.csv taliml_7b_v_1_eng,HF OpenLLM v1,60.54,,hf_open_llm_v1_240829_frozen.csv taliml_7b_v_1_eng,HFv1 ARC,59.98,,hf_open_llm_v1_240829_frozen.csv taliml_7b_v_1_eng,HFv1 GSM8K,35.03,,hf_open_llm_v1_240829_frozen.csv taliml_7b_v_1_eng,HFv1 HellaSwag,83.27,,hf_open_llm_v1_240829_frozen.csv taliml_7b_v_1_eng,HFv1 MMLU,60.57,,hf_open_llm_v1_240829_frozen.csv taliml_7b_v_1_eng,HFv1 TruthfulQA,47.13,,hf_open_llm_v1_240829_frozen.csv taliml_7b_v_1_eng,HFv1 Winogrande,77.27,,hf_open_llm_v1_240829_frozen.csv tamil_llama_13b_instruct_v0_1,HF OpenLLM v1,51.59,,hf_open_llm_v1_240829_frozen.csv tamil_llama_13b_instruct_v0_1,HFv1 ARC,54.52,,hf_open_llm_v1_240829_frozen.csv tamil_llama_13b_instruct_v0_1,HFv1 GSM8K,7.51,,hf_open_llm_v1_240829_frozen.csv tamil_llama_13b_instruct_v0_1,HFv1 HellaSwag,79.35,,hf_open_llm_v1_240829_frozen.csv tamil_llama_13b_instruct_v0_1,HFv1 MMLU,50.37,,hf_open_llm_v1_240829_frozen.csv tamil_llama_13b_instruct_v0_1,HFv1 TruthfulQA,41.22,,hf_open_llm_v1_240829_frozen.csv tamil_llama_13b_instruct_v0_1,HFv1 Winogrande,76.56,,hf_open_llm_v1_240829_frozen.csv tamil_llama_7b_instruct_v0_2,HF OpenLLM v1,42.39,,hf_open_llm_v1_240829_frozen.csv tamil_llama_7b_instruct_v0_2,HFv1 ARC,40.44,,hf_open_llm_v1_240829_frozen.csv tamil_llama_7b_instruct_v0_2,HFv1 GSM8K,5.31,,hf_open_llm_v1_240829_frozen.csv tamil_llama_7b_instruct_v0_2,HFv1 HellaSwag,68.88,,hf_open_llm_v1_240829_frozen.csv tamil_llama_7b_instruct_v0_2,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv tamil_llama_7b_instruct_v0_2,HFv1 TruthfulQA,50.11,,hf_open_llm_v1_240829_frozen.csv tamil_llama_7b_instruct_v0_2,HFv1 Winogrande,66.46,,hf_open_llm_v1_240829_frozen.csv tau_0_5b_instruct_dpop,HF OpenLLM v1,35.54,,hf_open_llm_v1_240829_frozen.csv tau_0_5b_instruct_dpop,HFv1 ARC,28.92,,hf_open_llm_v1_240829_frozen.csv tau_0_5b_instruct_dpop,HFv1 GSM8K,6.97,,hf_open_llm_v1_240829_frozen.csv tau_0_5b_instruct_dpop,HFv1 HellaSwag,43.63,,hf_open_llm_v1_240829_frozen.csv tau_0_5b_instruct_dpop,HFv1 MMLU,33.92,,hf_open_llm_v1_240829_frozen.csv tau_0_5b_instruct_dpop,HFv1 TruthfulQA,42.73,,hf_open_llm_v1_240829_frozen.csv tau_0_5b_instruct_dpop,HFv1 Winogrande,57.06,,hf_open_llm_v1_240829_frozen.csv tekniumairoboros_nebula_7b,HF OpenLLM v1,54.74,,hf_open_llm_v1_240829_frozen.csv tekniumairoboros_nebula_7b,HFv1 ARC,57.17,,hf_open_llm_v1_240829_frozen.csv tekniumairoboros_nebula_7b,HFv1 GSM8K,9.4,,hf_open_llm_v1_240829_frozen.csv tekniumairoboros_nebula_7b,HFv1 HellaSwag,81.72,,hf_open_llm_v1_240829_frozen.csv tekniumairoboros_nebula_7b,HFv1 MMLU,55.25,,hf_open_llm_v1_240829_frozen.csv tekniumairoboros_nebula_7b,HFv1 TruthfulQA,51.64,,hf_open_llm_v1_240829_frozen.csv tekniumairoboros_nebula_7b,HFv1 Winogrande,73.24,,hf_open_llm_v1_240829_frozen.csv telugu_llama2_7b_v0_instruct,HF OpenLLM v1,52.86,,hf_open_llm_v1_240829_frozen.csv telugu_llama2_7b_v0_instruct,HFv1 ARC,53.58,,hf_open_llm_v1_240829_frozen.csv telugu_llama2_7b_v0_instruct,HFv1 GSM8K,20.39,,hf_open_llm_v1_240829_frozen.csv 
telugu_llama2_7b_v0_instruct,HFv1 HellaSwag,78.33,,hf_open_llm_v1_240829_frozen.csv telugu_llama2_7b_v0_instruct,HFv1 MMLU,47.63,,hf_open_llm_v1_240829_frozen.csv telugu_llama2_7b_v0_instruct,HFv1 TruthfulQA,43.26,,hf_open_llm_v1_240829_frozen.csv telugu_llama2_7b_v0_instruct,HFv1 Winogrande,73.95,,hf_open_llm_v1_240829_frozen.csv telugu_llama_7b_instruct_v0_1,HF OpenLLM v1,39.71,,hf_open_llm_v1_240829_frozen.csv telugu_llama_7b_instruct_v0_1,HFv1 ARC,36.95,,hf_open_llm_v1_240829_frozen.csv telugu_llama_7b_instruct_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv telugu_llama_7b_instruct_v0_1,HFv1 HellaSwag,67.88,,hf_open_llm_v1_240829_frozen.csv telugu_llama_7b_instruct_v0_1,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv telugu_llama_7b_instruct_v0_1,HFv1 TruthfulQA,48.97,,hf_open_llm_v1_240829_frozen.csv telugu_llama_7b_instruct_v0_1,HFv1 Winogrande,61.33,,hf_open_llm_v1_240829_frozen.csv test1_slide,HF OpenLLM v1,65.31,,hf_open_llm_v1_240829_frozen.csv test1_slide,HFv1 ARC,57.42,,hf_open_llm_v1_240829_frozen.csv test1_slide,HFv1 GSM8K,61.18,,hf_open_llm_v1_240829_frozen.csv test1_slide,HFv1 HellaSwag,78.73,,hf_open_llm_v1_240829_frozen.csv test1_slide,HFv1 MMLU,63.8,,hf_open_llm_v1_240829_frozen.csv test1_slide,HFv1 TruthfulQA,55.48,,hf_open_llm_v1_240829_frozen.csv test1_slide,HFv1 Winogrande,75.22,,hf_open_llm_v1_240829_frozen.csv test3_sft_16bit_dpo2,HF OpenLLM v1,74.98,,hf_open_llm_v1_240829_frozen.csv test3_sft_16bit_dpo2,HFv1 ARC,73.63,,hf_open_llm_v1_240829_frozen.csv test3_sft_16bit_dpo2,HFv1 GSM8K,67.48,,hf_open_llm_v1_240829_frozen.csv test3_sft_16bit_dpo2,HFv1 HellaSwag,89.03,,hf_open_llm_v1_240829_frozen.csv test3_sft_16bit_dpo2,HFv1 MMLU,64.63,,hf_open_llm_v1_240829_frozen.csv test3_sft_16bit_dpo2,HFv1 TruthfulQA,70.71,,hf_open_llm_v1_240829_frozen.csv test3_sft_16bit_dpo2,HFv1 Winogrande,84.37,,hf_open_llm_v1_240829_frozen.csv test_22b,HF OpenLLM v1,37.71,,hf_open_llm_v1_240829_frozen.csv test_22b,HFv1 ARC,39.42,,hf_open_llm_v1_240829_frozen.csv test_22b,HFv1 GSM8K,0.38,,hf_open_llm_v1_240829_frozen.csv test_22b,HFv1 HellaSwag,64.51,,hf_open_llm_v1_240829_frozen.csv test_22b,HFv1 MMLU,27.13,,hf_open_llm_v1_240829_frozen.csv test_22b,HFv1 TruthfulQA,37.13,,hf_open_llm_v1_240829_frozen.csv test_22b,HFv1 Winogrande,57.7,,hf_open_llm_v1_240829_frozen.csv test_model,HF OpenLLM v1,29.31,,hf_open_llm_v1_240829_frozen.csv test_model,HFv1 ARC,24.4,,hf_open_llm_v1_240829_frozen.csv test_model,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv test_model,HFv1 HellaSwag,30.17,,hf_open_llm_v1_240829_frozen.csv test_model,HFv1 MMLU,25.88,,hf_open_llm_v1_240829_frozen.csv test_model,HFv1 TruthfulQA,44.59,,hf_open_llm_v1_240829_frozen.csv test_model,HFv1 Winogrande,50.83,,hf_open_llm_v1_240829_frozen.csv test_qwen1_5_0_5b,HF OpenLLM v1,35.78,,hf_open_llm_v1_240829_frozen.csv test_qwen1_5_0_5b,HFv1 ARC,31.14,,hf_open_llm_v1_240829_frozen.csv test_qwen1_5_0_5b,HFv1 GSM8K,7.58,,hf_open_llm_v1_240829_frozen.csv test_qwen1_5_0_5b,HFv1 HellaSwag,44.12,,hf_open_llm_v1_240829_frozen.csv test_qwen1_5_0_5b,HFv1 MMLU,33.69,,hf_open_llm_v1_240829_frozen.csv test_qwen1_5_0_5b,HFv1 TruthfulQA,42.9,,hf_open_llm_v1_240829_frozen.csv test_qwen1_5_0_5b,HFv1 Winogrande,55.25,,hf_open_llm_v1_240829_frozen.csv thetawave_14b_v0_1,HF OpenLLM v1,44.54,,hf_open_llm_v1_240829_frozen.csv thetawave_14b_v0_1,HFv1 ARC,42.83,,hf_open_llm_v1_240829_frozen.csv thetawave_14b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv thetawave_14b_v0_1,HFv1 HellaSwag,47.09,,hf_open_llm_v1_240829_frozen.csv 
thetawave_14b_v0_1,HFv1 MMLU,61.45,,hf_open_llm_v1_240829_frozen.csv thetawave_14b_v0_1,HFv1 TruthfulQA,50.41,,hf_open_llm_v1_240829_frozen.csv thetawave_14b_v0_1,HFv1 Winogrande,65.43,,hf_open_llm_v1_240829_frozen.csv thetawave_28b_v0_1,HF OpenLLM v1,40.4,,hf_open_llm_v1_240829_frozen.csv thetawave_28b_v0_1,HFv1 ARC,36.6,,hf_open_llm_v1_240829_frozen.csv thetawave_28b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv thetawave_28b_v0_1,HFv1 HellaSwag,35.54,,hf_open_llm_v1_240829_frozen.csv thetawave_28b_v0_1,HFv1 MMLU,54.5,,hf_open_llm_v1_240829_frozen.csv thetawave_28b_v0_1,HFv1 TruthfulQA,49.86,,hf_open_llm_v1_240829_frozen.csv thetawave_28b_v0_1,HFv1 Winogrande,65.9,,hf_open_llm_v1_240829_frozen.csv thetawave_7b,HF OpenLLM v1,69.35,,hf_open_llm_v1_240829_frozen.csv thetawave_7b,HFv1 ARC,67.49,,hf_open_llm_v1_240829_frozen.csv thetawave_7b,HFv1 GSM8K,56.1,,hf_open_llm_v1_240829_frozen.csv thetawave_7b,HFv1 HellaSwag,86.01,,hf_open_llm_v1_240829_frozen.csv thetawave_7b,HFv1 MMLU,62.26,,hf_open_llm_v1_240829_frozen.csv thetawave_7b,HFv1 TruthfulQA,65.26,,hf_open_llm_v1_240829_frozen.csv thetawave_7b,HFv1 Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv thetawave_7b_v0_1,HF OpenLLM v1,70.49,,hf_open_llm_v1_240829_frozen.csv thetawave_7b_v0_1,HFv1 ARC,68.09,,hf_open_llm_v1_240829_frozen.csv thetawave_7b_v0_1,HFv1 GSM8K,55.65,,hf_open_llm_v1_240829_frozen.csv thetawave_7b_v0_1,HFv1 HellaSwag,86.33,,hf_open_llm_v1_240829_frozen.csv thetawave_7b_v0_1,HFv1 MMLU,62.11,,hf_open_llm_v1_240829_frozen.csv thetawave_7b_v0_1,HFv1 TruthfulQA,71.68,,hf_open_llm_v1_240829_frozen.csv thetawave_7b_v0_1,HFv1 Winogrande,79.08,,hf_open_llm_v1_240829_frozen.csv tiamat_8b_1_2_llama3_dpo,HF OpenLLM v1,68.77,,hf_open_llm_v1_240829_frozen.csv tiamat_8b_1_2_llama3_dpo,HFv1 ARC,62.29,,hf_open_llm_v1_240829_frozen.csv tiamat_8b_1_2_llama3_dpo,HFv1 GSM8K,68.84,,hf_open_llm_v1_240829_frozen.csv tiamat_8b_1_2_llama3_dpo,HFv1 HellaSwag,83.45,,hf_open_llm_v1_240829_frozen.csv tiamat_8b_1_2_llama3_dpo,HFv1 MMLU,65.19,,hf_open_llm_v1_240829_frozen.csv tiamat_8b_1_2_llama3_dpo,HFv1 TruthfulQA,56.08,,hf_open_llm_v1_240829_frozen.csv tiamat_8b_1_2_llama3_dpo,HFv1 Winogrande,76.8,,hf_open_llm_v1_240829_frozen.csv tigerbot_13b_base,HF OpenLLM v1,53.42,,hf_open_llm_v1_240829_frozen.csv tigerbot_13b_base,HFv1 ARC,53.84,,hf_open_llm_v1_240829_frozen.csv tigerbot_13b_base,HFv1 GSM8K,17.06,,hf_open_llm_v1_240829_frozen.csv tigerbot_13b_base,HFv1 HellaSwag,77.05,,hf_open_llm_v1_240829_frozen.csv tigerbot_13b_base,HFv1 MMLU,53.57,,hf_open_llm_v1_240829_frozen.csv tigerbot_13b_base,HFv1 TruthfulQA,44.06,,hf_open_llm_v1_240829_frozen.csv tigerbot_13b_base,HFv1 Winogrande,74.98,,hf_open_llm_v1_240829_frozen.csv tigerbot_70b_base,HF OpenLLM v1,63.71,,hf_open_llm_v1_240829_frozen.csv tigerbot_70b_base,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv tigerbot_70b_base,HFv1 GSM8K,37.76,,hf_open_llm_v1_240829_frozen.csv tigerbot_70b_base,HFv1 HellaSwag,83.61,,hf_open_llm_v1_240829_frozen.csv tigerbot_70b_base,HFv1 MMLU,65.49,,hf_open_llm_v1_240829_frozen.csv tigerbot_70b_base,HFv1 TruthfulQA,52.76,,hf_open_llm_v1_240829_frozen.csv tigerbot_70b_base,HFv1 Winogrande,80.19,,hf_open_llm_v1_240829_frozen.csv tiny_llama3_7b,HF OpenLLM v1,35.6,,hf_open_llm_v1_240829_frozen.csv tiny_llama3_7b,HFv1 ARC,34.64,,hf_open_llm_v1_240829_frozen.csv tiny_llama3_7b,HFv1 GSM8K,0.38,,hf_open_llm_v1_240829_frozen.csv tiny_llama3_7b,HFv1 HellaSwag,56.39,,hf_open_llm_v1_240829_frozen.csv tiny_llama3_7b,HFv1 MMLU,24.51,,hf_open_llm_v1_240829_frozen.csv 
tiny_llama3_7b,HFv1 TruthfulQA,38.03,,hf_open_llm_v1_240829_frozen.csv tiny_llama3_7b,HFv1 Winogrande,59.67,,hf_open_llm_v1_240829_frozen.csv tiny_starcoder_py,HF OpenLLM v1,29.41,,hf_open_llm_v1_240829_frozen.csv tiny_starcoder_py,HFv1 ARC,20.99,,hf_open_llm_v1_240829_frozen.csv tiny_starcoder_py,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv tiny_starcoder_py,HFv1 HellaSwag,28.77,,hf_open_llm_v1_240829_frozen.csv tiny_starcoder_py,HFv1 MMLU,26.79,,hf_open_llm_v1_240829_frozen.csv tiny_starcoder_py,HFv1 TruthfulQA,47.68,,hf_open_llm_v1_240829_frozen.csv tiny_starcoder_py,HFv1 Winogrande,51.22,,hf_open_llm_v1_240829_frozen.csv tiny_vicuna_1b,HF OpenLLM v1,34.76,,hf_open_llm_v1_240829_frozen.csv tiny_vicuna_1b,HFv1 ARC,33.45,,hf_open_llm_v1_240829_frozen.csv tiny_vicuna_1b,HFv1 GSM8K,1.52,,hf_open_llm_v1_240829_frozen.csv tiny_vicuna_1b,HFv1 HellaSwag,55.92,,hf_open_llm_v1_240829_frozen.csv tiny_vicuna_1b,HFv1 MMLU,25.45,,hf_open_llm_v1_240829_frozen.csv tiny_vicuna_1b,HFv1 TruthfulQA,33.82,,hf_open_llm_v1_240829_frozen.csv tiny_vicuna_1b,HFv1 Winogrande,58.41,,hf_open_llm_v1_240829_frozen.csv tinydolphin_2_8_1_1_1b,HF OpenLLM v1,36.21,,hf_open_llm_v1_240829_frozen.csv tinydolphin_2_8_1_1_1b,HFv1 ARC,34.98,,hf_open_llm_v1_240829_frozen.csv tinydolphin_2_8_1_1_1b,HFv1 GSM8K,0.68,,hf_open_llm_v1_240829_frozen.csv tinydolphin_2_8_1_1_1b,HFv1 HellaSwag,60.11,,hf_open_llm_v1_240829_frozen.csv tinydolphin_2_8_1_1_1b,HFv1 MMLU,25.31,,hf_open_llm_v1_240829_frozen.csv tinydolphin_2_8_1_1_1b,HFv1 TruthfulQA,35.51,,hf_open_llm_v1_240829_frozen.csv tinydolphin_2_8_1_1_1b,HFv1 Winogrande,60.69,,hf_open_llm_v1_240829_frozen.csv tinydolphin_2_8_1_1b,HF OpenLLM v1,36.34,,hf_open_llm_v1_240829_frozen.csv tinydolphin_2_8_1_1b,HFv1 ARC,34.3,,hf_open_llm_v1_240829_frozen.csv tinydolphin_2_8_1_1b,HFv1 GSM8K,1.52,,hf_open_llm_v1_240829_frozen.csv tinydolphin_2_8_1_1b,HFv1 HellaSwag,59.44,,hf_open_llm_v1_240829_frozen.csv tinydolphin_2_8_1_1b,HFv1 MMLU,25.59,,hf_open_llm_v1_240829_frozen.csv tinydolphin_2_8_1_1b,HFv1 TruthfulQA,36.51,,hf_open_llm_v1_240829_frozen.csv tinydolphin_2_8_1_1b,HFv1 Winogrande,60.69,,hf_open_llm_v1_240829_frozen.csv tinydolphin_2_8_2_1_1b_laser,HF OpenLLM v1,35.93,,hf_open_llm_v1_240829_frozen.csv tinydolphin_2_8_2_1_1b_laser,HFv1 ARC,33.36,,hf_open_llm_v1_240829_frozen.csv tinydolphin_2_8_2_1_1b_laser,HFv1 GSM8K,1.29,,hf_open_llm_v1_240829_frozen.csv tinydolphin_2_8_2_1_1b_laser,HFv1 HellaSwag,58.53,,hf_open_llm_v1_240829_frozen.csv tinydolphin_2_8_2_1_1b_laser,HFv1 MMLU,25.93,,hf_open_llm_v1_240829_frozen.csv tinydolphin_2_8_2_1_1b_laser,HFv1 TruthfulQA,36.33,,hf_open_llm_v1_240829_frozen.csv tinydolphin_2_8_2_1_1b_laser,HFv1 Winogrande,60.14,,hf_open_llm_v1_240829_frozen.csv tinyllama,HF OpenLLM v1,35.8,,hf_open_llm_v1_240829_frozen.csv tinyllama,HFv1 ARC,34.98,,hf_open_llm_v1_240829_frozen.csv tinyllama,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv tinyllama,HFv1 HellaSwag,58.24,,hf_open_llm_v1_240829_frozen.csv tinyllama,HFv1 MMLU,26.49,,hf_open_llm_v1_240829_frozen.csv tinyllama,HFv1 TruthfulQA,35.62,,hf_open_llm_v1_240829_frozen.csv tinyllama,HFv1 Winogrande,58.48,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_1_5t_openorca_alpha,HF OpenLLM v1,35.39,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_1_5t_openorca_alpha,HFv1 ARC,32.76,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_1_5t_openorca_alpha,HFv1 GSM8K,0.61,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_1_5t_openorca_alpha,HFv1 HellaSwag,53.77,,hf_open_llm_v1_240829_frozen.csv 
tinyllama_1_1b_1_5t_openorca_alpha,HFv1 MMLU,25.73,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_1_5t_openorca_alpha,HFv1 TruthfulQA,40.52,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_1_5t_openorca_alpha,HFv1 Winogrande,58.96,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_1t_openorca,HF OpenLLM v1,34.58,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_1t_openorca,HFv1 ARC,31.31,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_1t_openorca,HFv1 GSM8K,1.67,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_1t_openorca,HFv1 HellaSwag,52.34,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_1t_openorca,HFv1 MMLU,25.31,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_1t_openorca,HFv1 TruthfulQA,38.58,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_1t_openorca,HFv1 Winogrande,58.25,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_chat_v0_3_platypus,HF OpenLLM v1,34.5,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_chat_v0_3_platypus,HFv1 ARC,30.29,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_chat_v0_3_platypus,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_chat_v0_3_platypus,HFv1 HellaSwag,55.12,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_chat_v0_3_platypus,HFv1 MMLU,26.13,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_chat_v0_3_platypus,HFv1 TruthfulQA,39.15,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_chat_v0_3_platypus,HFv1 Winogrande,55.8,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_chat_v1_0_intel_dpo,HF OpenLLM v1,37.09,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_chat_v1_0_intel_dpo,HFv1 ARC,35.84,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_chat_v1_0_intel_dpo,HFv1 GSM8K,1.97,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_chat_v1_0_intel_dpo,HFv1 HellaSwag,61.29,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_chat_v1_0_intel_dpo,HFv1 MMLU,25.05,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_chat_v1_0_intel_dpo,HFv1 TruthfulQA,37.38,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_chat_v1_0_intel_dpo,HFv1 Winogrande,61.01,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_1431k_3t,HF OpenLLM v1,36.42,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_1431k_3t,HFv1 ARC,33.87,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_1431k_3t,HFv1 GSM8K,1.44,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_1431k_3t,HFv1 HellaSwag,60.31,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_1431k_3t,HFv1 MMLU,26.04,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_1431k_3t,HFv1 TruthfulQA,37.32,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_1431k_3t,HFv1 Winogrande,59.51,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_1431k_3t_laser_dpo,HF OpenLLM v1,36.46,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_1431k_3t_laser_dpo,HFv1 ARC,33.02,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_1431k_3t_laser_dpo,HFv1 GSM8K,1.21,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_1431k_3t_laser_dpo,HFv1 HellaSwag,60.0,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_1431k_3t_laser_dpo,HFv1 MMLU,26.88,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_1431k_3t_laser_dpo,HFv1 TruthfulQA,38.08,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_1431k_3t_laser_dpo,HFv1 Winogrande,59.59,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_240k_503b,HF OpenLLM v1,33.72,,hf_open_llm_v1_240829_frozen.csv 
tinyllama_1_1b_intermediate_step_240k_503b,HFv1 ARC,29.27,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_240k_503b,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_240k_503b,HFv1 HellaSwag,49.71,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_240k_503b,HFv1 MMLU,26.26,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_240k_503b,HFv1 TruthfulQA,40.17,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_240k_503b,HFv1 Winogrande,56.59,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_480k_1t,HF OpenLLM v1,34.37,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_480k_1t,HFv1 ARC,30.89,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_480k_1t,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_480k_1t,HFv1 HellaSwag,52.97,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_480k_1t,HFv1 MMLU,25.0,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_480k_1t,HFv1 TruthfulQA,39.55,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_intermediate_step_480k_1t,HFv1 Winogrande,57.3,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_layla_v4,HF OpenLLM v1,37.37,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_layla_v4,HFv1 ARC,34.81,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_layla_v4,HFv1 GSM8K,2.2,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_layla_v4,HFv1 HellaSwag,61.25,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_layla_v4,HFv1 MMLU,25.53,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_layla_v4,HFv1 TruthfulQA,38.97,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_layla_v4,HFv1 Winogrande,61.48,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_orca_v1_0,HF OpenLLM v1,37.17,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_orca_v1_0,HFv1 ARC,36.35,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_orca_v1_0,HFv1 GSM8K,2.27,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_orca_v1_0,HFv1 HellaSwag,61.23,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_orca_v1_0,HFv1 MMLU,25.18,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_orca_v1_0,HFv1 TruthfulQA,36.58,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_orca_v1_0,HFv1 Winogrande,61.4,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_slimorca_function_calling_3t,HF OpenLLM v1,37.38,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_slimorca_function_calling_3t,HFv1 ARC,36.09,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_slimorca_function_calling_3t,HFv1 GSM8K,4.47,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_slimorca_function_calling_3t,HFv1 HellaSwag,59.66,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_slimorca_function_calling_3t,HFv1 MMLU,28.21,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_slimorca_function_calling_3t,HFv1 TruthfulQA,36.74,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_slimorca_function_calling_3t,HFv1 Winogrande,59.12,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_step_50k_105b,HF OpenLLM v1,31.86,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_step_50k_105b,HFv1 ARC,25.85,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_step_50k_105b,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_step_50k_105b,HFv1 HellaSwag,44.1,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_step_50k_105b,HFv1 MMLU,26.78,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_step_50k_105b,HFv1 TruthfulQA,39.51,,hf_open_llm_v1_240829_frozen.csv tinyllama_1_1b_step_50k_105b,HFv1 
Winogrande,54.38,,hf_open_llm_v1_240829_frozen.csv tinyllama_chat_sft,HF OpenLLM v1,37.21,,hf_open_llm_v1_240829_frozen.csv tinyllama_chat_sft,HFv1 ARC,34.47,,hf_open_llm_v1_240829_frozen.csv tinyllama_chat_sft,HFv1 GSM8K,1.44,,hf_open_llm_v1_240829_frozen.csv tinyllama_chat_sft,HFv1 HellaSwag,61.03,,hf_open_llm_v1_240829_frozen.csv tinyllama_chat_sft,HFv1 MMLU,25.77,,hf_open_llm_v1_240829_frozen.csv tinyllama_chat_sft,HFv1 TruthfulQA,39.29,,hf_open_llm_v1_240829_frozen.csv tinyllama_chat_sft,HFv1 Winogrande,61.25,,hf_open_llm_v1_240829_frozen.csv tinyllama_frankenmerge,HF OpenLLM v1,34.64,,hf_open_llm_v1_240829_frozen.csv tinyllama_frankenmerge,HFv1 ARC,30.2,,hf_open_llm_v1_240829_frozen.csv tinyllama_frankenmerge,HFv1 GSM8K,1.59,,hf_open_llm_v1_240829_frozen.csv tinyllama_frankenmerge,HFv1 HellaSwag,51.01,,hf_open_llm_v1_240829_frozen.csv tinyllama_frankenmerge,HFv1 MMLU,26.11,,hf_open_llm_v1_240829_frozen.csv tinyllama_frankenmerge,HFv1 TruthfulQA,40.18,,hf_open_llm_v1_240829_frozen.csv tinyllama_frankenmerge,HFv1 Winogrande,58.72,,hf_open_llm_v1_240829_frozen.csv tinyllama_moe_chat,HF OpenLLM v1,37.81,,hf_open_llm_v1_240829_frozen.csv tinyllama_moe_chat,HFv1 ARC,34.73,,hf_open_llm_v1_240829_frozen.csv tinyllama_moe_chat,HFv1 GSM8K,1.21,,hf_open_llm_v1_240829_frozen.csv tinyllama_moe_chat,HFv1 HellaSwag,59.29,,hf_open_llm_v1_240829_frozen.csv tinyllama_moe_chat,HFv1 MMLU,29.9,,hf_open_llm_v1_240829_frozen.csv tinyllama_moe_chat,HFv1 TruthfulQA,39.37,,hf_open_llm_v1_240829_frozen.csv tinyllama_moe_chat,HFv1 Winogrande,62.51,,hf_open_llm_v1_240829_frozen.csv tinyllama_moe_chat_0_1,HF OpenLLM v1,36.7,,hf_open_llm_v1_240829_frozen.csv tinyllama_moe_chat_0_1,HFv1 ARC,34.39,,hf_open_llm_v1_240829_frozen.csv tinyllama_moe_chat_0_1,HFv1 GSM8K,2.27,,hf_open_llm_v1_240829_frozen.csv tinyllama_moe_chat_0_1,HFv1 HellaSwag,56.72,,hf_open_llm_v1_240829_frozen.csv tinyllama_moe_chat_0_1,HFv1 MMLU,29.36,,hf_open_llm_v1_240829_frozen.csv tinyllama_moe_chat_0_1,HFv1 TruthfulQA,37.82,,hf_open_llm_v1_240829_frozen.csv tinyllama_moe_chat_0_1,HFv1 Winogrande,59.67,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m,HF OpenLLM v1,27.73,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m,HFv1 ARC,22.87,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m,HFv1 HellaSwag,28.02,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m,HFv1 MMLU,23.15,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m,HFv1 TruthfulQA,42.52,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m,HFv1 Winogrande,49.8,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_chat_v1,HF OpenLLM v1,27.01,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_chat_v1,HFv1 ARC,21.59,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_chat_v1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_chat_v1,HFv1 HellaSwag,27.45,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_chat_v1,HFv1 MMLU,23.08,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_chat_v1,HFv1 TruthfulQA,40.91,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_chat_v1,HFv1 Winogrande,49.01,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_chat_v2,HF OpenLLM v1,27.42,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_chat_v2,HFv1 ARC,23.29,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_chat_v2,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_chat_v2,HFv1 HellaSwag,27.39,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_chat_v2,HFv1 MMLU,23.52,,hf_open_llm_v1_240829_frozen.csv 
tinymistral_248m_chat_v2,HFv1 TruthfulQA,41.32,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_chat_v2,HFv1 Winogrande,49.01,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_instruct,HF OpenLLM v1,28.19,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_instruct,HFv1 ARC,24.32,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_instruct,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_instruct,HFv1 HellaSwag,27.52,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_instruct,HFv1 MMLU,25.18,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_instruct,HFv1 TruthfulQA,41.94,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_instruct,HFv1 Winogrande,50.2,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_sft_v3,HF OpenLLM v1,28.78,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_sft_v3,HFv1 ARC,25.68,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_sft_v3,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_sft_v3,HFv1 HellaSwag,25.31,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_sft_v3,HFv1 MMLU,24.41,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_sft_v3,HFv1 TruthfulQA,48.87,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_sft_v3,HFv1 Winogrande,48.38,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_sft_v4,HF OpenLLM v1,28.2,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_sft_v4,HFv1 ARC,24.91,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_sft_v4,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_sft_v4,HFv1 HellaSwag,28.15,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_sft_v4,HFv1 MMLU,26.04,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_sft_v4,HFv1 TruthfulQA,39.56,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_sft_v4,HFv1 Winogrande,50.51,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_v2,HF OpenLLM v1,28.78,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_v2,HFv1 ARC,21.25,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_v2,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_v2,HFv1 HellaSwag,26.56,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_v2,HFv1 MMLU,23.39,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_v2,HFv1 TruthfulQA,49.6,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_v2,HFv1 Winogrande,51.85,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_v2_5,HF OpenLLM v1,28.29,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_v2_5,HFv1 ARC,24.57,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_v2_5,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_v2_5,HFv1 HellaSwag,27.49,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_v2_5,HFv1 MMLU,23.15,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_v2_5,HFv1 TruthfulQA,46.72,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_v2_5,HFv1 Winogrande,47.83,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_v2_5_instruct,HF OpenLLM v1,27.7,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_v2_5_instruct,HFv1 ARC,22.27,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_v2_5_instruct,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_v2_5_instruct,HFv1 HellaSwag,27.6,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_v2_5_instruct,HFv1 MMLU,23.9,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_v2_5_instruct,HFv1 TruthfulQA,44.21,,hf_open_llm_v1_240829_frozen.csv tinymistral_248m_v2_5_instruct,HFv1 Winogrande,48.22,,hf_open_llm_v1_240829_frozen.csv tinymistral_6x248m_instruct,HF OpenLLM v1,27.89,,hf_open_llm_v1_240829_frozen.csv tinymistral_6x248m_instruct,HFv1 
ARC,22.44,,hf_open_llm_v1_240829_frozen.csv tinymistral_6x248m_instruct,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv tinymistral_6x248m_instruct,HFv1 HellaSwag,27.02,,hf_open_llm_v1_240829_frozen.csv tinymistral_6x248m_instruct,HFv1 MMLU,24.13,,hf_open_llm_v1_240829_frozen.csv tinymistral_6x248m_instruct,HFv1 TruthfulQA,43.16,,hf_open_llm_v1_240829_frozen.csv tinymistral_6x248m_instruct,HFv1 Winogrande,50.59,,hf_open_llm_v1_240829_frozen.csv tinymistral_v2_5_minipile_guidelines_e1,HF OpenLLM v1,29.16,,hf_open_llm_v1_240829_frozen.csv tinymistral_v2_5_minipile_guidelines_e1,HFv1 ARC,26.54,,hf_open_llm_v1_240829_frozen.csv tinymistral_v2_5_minipile_guidelines_e1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv tinymistral_v2_5_minipile_guidelines_e1,HFv1 HellaSwag,25.68,,hf_open_llm_v1_240829_frozen.csv tinymistral_v2_5_minipile_guidelines_e1,HFv1 MMLU,23.53,,hf_open_llm_v1_240829_frozen.csv tinymistral_v2_5_minipile_guidelines_e1,HFv1 TruthfulQA,49.9,,hf_open_llm_v1_240829_frozen.csv tinymistral_v2_5_minipile_guidelines_e1,HFv1 Winogrande,49.41,,hf_open_llm_v1_240829_frozen.csv tinymistral_v2_test1,HF OpenLLM v1,28.42,,hf_open_llm_v1_240829_frozen.csv tinymistral_v2_test1,HFv1 ARC,21.5,,hf_open_llm_v1_240829_frozen.csv tinymistral_v2_test1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv tinymistral_v2_test1,HFv1 HellaSwag,26.79,,hf_open_llm_v1_240829_frozen.csv tinymistral_v2_test1,HFv1 MMLU,23.36,,hf_open_llm_v1_240829_frozen.csv tinymistral_v2_test1,HFv1 TruthfulQA,50.3,,hf_open_llm_v1_240829_frozen.csv tinymistral_v2_test1,HFv1 Winogrande,48.54,,hf_open_llm_v1_240829_frozen.csv tinymix,HF OpenLLM v1,35.91,,hf_open_llm_v1_240829_frozen.csv tinymix,HFv1 ARC,32.0,,hf_open_llm_v1_240829_frozen.csv tinymix,HFv1 GSM8K,1.97,,hf_open_llm_v1_240829_frozen.csv tinymix,HFv1 HellaSwag,53.69,,hf_open_llm_v1_240829_frozen.csv tinymix,HFv1 MMLU,24.27,,hf_open_llm_v1_240829_frozen.csv tinymix,HFv1 TruthfulQA,39.42,,hf_open_llm_v1_240829_frozen.csv tinymix,HFv1 Winogrande,64.09,,hf_open_llm_v1_240829_frozen.csv tinynaughtyllama_v1_0,HF OpenLLM v1,37.03,,hf_open_llm_v1_240829_frozen.csv tinynaughtyllama_v1_0,HFv1 ARC,35.92,,hf_open_llm_v1_240829_frozen.csv tinynaughtyllama_v1_0,HFv1 GSM8K,2.43,,hf_open_llm_v1_240829_frozen.csv tinynaughtyllama_v1_0,HFv1 HellaSwag,61.04,,hf_open_llm_v1_240829_frozen.csv tinynaughtyllama_v1_0,HFv1 MMLU,25.82,,hf_open_llm_v1_240829_frozen.csv tinynaughtyllama_v1_0,HFv1 TruthfulQA,36.77,,hf_open_llm_v1_240829_frozen.csv tinynaughtyllama_v1_0,HFv1 Winogrande,60.22,,hf_open_llm_v1_240829_frozen.csv tinyopenhermes_1_1b_4k,HF OpenLLM v1,35.98,,hf_open_llm_v1_240829_frozen.csv tinyopenhermes_1_1b_4k,HFv1 ARC,33.62,,hf_open_llm_v1_240829_frozen.csv tinyopenhermes_1_1b_4k,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv tinyopenhermes_1_1b_4k,HFv1 HellaSwag,58.53,,hf_open_llm_v1_240829_frozen.csv tinyopenhermes_1_1b_4k,HFv1 MMLU,26.45,,hf_open_llm_v1_240829_frozen.csv tinyopenhermes_1_1b_4k,HFv1 TruthfulQA,37.33,,hf_open_llm_v1_240829_frozen.csv tinyopenhermes_1_1b_4k,HFv1 Winogrande,59.91,,hf_open_llm_v1_240829_frozen.csv tinystories_1m,HF OpenLLM v1,29.14,,hf_open_llm_v1_240829_frozen.csv tinystories_1m,HFv1 ARC,23.46,,hf_open_llm_v1_240829_frozen.csv tinystories_1m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv tinystories_1m,HFv1 HellaSwag,25.23,,hf_open_llm_v1_240829_frozen.csv tinystories_1m,HFv1 MMLU,24.57,,hf_open_llm_v1_240829_frozen.csv tinystories_1m,HFv1 TruthfulQA,49.4,,hf_open_llm_v1_240829_frozen.csv tinystories_1m,HFv1 
Winogrande,52.17,,hf_open_llm_v1_240829_frozen.csv tinystories_28m,HF OpenLLM v1,28.44,,hf_open_llm_v1_240829_frozen.csv tinystories_28m,HFv1 ARC,22.78,,hf_open_llm_v1_240829_frozen.csv tinystories_28m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv tinystories_28m,HFv1 HellaSwag,25.83,,hf_open_llm_v1_240829_frozen.csv tinystories_28m,HFv1 MMLU,23.53,,hf_open_llm_v1_240829_frozen.csv tinystories_28m,HFv1 TruthfulQA,48.08,,hf_open_llm_v1_240829_frozen.csv tinystories_28m,HFv1 Winogrande,50.43,,hf_open_llm_v1_240829_frozen.csv tinystories_33m,HF OpenLLM v1,28.41,,hf_open_llm_v1_240829_frozen.csv tinystories_33m,HFv1 ARC,24.23,,hf_open_llm_v1_240829_frozen.csv tinystories_33m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv tinystories_33m,HFv1 HellaSwag,25.69,,hf_open_llm_v1_240829_frozen.csv tinystories_33m,HFv1 MMLU,23.82,,hf_open_llm_v1_240829_frozen.csv tinystories_33m,HFv1 TruthfulQA,47.64,,hf_open_llm_v1_240829_frozen.csv tinystories_33m,HFv1 Winogrande,49.09,,hf_open_llm_v1_240829_frozen.csv tinystories_3m,HF OpenLLM v1,28.19,,hf_open_llm_v1_240829_frozen.csv tinystories_3m,HFv1 ARC,22.01,,hf_open_llm_v1_240829_frozen.csv tinystories_3m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv tinystories_3m,HFv1 HellaSwag,25.58,,hf_open_llm_v1_240829_frozen.csv tinystories_3m,HFv1 MMLU,24.99,,hf_open_llm_v1_240829_frozen.csv tinystories_3m,HFv1 TruthfulQA,47.33,,hf_open_llm_v1_240829_frozen.csv tinystories_3m,HFv1 Winogrande,49.25,,hf_open_llm_v1_240829_frozen.csv tinystories_8m,HF OpenLLM v1,28.31,,hf_open_llm_v1_240829_frozen.csv tinystories_8m,HFv1 ARC,24.66,,hf_open_llm_v1_240829_frozen.csv tinystories_8m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv tinystories_8m,HFv1 HellaSwag,25.03,,hf_open_llm_v1_240829_frozen.csv tinystories_8m,HFv1 MMLU,23.33,,hf_open_llm_v1_240829_frozen.csv tinystories_8m,HFv1 TruthfulQA,46.54,,hf_open_llm_v1_240829_frozen.csv tinystories_8m,HFv1 Winogrande,50.28,,hf_open_llm_v1_240829_frozen.csv tinyultra_4x1_1b_base_alpha,HF OpenLLM v1,37.94,,hf_open_llm_v1_240829_frozen.csv tinyultra_4x1_1b_base_alpha,HFv1 ARC,34.9,,hf_open_llm_v1_240829_frozen.csv tinyultra_4x1_1b_base_alpha,HFv1 GSM8K,2.58,,hf_open_llm_v1_240829_frozen.csv tinyultra_4x1_1b_base_alpha,HFv1 HellaSwag,61.42,,hf_open_llm_v1_240829_frozen.csv tinyultra_4x1_1b_base_alpha,HFv1 MMLU,25.42,,hf_open_llm_v1_240829_frozen.csv tinyultra_4x1_1b_base_alpha,HFv1 TruthfulQA,37.59,,hf_open_llm_v1_240829_frozen.csv tinyultra_4x1_1b_base_alpha,HFv1 Winogrande,65.75,,hf_open_llm_v1_240829_frozen.csv tinywand_dpo,HF OpenLLM v1,35.13,,hf_open_llm_v1_240829_frozen.csv tinywand_dpo,HFv1 ARC,31.66,,hf_open_llm_v1_240829_frozen.csv tinywand_dpo,HFv1 GSM8K,1.9,,hf_open_llm_v1_240829_frozen.csv tinywand_dpo,HFv1 HellaSwag,50.42,,hf_open_llm_v1_240829_frozen.csv tinywand_dpo,HFv1 MMLU,26.22,,hf_open_llm_v1_240829_frozen.csv tinywand_dpo,HFv1 TruthfulQA,45.8,,hf_open_llm_v1_240829_frozen.csv tinywand_dpo,HFv1 Winogrande,54.78,,hf_open_llm_v1_240829_frozen.csv tinywand_sft,HF OpenLLM v1,34.61,,hf_open_llm_v1_240829_frozen.csv tinywand_sft,HFv1 ARC,31.4,,hf_open_llm_v1_240829_frozen.csv tinywand_sft,HFv1 GSM8K,2.05,,hf_open_llm_v1_240829_frozen.csv tinywand_sft,HFv1 HellaSwag,49.96,,hf_open_llm_v1_240829_frozen.csv tinywand_sft,HFv1 MMLU,25.98,,hf_open_llm_v1_240829_frozen.csv tinywand_sft,HFv1 TruthfulQA,43.08,,hf_open_llm_v1_240829_frozen.csv tinywand_sft,HFv1 Winogrande,55.17,,hf_open_llm_v1_240829_frozen.csv tmm_1b,HF OpenLLM v1,35.63,,hf_open_llm_v1_240829_frozen.csv tmm_1b,HFv1 
ARC,33.36,,hf_open_llm_v1_240829_frozen.csv tmm_1b,HFv1 GSM8K,1.06,,hf_open_llm_v1_240829_frozen.csv tmm_1b,HFv1 HellaSwag,58.46,,hf_open_llm_v1_240829_frozen.csv tmm_1b,HFv1 MMLU,25.68,,hf_open_llm_v1_240829_frozen.csv tmm_1b,HFv1 TruthfulQA,37.22,,hf_open_llm_v1_240829_frozen.csv tmm_1b,HFv1 Winogrande,58.01,,hf_open_llm_v1_240829_frozen.csv toppyevil_7b_slerp,HF OpenLLM v1,65.16,,hf_open_llm_v1_240829_frozen.csv toppyevil_7b_slerp,HFv1 ARC,63.65,,hf_open_llm_v1_240829_frozen.csv toppyevil_7b_slerp,HFv1 GSM8K,55.8,,hf_open_llm_v1_240829_frozen.csv toppyevil_7b_slerp,HFv1 HellaSwag,84.29,,hf_open_llm_v1_240829_frozen.csv toppyevil_7b_slerp,HFv1 MMLU,63.6,,hf_open_llm_v1_240829_frozen.csv toppyevil_7b_slerp,HFv1 TruthfulQA,46.06,,hf_open_llm_v1_240829_frozen.csv toppyevil_7b_slerp,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv toppylake_7b_slerp,HF OpenLLM v1,72.05,,hf_open_llm_v1_240829_frozen.csv toppylake_7b_slerp,HFv1 ARC,69.2,,hf_open_llm_v1_240829_frozen.csv toppylake_7b_slerp,HFv1 GSM8K,65.96,,hf_open_llm_v1_240829_frozen.csv toppylake_7b_slerp,HFv1 HellaSwag,86.98,,hf_open_llm_v1_240829_frozen.csv toppylake_7b_slerp,HFv1 MMLU,64.85,,hf_open_llm_v1_240829_frozen.csv toppylake_7b_slerp,HFv1 TruthfulQA,62.54,,hf_open_llm_v1_240829_frozen.csv toppylake_7b_slerp,HFv1 Winogrande,82.79,,hf_open_llm_v1_240829_frozen.csv toppylake_bagel_7b_slerp,HF OpenLLM v1,70.14,,hf_open_llm_v1_240829_frozen.csv toppylake_bagel_7b_slerp,HFv1 ARC,67.66,,hf_open_llm_v1_240829_frozen.csv toppylake_bagel_7b_slerp,HFv1 GSM8K,57.7,,hf_open_llm_v1_240829_frozen.csv toppylake_bagel_7b_slerp,HFv1 HellaSwag,85.7,,hf_open_llm_v1_240829_frozen.csv toppylake_bagel_7b_slerp,HFv1 MMLU,64.87,,hf_open_llm_v1_240829_frozen.csv toppylake_bagel_7b_slerp,HFv1 TruthfulQA,61.74,,hf_open_llm_v1_240829_frozen.csv toppylake_bagel_7b_slerp,HFv1 Winogrande,83.19,,hf_open_llm_v1_240829_frozen.csv tora_13b_v1_0,HF OpenLLM v1,53.62,,hf_open_llm_v1_240829_frozen.csv tora_13b_v1_0,HFv1 ARC,58.96,,hf_open_llm_v1_240829_frozen.csv tora_13b_v1_0,HFv1 GSM8K,9.86,,hf_open_llm_v1_240829_frozen.csv tora_13b_v1_0,HFv1 HellaSwag,82.31,,hf_open_llm_v1_240829_frozen.csv tora_13b_v1_0,HFv1 MMLU,54.73,,hf_open_llm_v1_240829_frozen.csv tora_13b_v1_0,HFv1 TruthfulQA,40.25,,hf_open_llm_v1_240829_frozen.csv tora_13b_v1_0,HFv1 Winogrande,75.61,,hf_open_llm_v1_240829_frozen.csv tora_7b_v1_0,HF OpenLLM v1,48.5,,hf_open_llm_v1_240829_frozen.csv tora_7b_v1_0,HFv1 ARC,52.47,,hf_open_llm_v1_240829_frozen.csv tora_7b_v1_0,HFv1 GSM8K,2.5,,hf_open_llm_v1_240829_frozen.csv tora_7b_v1_0,HFv1 HellaSwag,78.68,,hf_open_llm_v1_240829_frozen.csv tora_7b_v1_0,HFv1 MMLU,45.9,,hf_open_llm_v1_240829_frozen.csv tora_7b_v1_0,HFv1 TruthfulQA,37.9,,hf_open_llm_v1_240829_frozen.csv tora_7b_v1_0,HFv1 Winogrande,73.56,,hf_open_llm_v1_240829_frozen.csv tora_code_13b_v1_0,HF OpenLLM v1,42.7,,hf_open_llm_v1_240829_frozen.csv tora_code_13b_v1_0,HFv1 ARC,44.45,,hf_open_llm_v1_240829_frozen.csv tora_code_13b_v1_0,HFv1 GSM8K,8.19,,hf_open_llm_v1_240829_frozen.csv tora_code_13b_v1_0,HFv1 HellaSwag,69.29,,hf_open_llm_v1_240829_frozen.csv tora_code_13b_v1_0,HFv1 MMLU,36.67,,hf_open_llm_v1_240829_frozen.csv tora_code_13b_v1_0,HFv1 TruthfulQA,34.98,,hf_open_llm_v1_240829_frozen.csv tora_code_13b_v1_0,HFv1 Winogrande,62.59,,hf_open_llm_v1_240829_frozen.csv tora_code_34b_v1_0,HF OpenLLM v1,48.95,,hf_open_llm_v1_240829_frozen.csv tora_code_34b_v1_0,HFv1 ARC,50.43,,hf_open_llm_v1_240829_frozen.csv tora_code_34b_v1_0,HFv1 GSM8K,13.12,,hf_open_llm_v1_240829_frozen.csv 
tora_code_34b_v1_0,HFv1 HellaSwag,75.54,,hf_open_llm_v1_240829_frozen.csv tora_code_34b_v1_0,HFv1 MMLU,46.78,,hf_open_llm_v1_240829_frozen.csv tora_code_34b_v1_0,HFv1 TruthfulQA,39.66,,hf_open_llm_v1_240829_frozen.csv tora_code_34b_v1_0,HFv1 Winogrande,68.19,,hf_open_llm_v1_240829_frozen.csv tora_code_7b_v1_0,HF OpenLLM v1,40.21,,hf_open_llm_v1_240829_frozen.csv tora_code_7b_v1_0,HFv1 ARC,40.7,,hf_open_llm_v1_240829_frozen.csv tora_code_7b_v1_0,HFv1 GSM8K,4.93,,hf_open_llm_v1_240829_frozen.csv tora_code_7b_v1_0,HFv1 HellaSwag,65.86,,hf_open_llm_v1_240829_frozen.csv tora_code_7b_v1_0,HFv1 MMLU,33.34,,hf_open_llm_v1_240829_frozen.csv tora_code_7b_v1_0,HFv1 TruthfulQA,34.84,,hf_open_llm_v1_240829_frozen.csv tora_code_7b_v1_0,HFv1 Winogrande,61.56,,hf_open_llm_v1_240829_frozen.csv towerinstruct_7b_v0_1,HF OpenLLM v1,52.39,,hf_open_llm_v1_240829_frozen.csv towerinstruct_7b_v0_1,HFv1 ARC,55.46,,hf_open_llm_v1_240829_frozen.csv towerinstruct_7b_v0_1,HFv1 GSM8K,16.45,,hf_open_llm_v1_240829_frozen.csv towerinstruct_7b_v0_1,HFv1 HellaSwag,79.0,,hf_open_llm_v1_240829_frozen.csv towerinstruct_7b_v0_1,HFv1 MMLU,46.88,,hf_open_llm_v1_240829_frozen.csv towerinstruct_7b_v0_1,HFv1 TruthfulQA,42.59,,hf_open_llm_v1_240829_frozen.csv towerinstruct_7b_v0_1,HFv1 Winogrande,73.95,,hf_open_llm_v1_240829_frozen.csv toxichermes_2_5_mistral_7b,HF OpenLLM v1,59.69,,hf_open_llm_v1_240829_frozen.csv toxichermes_2_5_mistral_7b,HFv1 ARC,64.59,,hf_open_llm_v1_240829_frozen.csv toxichermes_2_5_mistral_7b,HFv1 GSM8K,17.36,,hf_open_llm_v1_240829_frozen.csv toxichermes_2_5_mistral_7b,HFv1 HellaSwag,83.75,,hf_open_llm_v1_240829_frozen.csv toxichermes_2_5_mistral_7b,HFv1 MMLU,63.67,,hf_open_llm_v1_240829_frozen.csv toxichermes_2_5_mistral_7b,HFv1 TruthfulQA,50.84,,hf_open_llm_v1_240829_frozen.csv toxichermes_2_5_mistral_7b,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv traversaal_2_5_mistral_7b,HF OpenLLM v1,60.48,,hf_open_llm_v1_240829_frozen.csv traversaal_2_5_mistral_7b,HFv1 ARC,66.21,,hf_open_llm_v1_240829_frozen.csv traversaal_2_5_mistral_7b,HFv1 GSM8K,16.53,,hf_open_llm_v1_240829_frozen.csv traversaal_2_5_mistral_7b,HFv1 HellaSwag,85.02,,hf_open_llm_v1_240829_frozen.csv traversaal_2_5_mistral_7b,HFv1 MMLU,63.24,,hf_open_llm_v1_240829_frozen.csv traversaal_2_5_mistral_7b,HFv1 TruthfulQA,54.0,,hf_open_llm_v1_240829_frozen.csv traversaal_2_5_mistral_7b,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv trurl_2_13b_academic,HF OpenLLM v1,53.94,,hf_open_llm_v1_240829_frozen.csv trurl_2_13b_academic,HFv1 ARC,57.94,,hf_open_llm_v1_240829_frozen.csv trurl_2_13b_academic,HFv1 GSM8K,10.92,,hf_open_llm_v1_240829_frozen.csv trurl_2_13b_academic,HFv1 HellaSwag,79.55,,hf_open_llm_v1_240829_frozen.csv trurl_2_13b_academic,HFv1 MMLU,55.2,,hf_open_llm_v1_240829_frozen.csv trurl_2_13b_academic,HFv1 TruthfulQA,43.46,,hf_open_llm_v1_240829_frozen.csv trurl_2_13b_academic,HFv1 Winogrande,76.56,,hf_open_llm_v1_240829_frozen.csv truthful_dpo_moe_19b,HF OpenLLM v1,74.3,,hf_open_llm_v1_240829_frozen.csv truthful_dpo_moe_19b,HFv1 ARC,71.08,,hf_open_llm_v1_240829_frozen.csv truthful_dpo_moe_19b,HFv1 GSM8K,64.52,,hf_open_llm_v1_240829_frozen.csv truthful_dpo_moe_19b,HFv1 HellaSwag,88.46,,hf_open_llm_v1_240829_frozen.csv truthful_dpo_moe_19b,HFv1 MMLU,66.13,,hf_open_llm_v1_240829_frozen.csv truthful_dpo_moe_19b,HFv1 TruthfulQA,72.29,,hf_open_llm_v1_240829_frozen.csv truthful_dpo_moe_19b,HFv1 Winogrande,83.35,,hf_open_llm_v1_240829_frozen.csv truthful_dpo_tomgrc_fusionnet_7bx2_moe_13b,HF OpenLLM v1,77.44,,hf_open_llm_v1_240829_frozen.csv 
truthful_dpo_tomgrc_fusionnet_7bx2_moe_13b,HFv1 ARC,74.91,,hf_open_llm_v1_240829_frozen.csv truthful_dpo_tomgrc_fusionnet_7bx2_moe_13b,HFv1 GSM8K,69.52,,hf_open_llm_v1_240829_frozen.csv truthful_dpo_tomgrc_fusionnet_7bx2_moe_13b,HFv1 HellaSwag,89.3,,hf_open_llm_v1_240829_frozen.csv truthful_dpo_tomgrc_fusionnet_7bx2_moe_13b,HFv1 MMLU,64.67,,hf_open_llm_v1_240829_frozen.csv truthful_dpo_tomgrc_fusionnet_7bx2_moe_13b,HFv1 TruthfulQA,78.02,,hf_open_llm_v1_240829_frozen.csv truthful_dpo_tomgrc_fusionnet_7bx2_moe_13b,HFv1 Winogrande,88.24,,hf_open_llm_v1_240829_frozen.csv truthfulqwen1_5_1_8b,HF OpenLLM v1,44.81,,hf_open_llm_v1_240829_frozen.csv truthfulqwen1_5_1_8b,HFv1 ARC,38.99,,hf_open_llm_v1_240829_frozen.csv truthfulqwen1_5_1_8b,HFv1 GSM8K,14.86,,hf_open_llm_v1_240829_frozen.csv truthfulqwen1_5_1_8b,HFv1 HellaSwag,60.43,,hf_open_llm_v1_240829_frozen.csv truthfulqwen1_5_1_8b,HFv1 MMLU,44.54,,hf_open_llm_v1_240829_frozen.csv truthfulqwen1_5_1_8b,HFv1 TruthfulQA,50.86,,hf_open_llm_v1_240829_frozen.csv truthfulqwen1_5_1_8b,HFv1 Winogrande,59.19,,hf_open_llm_v1_240829_frozen.csv truthfulqwen1_5_4b,HF OpenLLM v1,57.41,,hf_open_llm_v1_240829_frozen.csv truthfulqwen1_5_4b,HFv1 ARC,47.1,,hf_open_llm_v1_240829_frozen.csv truthfulqwen1_5_4b,HFv1 GSM8K,52.54,,hf_open_llm_v1_240829_frozen.csv truthfulqwen1_5_4b,HFv1 HellaSwag,71.32,,hf_open_llm_v1_240829_frozen.csv truthfulqwen1_5_4b,HFv1 MMLU,56.04,,hf_open_llm_v1_240829_frozen.csv truthfulqwen1_5_4b,HFv1 TruthfulQA,50.6,,hf_open_llm_v1_240829_frozen.csv truthfulqwen1_5_4b,HFv1 Winogrande,66.85,,hf_open_llm_v1_240829_frozen.csv tulu_2_dpo_70b,HF OpenLLM v1,73.77,,hf_open_llm_v1_240829_frozen.csv tulu_2_dpo_70b,HFv1 ARC,72.1,,hf_open_llm_v1_240829_frozen.csv tulu_2_dpo_70b,HFv1 GSM8K,62.62,,hf_open_llm_v1_240829_frozen.csv tulu_2_dpo_70b,HFv1 HellaSwag,88.99,,hf_open_llm_v1_240829_frozen.csv tulu_2_dpo_70b,HFv1 MMLU,69.84,,hf_open_llm_v1_240829_frozen.csv tulu_2_dpo_70b,HFv1 TruthfulQA,65.78,,hf_open_llm_v1_240829_frozen.csv tulu_2_dpo_70b,HFv1 Winogrande,83.27,,hf_open_llm_v1_240829_frozen.csv tulu_2_dpo_70b_expo,HF OpenLLM v1,74.02,,hf_open_llm_v1_240829_frozen.csv tulu_2_dpo_70b_expo,HFv1 ARC,72.7,,hf_open_llm_v1_240829_frozen.csv tulu_2_dpo_70b_expo,HFv1 GSM8K,59.36,,hf_open_llm_v1_240829_frozen.csv tulu_2_dpo_70b_expo,HFv1 HellaSwag,89.29,,hf_open_llm_v1_240829_frozen.csv tulu_2_dpo_70b_expo,HFv1 MMLU,69.61,,hf_open_llm_v1_240829_frozen.csv tulu_2_dpo_70b_expo,HFv1 TruthfulQA,69.99,,hf_open_llm_v1_240829_frozen.csv tulu_2_dpo_70b_expo,HFv1 Winogrande,83.19,,hf_open_llm_v1_240829_frozen.csv tulu_2_dpo_7b_expo,HF OpenLLM v1,58.64,,hf_open_llm_v1_240829_frozen.csv tulu_2_dpo_7b_expo,HFv1 ARC,58.28,,hf_open_llm_v1_240829_frozen.csv tulu_2_dpo_7b_expo,HFv1 GSM8K,26.38,,hf_open_llm_v1_240829_frozen.csv tulu_2_dpo_7b_expo,HFv1 HellaSwag,81.36,,hf_open_llm_v1_240829_frozen.csv tulu_2_dpo_7b_expo,HFv1 MMLU,52.23,,hf_open_llm_v1_240829_frozen.csv tulu_2_dpo_7b_expo,HFv1 TruthfulQA,59.41,,hf_open_llm_v1_240829_frozen.csv tulu_2_dpo_7b_expo,HFv1 Winogrande,74.19,,hf_open_llm_v1_240829_frozen.csv turdus,HF OpenLLM v1,74.66,,hf_open_llm_v1_240829_frozen.csv turdus,HFv1 ARC,73.38,,hf_open_llm_v1_240829_frozen.csv turdus,HFv1 GSM8K,67.7,,hf_open_llm_v1_240829_frozen.csv turdus,HFv1 HellaSwag,88.56,,hf_open_llm_v1_240829_frozen.csv turdus,HFv1 MMLU,64.52,,hf_open_llm_v1_240829_frozen.csv turdus,HFv1 TruthfulQA,67.11,,hf_open_llm_v1_240829_frozen.csv turdus,HFv1 Winogrande,86.66,,hf_open_llm_v1_240829_frozen.csv turkgpt_v0_1,HF OpenLLM 
v1,65.3,,hf_open_llm_v1_240829_frozen.csv turkgpt_v0_1,HFv1 ARC,59.22,,hf_open_llm_v1_240829_frozen.csv turkgpt_v0_1,HFv1 GSM8K,60.73,,hf_open_llm_v1_240829_frozen.csv turkgpt_v0_1,HFv1 HellaSwag,80.27,,hf_open_llm_v1_240829_frozen.csv turkgpt_v0_1,HFv1 MMLU,67.0,,hf_open_llm_v1_240829_frozen.csv turkgpt_v0_1,HFv1 TruthfulQA,47.29,,hf_open_llm_v1_240829_frozen.csv turkgpt_v0_1,HFv1 Winogrande,77.27,,hf_open_llm_v1_240829_frozen.csv typhoon_7b,HF OpenLLM v1,58.05,,hf_open_llm_v1_240829_frozen.csv typhoon_7b,HFv1 ARC,58.53,,hf_open_llm_v1_240829_frozen.csv typhoon_7b,HFv1 GSM8K,31.61,,hf_open_llm_v1_240829_frozen.csv typhoon_7b,HFv1 HellaSwag,81.55,,hf_open_llm_v1_240829_frozen.csv typhoon_7b,HFv1 MMLU,59.54,,hf_open_llm_v1_240829_frozen.csv typhoon_7b,HFv1 TruthfulQA,40.52,,hf_open_llm_v1_240829_frozen.csv typhoon_7b,HFv1 Winogrande,76.56,,hf_open_llm_v1_240829_frozen.csv typhoon_7b_instruct_01_30_2024,HF OpenLLM v1,66.21,,hf_open_llm_v1_240829_frozen.csv typhoon_7b_instruct_01_30_2024,HFv1 ARC,61.86,,hf_open_llm_v1_240829_frozen.csv typhoon_7b_instruct_01_30_2024,HFv1 GSM8K,63.31,,hf_open_llm_v1_240829_frozen.csv typhoon_7b_instruct_01_30_2024,HFv1 HellaSwag,81.3,,hf_open_llm_v1_240829_frozen.csv typhoon_7b_instruct_01_30_2024,HFv1 MMLU,60.72,,hf_open_llm_v1_240829_frozen.csv typhoon_7b_instruct_01_30_2024,HFv1 TruthfulQA,52.6,,hf_open_llm_v1_240829_frozen.csv typhoon_7b_instruct_01_30_2024,HFv1 Winogrande,77.51,,hf_open_llm_v1_240829_frozen.csv typhoon_7b_instruct_02_19_2024,HF OpenLLM v1,65.39,,hf_open_llm_v1_240829_frozen.csv typhoon_7b_instruct_02_19_2024,HFv1 ARC,61.95,,hf_open_llm_v1_240829_frozen.csv typhoon_7b_instruct_02_19_2024,HFv1 GSM8K,58.45,,hf_open_llm_v1_240829_frozen.csv typhoon_7b_instruct_02_19_2024,HFv1 HellaSwag,81.51,,hf_open_llm_v1_240829_frozen.csv typhoon_7b_instruct_02_19_2024,HFv1 MMLU,61.86,,hf_open_llm_v1_240829_frozen.csv typhoon_7b_instruct_02_19_2024,HFv1 TruthfulQA,49.94,,hf_open_llm_v1_240829_frozen.csv typhoon_7b_instruct_02_19_2024,HFv1 Winogrande,78.61,,hf_open_llm_v1_240829_frozen.csv typhoon_7b_wangchanx_sft_demo,HF OpenLLM v1,61.17,,hf_open_llm_v1_240829_frozen.csv typhoon_7b_wangchanx_sft_demo,HFv1 ARC,58.96,,hf_open_llm_v1_240829_frozen.csv typhoon_7b_wangchanx_sft_demo,HFv1 GSM8K,46.78,,hf_open_llm_v1_240829_frozen.csv typhoon_7b_wangchanx_sft_demo,HFv1 HellaSwag,82.38,,hf_open_llm_v1_240829_frozen.csv typhoon_7b_wangchanx_sft_demo,HFv1 MMLU,57.67,,hf_open_llm_v1_240829_frozen.csv typhoon_7b_wangchanx_sft_demo,HFv1 TruthfulQA,44.83,,hf_open_llm_v1_240829_frozen.csv typhoon_7b_wangchanx_sft_demo,HFv1 Winogrande,76.4,,hf_open_llm_v1_240829_frozen.csv ultra0,HF OpenLLM v1,44.32,,hf_open_llm_v1_240829_frozen.csv ultra0,HFv1 ARC,41.47,,hf_open_llm_v1_240829_frozen.csv ultra0,HFv1 GSM8K,16.07,,hf_open_llm_v1_240829_frozen.csv ultra0,HFv1 HellaSwag,68.02,,hf_open_llm_v1_240829_frozen.csv ultra0,HFv1 MMLU,33.37,,hf_open_llm_v1_240829_frozen.csv ultra0,HFv1 TruthfulQA,41.49,,hf_open_llm_v1_240829_frozen.csv ultra0,HFv1 Winogrande,65.51,,hf_open_llm_v1_240829_frozen.csv ultracatunamayo_dpo,HF OpenLLM v1,75.96,,hf_open_llm_v1_240829_frozen.csv ultracatunamayo_dpo,HFv1 ARC,72.87,,hf_open_llm_v1_240829_frozen.csv ultracatunamayo_dpo,HFv1 GSM8K,68.54,,hf_open_llm_v1_240829_frozen.csv ultracatunamayo_dpo,HFv1 HellaSwag,88.75,,hf_open_llm_v1_240829_frozen.csv ultracatunamayo_dpo,HFv1 MMLU,65.18,,hf_open_llm_v1_240829_frozen.csv ultracatunamayo_dpo,HFv1 TruthfulQA,76.44,,hf_open_llm_v1_240829_frozen.csv ultracatunamayo_dpo,HFv1 
Winogrande,83.98,,hf_open_llm_v1_240829_frozen.csv ultramerge_7b,HF OpenLLM v1,76.49,,hf_open_llm_v1_240829_frozen.csv ultramerge_7b,HFv1 ARC,73.04,,hf_open_llm_v1_240829_frozen.csv ultramerge_7b,HFv1 GSM8K,69.22,,hf_open_llm_v1_240829_frozen.csv ultramerge_7b,HFv1 HellaSwag,89.25,,hf_open_llm_v1_240829_frozen.csv ultramerge_7b,HFv1 MMLU,64.4,,hf_open_llm_v1_240829_frozen.csv ultramerge_7b,HFv1 TruthfulQA,78.17,,hf_open_llm_v1_240829_frozen.csv ultramerge_7b,HFv1 Winogrande,84.85,,hf_open_llm_v1_240829_frozen.csv una_solar_10_7b_instruct_v1_0,HF OpenLLM v1,74.07,,hf_open_llm_v1_240829_frozen.csv una_solar_10_7b_instruct_v1_0,HFv1 ARC,70.73,,hf_open_llm_v1_240829_frozen.csv una_solar_10_7b_instruct_v1_0,HFv1 GSM8K,63.38,,hf_open_llm_v1_240829_frozen.csv una_solar_10_7b_instruct_v1_0,HFv1 HellaSwag,88.32,,hf_open_llm_v1_240829_frozen.csv una_solar_10_7b_instruct_v1_0,HFv1 MMLU,66.1,,hf_open_llm_v1_240829_frozen.csv una_solar_10_7b_instruct_v1_0,HFv1 TruthfulQA,72.52,,hf_open_llm_v1_240829_frozen.csv una_solar_10_7b_instruct_v1_0,HFv1 Winogrande,83.35,,hf_open_llm_v1_240829_frozen.csv una_thebeagle_7b_v1,HF OpenLLM v1,73.87,,hf_open_llm_v1_240829_frozen.csv una_thebeagle_7b_v1,HFv1 ARC,73.04,,hf_open_llm_v1_240829_frozen.csv una_thebeagle_7b_v1,HFv1 GSM8K,66.72,,hf_open_llm_v1_240829_frozen.csv una_thebeagle_7b_v1,HFv1 HellaSwag,88.0,,hf_open_llm_v1_240829_frozen.csv una_thebeagle_7b_v1,HFv1 MMLU,63.48,,hf_open_llm_v1_240829_frozen.csv una_thebeagle_7b_v1,HFv1 TruthfulQA,69.85,,hf_open_llm_v1_240829_frozen.csv una_thebeagle_7b_v1,HFv1 Winogrande,82.16,,hf_open_llm_v1_240829_frozen.csv unsafe_llama3_8b,HF OpenLLM v1,67.44,,hf_open_llm_v1_240829_frozen.csv unsafe_llama3_8b,HFv1 ARC,62.63,,hf_open_llm_v1_240829_frozen.csv unsafe_llama3_8b,HFv1 GSM8K,68.16,,hf_open_llm_v1_240829_frozen.csv unsafe_llama3_8b,HFv1 HellaSwag,79.73,,hf_open_llm_v1_240829_frozen.csv unsafe_llama3_8b,HFv1 MMLU,67.05,,hf_open_llm_v1_240829_frozen.csv unsafe_llama3_8b,HFv1 TruthfulQA,51.52,,hf_open_llm_v1_240829_frozen.csv unsafe_llama3_8b,HFv1 Winogrande,75.53,,hf_open_llm_v1_240829_frozen.csv v1,HF OpenLLM v1,46.35,,hf_open_llm_v1_240829_frozen.csv v1,HFv1 ARC,48.12,,hf_open_llm_v1_240829_frozen.csv v1,HFv1 GSM8K,17.44,,hf_open_llm_v1_240829_frozen.csv v1,HFv1 HellaSwag,71.6,,hf_open_llm_v1_240829_frozen.csv v1,HFv1 MMLU,41.83,,hf_open_llm_v1_240829_frozen.csv v1,HFv1 TruthfulQA,33.04,,hf_open_llm_v1_240829_frozen.csv v1,HFv1 Winogrande,66.06,,hf_open_llm_v1_240829_frozen.csv v1olet_merged_dpo_7b,HF OpenLLM v1,70.26,,hf_open_llm_v1_240829_frozen.csv v1olet_merged_dpo_7b,HFv1 ARC,71.33,,hf_open_llm_v1_240829_frozen.csv v1olet_merged_dpo_7b,HFv1 GSM8K,53.37,,hf_open_llm_v1_240829_frozen.csv v1olet_merged_dpo_7b,HFv1 HellaSwag,87.34,,hf_open_llm_v1_240829_frozen.csv v1olet_merged_dpo_7b,HFv1 MMLU,64.13,,hf_open_llm_v1_240829_frozen.csv v1olet_merged_dpo_7b,HFv1 TruthfulQA,63.37,,hf_open_llm_v1_240829_frozen.csv v1olet_merged_dpo_7b,HFv1 Winogrande,82.0,,hf_open_llm_v1_240829_frozen.csv v1olet_merged_dpo_7b_v3,HF OpenLLM v1,73.68,,hf_open_llm_v1_240829_frozen.csv v1olet_merged_dpo_7b_v3,HFv1 ARC,72.61,,hf_open_llm_v1_240829_frozen.csv v1olet_merged_dpo_7b_v3,HFv1 GSM8K,66.87,,hf_open_llm_v1_240829_frozen.csv v1olet_merged_dpo_7b_v3,HFv1 HellaSwag,87.7,,hf_open_llm_v1_240829_frozen.csv v1olet_merged_dpo_7b_v3,HFv1 MMLU,63.51,,hf_open_llm_v1_240829_frozen.csv v1olet_merged_dpo_7b_v3,HFv1 TruthfulQA,69.07,,hf_open_llm_v1_240829_frozen.csv v1olet_merged_dpo_7b_v3,HFv1 Winogrande,82.32,,hf_open_llm_v1_240829_frozen.csv 
v1olet_merged_dpo_7b_v4,HF OpenLLM v1,64.3,,hf_open_llm_v1_240829_frozen.csv v1olet_merged_dpo_7b_v4,HFv1 ARC,66.98,,hf_open_llm_v1_240829_frozen.csv v1olet_merged_dpo_7b_v4,HFv1 GSM8K,35.25,,hf_open_llm_v1_240829_frozen.csv v1olet_merged_dpo_7b_v4,HFv1 HellaSwag,84.09,,hf_open_llm_v1_240829_frozen.csv v1olet_merged_dpo_7b_v4,HFv1 MMLU,59.02,,hf_open_llm_v1_240829_frozen.csv v1olet_merged_dpo_7b_v4,HFv1 TruthfulQA,59.43,,hf_open_llm_v1_240829_frozen.csv v1olet_merged_dpo_7b_v4,HFv1 Winogrande,81.06,,hf_open_llm_v1_240829_frozen.csv v_alpha_tross,HF OpenLLM v1,73.28,,hf_open_llm_v1_240829_frozen.csv v_alpha_tross,HFv1 ARC,71.93,,hf_open_llm_v1_240829_frozen.csv v_alpha_tross,HFv1 GSM8K,61.79,,hf_open_llm_v1_240829_frozen.csv v_alpha_tross,HFv1 HellaSwag,86.82,,hf_open_llm_v1_240829_frozen.csv v_alpha_tross,HFv1 MMLU,70.38,,hf_open_llm_v1_240829_frozen.csv v_alpha_tross,HFv1 TruthfulQA,65.21,,hf_open_llm_v1_240829_frozen.csv v_alpha_tross,HFv1 Winogrande,83.58,,hf_open_llm_v1_240829_frozen.csv velara_11b_v3,HF OpenLLM v1,57.58,,hf_open_llm_v1_240829_frozen.csv velara_11b_v3,HFv1 ARC,57.51,,hf_open_llm_v1_240829_frozen.csv velara_11b_v3,HFv1 GSM8K,21.99,,hf_open_llm_v1_240829_frozen.csv velara_11b_v3,HFv1 HellaSwag,75.7,,hf_open_llm_v1_240829_frozen.csv velara_11b_v3,HFv1 MMLU,65.85,,hf_open_llm_v1_240829_frozen.csv velara_11b_v3,HFv1 TruthfulQA,50.19,,hf_open_llm_v1_240829_frozen.csv velara_11b_v3,HFv1 Winogrande,74.27,,hf_open_llm_v1_240829_frozen.csv venus_dpo_50,HF OpenLLM v1,74.2,,hf_open_llm_v1_240829_frozen.csv venus_dpo_50,HFv1 ARC,70.73,,hf_open_llm_v1_240829_frozen.csv venus_dpo_50,HFv1 GSM8K,63.61,,hf_open_llm_v1_240829_frozen.csv venus_dpo_50,HFv1 HellaSwag,88.47,,hf_open_llm_v1_240829_frozen.csv venus_dpo_50,HFv1 MMLU,66.3,,hf_open_llm_v1_240829_frozen.csv venus_dpo_50,HFv1 TruthfulQA,72.63,,hf_open_llm_v1_240829_frozen.csv venus_dpo_50,HFv1 Winogrande,83.43,,hf_open_llm_v1_240829_frozen.csv verysmol_llama_v11_kix2,HF OpenLLM v1,28.7,,hf_open_llm_v1_240829_frozen.csv verysmol_llama_v11_kix2,HFv1 ARC,22.7,,hf_open_llm_v1_240829_frozen.csv verysmol_llama_v11_kix2,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv verysmol_llama_v11_kix2,HFv1 HellaSwag,27.6,,hf_open_llm_v1_240829_frozen.csv verysmol_llama_v11_kix2,HFv1 MMLU,25.28,,hf_open_llm_v1_240829_frozen.csv verysmol_llama_v11_kix2,HFv1 TruthfulQA,44.75,,hf_open_llm_v1_240829_frozen.csv verysmol_llama_v11_kix2,HFv1 Winogrande,51.54,,hf_open_llm_v1_240829_frozen.csv vicuna_7b_v1_3_attention_sparsity_10,HF OpenLLM v1,51.13,,hf_open_llm_v1_240829_frozen.csv vicuna_7b_v1_3_attention_sparsity_10,HFv1 ARC,52.22,,hf_open_llm_v1_240829_frozen.csv vicuna_7b_v1_3_attention_sparsity_10,HFv1 GSM8K,13.19,,hf_open_llm_v1_240829_frozen.csv vicuna_7b_v1_3_attention_sparsity_10,HFv1 HellaSwag,77.05,,hf_open_llm_v1_240829_frozen.csv vicuna_7b_v1_3_attention_sparsity_10,HFv1 MMLU,47.93,,hf_open_llm_v1_240829_frozen.csv vicuna_7b_v1_3_attention_sparsity_10,HFv1 TruthfulQA,46.87,,hf_open_llm_v1_240829_frozen.csv vicuna_7b_v1_3_attention_sparsity_10,HFv1 Winogrande,69.53,,hf_open_llm_v1_240829_frozen.csv vicuna_7b_v1_3_attention_sparsity_20,HF OpenLLM v1,50.63,,hf_open_llm_v1_240829_frozen.csv vicuna_7b_v1_3_attention_sparsity_20,HFv1 ARC,52.3,,hf_open_llm_v1_240829_frozen.csv vicuna_7b_v1_3_attention_sparsity_20,HFv1 GSM8K,11.22,,hf_open_llm_v1_240829_frozen.csv vicuna_7b_v1_3_attention_sparsity_20,HFv1 HellaSwag,77.05,,hf_open_llm_v1_240829_frozen.csv vicuna_7b_v1_3_attention_sparsity_20,HFv1 MMLU,47.39,,hf_open_llm_v1_240829_frozen.csv 
vicuna_7b_v1_3_attention_sparsity_20,HFv1 TruthfulQA,46.62,,hf_open_llm_v1_240829_frozen.csv vicuna_7b_v1_3_attention_sparsity_20,HFv1 Winogrande,69.22,,hf_open_llm_v1_240829_frozen.csv vicuna_7b_v1_3_attention_sparsity_30,HF OpenLLM v1,50.33,,hf_open_llm_v1_240829_frozen.csv vicuna_7b_v1_3_attention_sparsity_30,HFv1 ARC,51.02,,hf_open_llm_v1_240829_frozen.csv vicuna_7b_v1_3_attention_sparsity_30,HFv1 GSM8K,12.36,,hf_open_llm_v1_240829_frozen.csv vicuna_7b_v1_3_attention_sparsity_30,HFv1 HellaSwag,76.41,,hf_open_llm_v1_240829_frozen.csv vicuna_7b_v1_3_attention_sparsity_30,HFv1 MMLU,46.83,,hf_open_llm_v1_240829_frozen.csv vicuna_7b_v1_3_attention_sparsity_30,HFv1 TruthfulQA,46.06,,hf_open_llm_v1_240829_frozen.csv vicuna_7b_v1_3_attention_sparsity_30,HFv1 Winogrande,69.3,,hf_open_llm_v1_240829_frozen.csv vigogne2_enno_13b_sft_lora_4bit,HF OpenLLM v1,53.15,,hf_open_llm_v1_240829_frozen.csv vigogne2_enno_13b_sft_lora_4bit,HFv1 ARC,62.03,,hf_open_llm_v1_240829_frozen.csv vigogne2_enno_13b_sft_lora_4bit,HFv1 GSM8K,0.15,,hf_open_llm_v1_240829_frozen.csv vigogne2_enno_13b_sft_lora_4bit,HFv1 HellaSwag,82.65,,hf_open_llm_v1_240829_frozen.csv vigogne2_enno_13b_sft_lora_4bit,HFv1 MMLU,54.11,,hf_open_llm_v1_240829_frozen.csv vigogne2_enno_13b_sft_lora_4bit,HFv1 TruthfulQA,42.98,,hf_open_llm_v1_240829_frozen.csv vigogne2_enno_13b_sft_lora_4bit,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv vortex_3b_v2,HF OpenLLM v1,37.46,,hf_open_llm_v1_240829_frozen.csv vortex_3b_v2,HFv1 ARC,39.68,,hf_open_llm_v1_240829_frozen.csv vortex_3b_v2,HFv1 GSM8K,2.05,,hf_open_llm_v1_240829_frozen.csv vortex_3b_v2,HFv1 HellaSwag,65.04,,hf_open_llm_v1_240829_frozen.csv vortex_3b_v2,HFv1 MMLU,25.09,,hf_open_llm_v1_240829_frozen.csv vortex_3b_v2,HFv1 TruthfulQA,33.8,,hf_open_llm_v1_240829_frozen.csv vortex_3b_v2,HFv1 Winogrande,59.12,,hf_open_llm_v1_240829_frozen.csv walter_falcon_1b,HF OpenLLM v1,34.07,,hf_open_llm_v1_240829_frozen.csv walter_falcon_1b,HFv1 ARC,31.06,,hf_open_llm_v1_240829_frozen.csv walter_falcon_1b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv walter_falcon_1b,HFv1 HellaSwag,54.92,,hf_open_llm_v1_240829_frozen.csv walter_falcon_1b,HFv1 MMLU,24.58,,hf_open_llm_v1_240829_frozen.csv walter_falcon_1b,HFv1 TruthfulQA,38.47,,hf_open_llm_v1_240829_frozen.csv walter_falcon_1b,HFv1 Winogrande,55.41,,hf_open_llm_v1_240829_frozen.csv walter_llama_1b,HF OpenLLM v1,35.29,,hf_open_llm_v1_240829_frozen.csv walter_llama_1b,HFv1 ARC,32.85,,hf_open_llm_v1_240829_frozen.csv walter_llama_1b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv walter_llama_1b,HFv1 HellaSwag,61.05,,hf_open_llm_v1_240829_frozen.csv walter_llama_1b,HFv1 MMLU,27.46,,hf_open_llm_v1_240829_frozen.csv walter_llama_1b,HFv1 TruthfulQA,33.93,,hf_open_llm_v1_240829_frozen.csv walter_llama_1b,HFv1 Winogrande,56.43,,hf_open_llm_v1_240829_frozen.csv walter_mistral_7b,HF OpenLLM v1,53.0,,hf_open_llm_v1_240829_frozen.csv walter_mistral_7b,HFv1 ARC,58.87,,hf_open_llm_v1_240829_frozen.csv walter_mistral_7b,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv walter_mistral_7b,HFv1 HellaSwag,83.43,,hf_open_llm_v1_240829_frozen.csv walter_mistral_7b,HFv1 MMLU,58.65,,hf_open_llm_v1_240829_frozen.csv walter_mistral_7b,HFv1 TruthfulQA,39.93,,hf_open_llm_v1_240829_frozen.csv walter_mistral_7b,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv walter_solar_11b,HF OpenLLM v1,55.95,,hf_open_llm_v1_240829_frozen.csv walter_solar_11b,HFv1 ARC,60.41,,hf_open_llm_v1_240829_frozen.csv walter_solar_11b,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv 
walter_solar_11b,HFv1 HellaSwag,84.86,,hf_open_llm_v1_240829_frozen.csv walter_solar_11b,HFv1 MMLU,64.99,,hf_open_llm_v1_240829_frozen.csv walter_solar_11b,HFv1 TruthfulQA,44.88,,hf_open_llm_v1_240829_frozen.csv walter_solar_11b,HFv1 Winogrande,79.56,,hf_open_llm_v1_240829_frozen.csv weblab_10b,HF OpenLLM v1,38.59,,hf_open_llm_v1_240829_frozen.csv weblab_10b,HFv1 ARC,39.51,,hf_open_llm_v1_240829_frozen.csv weblab_10b,HFv1 GSM8K,1.44,,hf_open_llm_v1_240829_frozen.csv weblab_10b,HFv1 HellaSwag,65.76,,hf_open_llm_v1_240829_frozen.csv weblab_10b,HFv1 MMLU,26.29,,hf_open_llm_v1_240829_frozen.csv weblab_10b,HFv1 TruthfulQA,36.02,,hf_open_llm_v1_240829_frozen.csv weblab_10b,HFv1 Winogrande,62.51,,hf_open_llm_v1_240829_frozen.csv weblab_10b_instruction_sft,HF OpenLLM v1,39.13,,hf_open_llm_v1_240829_frozen.csv weblab_10b_instruction_sft,HFv1 ARC,40.1,,hf_open_llm_v1_240829_frozen.csv weblab_10b_instruction_sft,HFv1 GSM8K,1.82,,hf_open_llm_v1_240829_frozen.csv weblab_10b_instruction_sft,HFv1 HellaSwag,65.3,,hf_open_llm_v1_240829_frozen.csv weblab_10b_instruction_sft,HFv1 MMLU,26.66,,hf_open_llm_v1_240829_frozen.csv weblab_10b_instruction_sft,HFv1 TruthfulQA,36.79,,hf_open_llm_v1_240829_frozen.csv weblab_10b_instruction_sft,HFv1 Winogrande,64.09,,hf_open_llm_v1_240829_frozen.csv westmonarchlasers_7b_slerp,HF OpenLLM v1,75.23,,hf_open_llm_v1_240829_frozen.csv westmonarchlasers_7b_slerp,HFv1 ARC,72.44,,hf_open_llm_v1_240829_frozen.csv westmonarchlasers_7b_slerp,HFv1 GSM8K,67.63,,hf_open_llm_v1_240829_frozen.csv westmonarchlasers_7b_slerp,HFv1 HellaSwag,88.66,,hf_open_llm_v1_240829_frozen.csv westmonarchlasers_7b_slerp,HFv1 MMLU,64.73,,hf_open_llm_v1_240829_frozen.csv westmonarchlasers_7b_slerp,HFv1 TruthfulQA,72.4,,hf_open_llm_v1_240829_frozen.csv westmonarchlasers_7b_slerp,HFv1 Winogrande,85.56,,hf_open_llm_v1_240829_frozen.csv westseverus_7b_dpo_v2,HF OpenLLM v1,75.28,,hf_open_llm_v1_240829_frozen.csv westseverus_7b_dpo_v2,HFv1 ARC,71.42,,hf_open_llm_v1_240829_frozen.csv westseverus_7b_dpo_v2,HFv1 GSM8K,71.57,,hf_open_llm_v1_240829_frozen.csv westseverus_7b_dpo_v2,HFv1 HellaSwag,88.24,,hf_open_llm_v1_240829_frozen.csv westseverus_7b_dpo_v2,HFv1 MMLU,64.8,,hf_open_llm_v1_240829_frozen.csv westseverus_7b_dpo_v2,HFv1 TruthfulQA,72.37,,hf_open_llm_v1_240829_frozen.csv westseverus_7b_dpo_v2,HFv1 Winogrande,83.27,,hf_open_llm_v1_240829_frozen.csv westuccine_7b_slerp,HF OpenLLM v1,70.08,,hf_open_llm_v1_240829_frozen.csv westuccine_7b_slerp,HFv1 ARC,69.37,,hf_open_llm_v1_240829_frozen.csv westuccine_7b_slerp,HFv1 GSM8K,48.52,,hf_open_llm_v1_240829_frozen.csv westuccine_7b_slerp,HFv1 HellaSwag,87.34,,hf_open_llm_v1_240829_frozen.csv westuccine_7b_slerp,HFv1 MMLU,63.8,,hf_open_llm_v1_240829_frozen.csv westuccine_7b_slerp,HFv1 TruthfulQA,69.34,,hf_open_llm_v1_240829_frozen.csv westuccine_7b_slerp,HFv1 Winogrande,82.08,,hf_open_llm_v1_240829_frozen.csv westuccinebagel_7b_slerp,HF OpenLLM v1,71.01,,hf_open_llm_v1_240829_frozen.csv westuccinebagel_7b_slerp,HFv1 ARC,69.37,,hf_open_llm_v1_240829_frozen.csv westuccinebagel_7b_slerp,HFv1 GSM8K,55.72,,hf_open_llm_v1_240829_frozen.csv westuccinebagel_7b_slerp,HFv1 HellaSwag,86.53,,hf_open_llm_v1_240829_frozen.csv westuccinebagel_7b_slerp,HFv1 MMLU,64.8,,hf_open_llm_v1_240829_frozen.csv westuccinebagel_7b_slerp,HFv1 TruthfulQA,67.06,,hf_open_llm_v1_240829_frozen.csv westuccinebagel_7b_slerp,HFv1 Winogrande,82.56,,hf_open_llm_v1_240829_frozen.csv where_llambo_7b,HF OpenLLM v1,66.08,,hf_open_llm_v1_240829_frozen.csv where_llambo_7b,HFv1 
ARC,58.45,,hf_open_llm_v1_240829_frozen.csv where_llambo_7b,HFv1 GSM8K,65.2,,hf_open_llm_v1_240829_frozen.csv where_llambo_7b,HFv1 HellaSwag,82.06,,hf_open_llm_v1_240829_frozen.csv where_llambo_7b,HFv1 MMLU,62.61,,hf_open_llm_v1_240829_frozen.csv where_llambo_7b,HFv1 TruthfulQA,49.61,,hf_open_llm_v1_240829_frozen.csv where_llambo_7b,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv whyarewestillhere_7b_slerp,HF OpenLLM v1,73.96,,hf_open_llm_v1_240829_frozen.csv whyarewestillhere_7b_slerp,HFv1 ARC,71.67,,hf_open_llm_v1_240829_frozen.csv whyarewestillhere_7b_slerp,HFv1 GSM8K,65.35,,hf_open_llm_v1_240829_frozen.csv whyarewestillhere_7b_slerp,HFv1 HellaSwag,88.25,,hf_open_llm_v1_240829_frozen.csv whyarewestillhere_7b_slerp,HFv1 MMLU,64.92,,hf_open_llm_v1_240829_frozen.csv whyarewestillhere_7b_slerp,HFv1 TruthfulQA,68.12,,hf_open_llm_v1_240829_frozen.csv whyarewestillhere_7b_slerp,HFv1 Winogrande,85.48,,hf_open_llm_v1_240829_frozen.csv wizardchatml_7b_v0,HF OpenLLM v1,62.91,,hf_open_llm_v1_240829_frozen.csv wizardchatml_7b_v0,HFv1 ARC,60.49,,hf_open_llm_v1_240829_frozen.csv wizardchatml_7b_v0,HFv1 GSM8K,47.92,,hf_open_llm_v1_240829_frozen.csv wizardchatml_7b_v0,HFv1 HellaSwag,80.62,,hf_open_llm_v1_240829_frozen.csv wizardchatml_7b_v0,HFv1 MMLU,61.65,,hf_open_llm_v1_240829_frozen.csv wizardchatml_7b_v0,HFv1 TruthfulQA,50.58,,hf_open_llm_v1_240829_frozen.csv wizardchatml_7b_v0,HFv1 Winogrande,76.16,,hf_open_llm_v1_240829_frozen.csv wizardcoder_python_13b_lora,HF OpenLLM v1,45.56,,hf_open_llm_v1_240829_frozen.csv wizardcoder_python_13b_lora,HFv1 ARC,47.78,,hf_open_llm_v1_240829_frozen.csv wizardcoder_python_13b_lora,HFv1 GSM8K,7.81,,hf_open_llm_v1_240829_frozen.csv wizardcoder_python_13b_lora,HFv1 HellaSwag,69.6,,hf_open_llm_v1_240829_frozen.csv wizardcoder_python_13b_lora,HFv1 MMLU,38.76,,hf_open_llm_v1_240829_frozen.csv wizardcoder_python_13b_lora,HFv1 TruthfulQA,43.97,,hf_open_llm_v1_240829_frozen.csv wizardcoder_python_13b_lora,HFv1 Winogrande,65.43,,hf_open_llm_v1_240829_frozen.csv wizardcoder_python_34b_v1_0,HF OpenLLM v1,50.46,,hf_open_llm_v1_240829_frozen.csv wizardcoder_python_34b_v1_0,HFv1 ARC,52.13,,hf_open_llm_v1_240829_frozen.csv wizardcoder_python_34b_v1_0,HFv1 GSM8K,9.48,,hf_open_llm_v1_240829_frozen.csv wizardcoder_python_34b_v1_0,HFv1 HellaSwag,74.78,,hf_open_llm_v1_240829_frozen.csv wizardcoder_python_34b_v1_0,HFv1 MMLU,49.15,,hf_open_llm_v1_240829_frozen.csv wizardcoder_python_34b_v1_0,HFv1 TruthfulQA,48.85,,hf_open_llm_v1_240829_frozen.csv wizardcoder_python_34b_v1_0,HFv1 Winogrande,68.35,,hf_open_llm_v1_240829_frozen.csv wizardlm_1_0_uncensored_codellama34b,HF OpenLLM v1,53.59,,hf_open_llm_v1_240829_frozen.csv wizardlm_1_0_uncensored_codellama34b,HFv1 ARC,56.4,,hf_open_llm_v1_240829_frozen.csv wizardlm_1_0_uncensored_codellama34b,HFv1 GSM8K,19.64,,hf_open_llm_v1_240829_frozen.csv wizardlm_1_0_uncensored_codellama34b,HFv1 HellaSwag,75.45,,hf_open_llm_v1_240829_frozen.csv wizardlm_1_0_uncensored_codellama34b,HFv1 MMLU,54.51,,hf_open_llm_v1_240829_frozen.csv wizardlm_1_0_uncensored_codellama34b,HFv1 TruthfulQA,43.06,,hf_open_llm_v1_240829_frozen.csv wizardlm_1_0_uncensored_codellama34b,HFv1 Winogrande,72.45,,hf_open_llm_v1_240829_frozen.csv wizardlm_30b_v1_0,HF OpenLLM v1,28.96,,hf_open_llm_v1_240829_frozen.csv wizardlm_30b_v1_0,HFv1 ARC,27.39,,hf_open_llm_v1_240829_frozen.csv wizardlm_30b_v1_0,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv wizardlm_30b_v1_0,HFv1 HellaSwag,25.94,,hf_open_llm_v1_240829_frozen.csv wizardlm_30b_v1_0,HFv1 
MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv wizardlm_30b_v1_0,HFv1 TruthfulQA,48.61,,hf_open_llm_v1_240829_frozen.csv wizardlm_30b_v1_0,HFv1 Winogrande,48.7,,hf_open_llm_v1_240829_frozen.csv wizardlm_70b_v1_0,HF OpenLLM v1,61.25,,hf_open_llm_v1_240829_frozen.csv wizardlm_70b_v1_0,HFv1 ARC,65.44,,hf_open_llm_v1_240829_frozen.csv wizardlm_70b_v1_0,HFv1 GSM8K,17.97,,hf_open_llm_v1_240829_frozen.csv wizardlm_70b_v1_0,HFv1 HellaSwag,84.41,,hf_open_llm_v1_240829_frozen.csv wizardlm_70b_v1_0,HFv1 MMLU,64.05,,hf_open_llm_v1_240829_frozen.csv wizardlm_70b_v1_0,HFv1 TruthfulQA,54.81,,hf_open_llm_v1_240829_frozen.csv wizardlm_70b_v1_0,HFv1 Winogrande,80.82,,hf_open_llm_v1_240829_frozen.csv wizardmath_13b_v1_0,HF OpenLLM v1,53.97,,hf_open_llm_v1_240829_frozen.csv wizardmath_13b_v1_0,HFv1 ARC,60.07,,hf_open_llm_v1_240829_frozen.csv wizardmath_13b_v1_0,HFv1 GSM8K,12.36,,hf_open_llm_v1_240829_frozen.csv wizardmath_13b_v1_0,HFv1 HellaSwag,82.01,,hf_open_llm_v1_240829_frozen.csv wizardmath_13b_v1_0,HFv1 MMLU,54.8,,hf_open_llm_v1_240829_frozen.csv wizardmath_13b_v1_0,HFv1 TruthfulQA,42.7,,hf_open_llm_v1_240829_frozen.csv wizardmath_13b_v1_0,HFv1 Winogrande,71.9,,hf_open_llm_v1_240829_frozen.csv wizardmath_70b_v1_0,HF OpenLLM v1,60.42,,hf_open_llm_v1_240829_frozen.csv wizardmath_70b_v1_0,HFv1 ARC,68.17,,hf_open_llm_v1_240829_frozen.csv wizardmath_70b_v1_0,HFv1 GSM8K,4.09,,hf_open_llm_v1_240829_frozen.csv wizardmath_70b_v1_0,HFv1 HellaSwag,86.49,,hf_open_llm_v1_240829_frozen.csv wizardmath_70b_v1_0,HFv1 MMLU,68.92,,hf_open_llm_v1_240829_frozen.csv wizardmath_70b_v1_0,HFv1 TruthfulQA,52.77,,hf_open_llm_v1_240829_frozen.csv wizardmath_70b_v1_0,HFv1 Winogrande,82.32,,hf_open_llm_v1_240829_frozen.csv wizardmath_7b_v1_1,HF OpenLLM v1,66.61,,hf_open_llm_v1_240829_frozen.csv wizardmath_7b_v1_1,HFv1 ARC,61.86,,hf_open_llm_v1_240829_frozen.csv wizardmath_7b_v1_1,HFv1 GSM8K,67.4,,hf_open_llm_v1_240829_frozen.csv wizardmath_7b_v1_1,HFv1 HellaSwag,84.5,,hf_open_llm_v1_240829_frozen.csv wizardmath_7b_v1_1,HFv1 MMLU,61.53,,hf_open_llm_v1_240829_frozen.csv wizardmath_7b_v1_1,HFv1 TruthfulQA,47.04,,hf_open_llm_v1_240829_frozen.csv wizardmath_7b_v1_1,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv wizardvicuna2_13b,HF OpenLLM v1,51.05,,hf_open_llm_v1_240829_frozen.csv wizardvicuna2_13b,HFv1 ARC,55.38,,hf_open_llm_v1_240829_frozen.csv wizardvicuna2_13b,HFv1 GSM8K,7.43,,hf_open_llm_v1_240829_frozen.csv wizardvicuna2_13b,HFv1 HellaSwag,79.14,,hf_open_llm_v1_240829_frozen.csv wizardvicuna2_13b,HFv1 MMLU,48.46,,hf_open_llm_v1_240829_frozen.csv wizardvicuna2_13b,HFv1 TruthfulQA,42.43,,hf_open_llm_v1_240829_frozen.csv wizardvicuna2_13b,HFv1 Winogrande,73.48,,hf_open_llm_v1_240829_frozen.csv wizardvicuna_open_llama3b_v2,HF OpenLLM v1,38.77,,hf_open_llm_v1_240829_frozen.csv wizardvicuna_open_llama3b_v2,HFv1 ARC,37.71,,hf_open_llm_v1_240829_frozen.csv wizardvicuna_open_llama3b_v2,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv wizardvicuna_open_llama3b_v2,HFv1 HellaSwag,66.6,,hf_open_llm_v1_240829_frozen.csv wizardvicuna_open_llama3b_v2,HFv1 MMLU,27.23,,hf_open_llm_v1_240829_frozen.csv wizardvicuna_open_llama3b_v2,HFv1 TruthfulQA,36.8,,hf_open_llm_v1_240829_frozen.csv wizardvicuna_open_llama3b_v2,HFv1 Winogrande,63.3,,hf_open_llm_v1_240829_frozen.csv worldsim_hermes_7b,HF OpenLLM v1,66.26,,hf_open_llm_v1_240829_frozen.csv worldsim_hermes_7b,HFv1 ARC,64.08,,hf_open_llm_v1_240829_frozen.csv worldsim_hermes_7b,HFv1 GSM8K,56.63,,hf_open_llm_v1_240829_frozen.csv worldsim_hermes_7b,HFv1 
HellaSwag,83.45,,hf_open_llm_v1_240829_frozen.csv worldsim_hermes_7b,HFv1 MMLU,63.12,,hf_open_llm_v1_240829_frozen.csv worldsim_hermes_7b,HFv1 TruthfulQA,51.52,,hf_open_llm_v1_240829_frozen.csv worldsim_hermes_7b,HFv1 Winogrande,78.77,,hf_open_llm_v1_240829_frozen.csv xenon_1,HF OpenLLM v1,59.21,,hf_open_llm_v1_240829_frozen.csv xenon_1,HFv1 ARC,55.29,,hf_open_llm_v1_240829_frozen.csv xenon_1,HFv1 GSM8K,21.83,,hf_open_llm_v1_240829_frozen.csv xenon_1,HFv1 HellaSwag,81.56,,hf_open_llm_v1_240829_frozen.csv xenon_1,HFv1 MMLU,61.22,,hf_open_llm_v1_240829_frozen.csv xenon_1,HFv1 TruthfulQA,56.68,,hf_open_llm_v1_240829_frozen.csv xenon_1,HFv1 Winogrande,78.69,,hf_open_llm_v1_240829_frozen.csv xenon_2,HF OpenLLM v1,59.93,,hf_open_llm_v1_240829_frozen.csv xenon_2,HFv1 ARC,57.51,,hf_open_llm_v1_240829_frozen.csv xenon_2,HFv1 GSM8K,19.41,,hf_open_llm_v1_240829_frozen.csv xenon_2,HFv1 HellaSwag,83.28,,hf_open_llm_v1_240829_frozen.csv xenon_2,HFv1 MMLU,60.25,,hf_open_llm_v1_240829_frozen.csv xenon_2,HFv1 TruthfulQA,60.92,,hf_open_llm_v1_240829_frozen.csv xenon_2,HFv1 Winogrande,78.22,,hf_open_llm_v1_240829_frozen.csv xenon_3,HF OpenLLM v1,60.27,,hf_open_llm_v1_240829_frozen.csv xenon_3,HFv1 ARC,58.87,,hf_open_llm_v1_240829_frozen.csv xenon_3,HFv1 GSM8K,20.09,,hf_open_llm_v1_240829_frozen.csv xenon_3,HFv1 HellaSwag,83.39,,hf_open_llm_v1_240829_frozen.csv xenon_3,HFv1 MMLU,59.79,,hf_open_llm_v1_240829_frozen.csv xenon_3,HFv1 TruthfulQA,61.99,,hf_open_llm_v1_240829_frozen.csv xenon_3,HFv1 Winogrande,77.51,,hf_open_llm_v1_240829_frozen.csv xenon_4,HF OpenLLM v1,60.39,,hf_open_llm_v1_240829_frozen.csv xenon_4,HFv1 ARC,60.15,,hf_open_llm_v1_240829_frozen.csv xenon_4,HFv1 GSM8K,20.7,,hf_open_llm_v1_240829_frozen.csv xenon_4,HFv1 HellaSwag,83.07,,hf_open_llm_v1_240829_frozen.csv xenon_4,HFv1 MMLU,60.08,,hf_open_llm_v1_240829_frozen.csv xenon_4,HFv1 TruthfulQA,61.31,,hf_open_llm_v1_240829_frozen.csv xenon_4,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv xglm_4_5b,HF OpenLLM v1,34.31,,hf_open_llm_v1_240829_frozen.csv xglm_4_5b,HFv1 ARC,31.48,,hf_open_llm_v1_240829_frozen.csv xglm_4_5b,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv xglm_4_5b,HFv1 HellaSwag,57.95,,hf_open_llm_v1_240829_frozen.csv xglm_4_5b,HFv1 MMLU,25.43,,hf_open_llm_v1_240829_frozen.csv xglm_4_5b,HFv1 TruthfulQA,35.84,,hf_open_llm_v1_240829_frozen.csv xglm_4_5b,HFv1 Winogrande,54.93,,hf_open_llm_v1_240829_frozen.csv xglm_564m,HF OpenLLM v1,29.55,,hf_open_llm_v1_240829_frozen.csv xglm_564m,HFv1 ARC,24.57,,hf_open_llm_v1_240829_frozen.csv xglm_564m,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv xglm_564m,HFv1 HellaSwag,34.64,,hf_open_llm_v1_240829_frozen.csv xglm_564m,HFv1 MMLU,25.18,,hf_open_llm_v1_240829_frozen.csv xglm_564m,HFv1 TruthfulQA,40.43,,hf_open_llm_v1_240829_frozen.csv xglm_564m,HFv1 Winogrande,52.25,,hf_open_llm_v1_240829_frozen.csv xglm_7_5b,HF OpenLLM v1,36.38,,hf_open_llm_v1_240829_frozen.csv xglm_7_5b,HFv1 ARC,34.13,,hf_open_llm_v1_240829_frozen.csv xglm_7_5b,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv xglm_7_5b,HFv1 HellaSwag,60.77,,hf_open_llm_v1_240829_frozen.csv xglm_7_5b,HFv1 MMLU,27.79,,hf_open_llm_v1_240829_frozen.csv xglm_7_5b,HFv1 TruthfulQA,36.66,,hf_open_llm_v1_240829_frozen.csv xglm_7_5b,HFv1 Winogrande,58.72,,hf_open_llm_v1_240829_frozen.csv yarn_mistral_7b_128k_dpo,HF OpenLLM v1,60.15,,hf_open_llm_v1_240829_frozen.csv yarn_mistral_7b_128k_dpo,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv yarn_mistral_7b_128k_dpo,HFv1 GSM8K,32.15,,hf_open_llm_v1_240829_frozen.csv yarn_mistral_7b_128k_dpo,HFv1 
HellaSwag,82.99,,hf_open_llm_v1_240829_frozen.csv yarn_mistral_7b_128k_dpo,HFv1 MMLU,63.09,,hf_open_llm_v1_240829_frozen.csv yarn_mistral_7b_128k_dpo,HFv1 TruthfulQA,43.55,,hf_open_llm_v1_240829_frozen.csv yarn_mistral_7b_128k_dpo,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv yayi2_30b_llama,HF OpenLLM v1,48.46,,hf_open_llm_v1_240829_frozen.csv yayi2_30b_llama,HFv1 ARC,35.67,,hf_open_llm_v1_240829_frozen.csv yayi2_30b_llama,HFv1 GSM8K,18.88,,hf_open_llm_v1_240829_frozen.csv yayi2_30b_llama,HFv1 HellaSwag,53.37,,hf_open_llm_v1_240829_frozen.csv yayi2_30b_llama,HFv1 MMLU,70.6,,hf_open_llm_v1_240829_frozen.csv yayi2_30b_llama,HFv1 TruthfulQA,49.08,,hf_open_llm_v1_240829_frozen.csv yayi2_30b_llama,HFv1 Winogrande,63.14,,hf_open_llm_v1_240829_frozen.csv yehoon_llama2,HF OpenLLM v1,52.71,,hf_open_llm_v1_240829_frozen.csv yehoon_llama2,HFv1 ARC,54.78,,hf_open_llm_v1_240829_frozen.csv yehoon_llama2,HFv1 GSM8K,7.28,,hf_open_llm_v1_240829_frozen.csv yehoon_llama2,HFv1 HellaSwag,78.98,,hf_open_llm_v1_240829_frozen.csv yehoon_llama2,HFv1 MMLU,51.29,,hf_open_llm_v1_240829_frozen.csv yehoon_llama2,HFv1 TruthfulQA,49.17,,hf_open_llm_v1_240829_frozen.csv yehoon_llama2,HFv1 Winogrande,74.74,,hf_open_llm_v1_240829_frozen.csv yi6,HF OpenLLM v1,45.82,,hf_open_llm_v1_240829_frozen.csv yi6,HFv1 ARC,47.78,,hf_open_llm_v1_240829_frozen.csv yi6,HFv1 GSM8K,4.4,,hf_open_llm_v1_240829_frozen.csv yi6,HFv1 HellaSwag,68.25,,hf_open_llm_v1_240829_frozen.csv yi6,HFv1 MMLU,54.05,,hf_open_llm_v1_240829_frozen.csv yi6,HFv1 TruthfulQA,35.8,,hf_open_llm_v1_240829_frozen.csv yi6,HFv1 Winogrande,64.64,,hf_open_llm_v1_240829_frozen.csv yi_1_5_34b_chat_16k,HF OpenLLM v1,73.57,,hf_open_llm_v1_240829_frozen.csv yi_1_5_34b_chat_16k,HFv1 ARC,68.09,,hf_open_llm_v1_240829_frozen.csv yi_1_5_34b_chat_16k,HFv1 GSM8K,67.93,,hf_open_llm_v1_240829_frozen.csv yi_1_5_34b_chat_16k,HFv1 HellaSwag,86.52,,hf_open_llm_v1_240829_frozen.csv yi_1_5_34b_chat_16k,HFv1 MMLU,78.0,,hf_open_llm_v1_240829_frozen.csv yi_1_5_34b_chat_16k,HFv1 TruthfulQA,57.61,,hf_open_llm_v1_240829_frozen.csv yi_1_5_34b_chat_16k,HFv1 Winogrande,83.27,,hf_open_llm_v1_240829_frozen.csv yi_1_5_6b,HF OpenLLM v1,61.6,,hf_open_llm_v1_240829_frozen.csv yi_1_5_6b,HFv1 ARC,57.25,,hf_open_llm_v1_240829_frozen.csv yi_1_5_6b,HFv1 GSM8K,49.81,,hf_open_llm_v1_240829_frozen.csv yi_1_5_6b,HFv1 HellaSwag,77.96,,hf_open_llm_v1_240829_frozen.csv yi_1_5_6b,HFv1 MMLU,65.0,,hf_open_llm_v1_240829_frozen.csv yi_1_5_6b,HFv1 TruthfulQA,44.04,,hf_open_llm_v1_240829_frozen.csv yi_1_5_6b,HFv1 Winogrande,75.53,,hf_open_llm_v1_240829_frozen.csv yi_1_5_6b_chat,HF OpenLLM v1,66.17,,hf_open_llm_v1_240829_frozen.csv yi_1_5_6b_chat,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv yi_1_5_6b_chat,HFv1 GSM8K,67.1,,hf_open_llm_v1_240829_frozen.csv yi_1_5_6b_chat,HFv1 HellaSwag,78.87,,hf_open_llm_v1_240829_frozen.csv yi_1_5_6b_chat,HFv1 MMLU,64.24,,hf_open_llm_v1_240829_frozen.csv yi_1_5_6b_chat,HFv1 TruthfulQA,52.57,,hf_open_llm_v1_240829_frozen.csv yi_1_5_6b_chat,HFv1 Winogrande,73.56,,hf_open_llm_v1_240829_frozen.csv yi_32b_x2_v2_0,HF OpenLLM v1,76.17,,hf_open_llm_v1_240829_frozen.csv yi_32b_x2_v2_0,HFv1 ARC,73.04,,hf_open_llm_v1_240829_frozen.csv yi_32b_x2_v2_0,HFv1 GSM8K,65.2,,hf_open_llm_v1_240829_frozen.csv yi_32b_x2_v2_0,HFv1 HellaSwag,85.95,,hf_open_llm_v1_240829_frozen.csv yi_32b_x2_v2_0,HFv1 MMLU,76.79,,hf_open_llm_v1_240829_frozen.csv yi_32b_x2_v2_0,HFv1 TruthfulQA,73.22,,hf_open_llm_v1_240829_frozen.csv yi_32b_x2_v2_0,HFv1 Winogrande,82.79,,hf_open_llm_v1_240829_frozen.csv 
yi_34b_200k_aezakmi_raw_2301,HF OpenLLM v1,70.12,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_raw_2301,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_raw_2301,HFv1 GSM8K,57.09,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_raw_2301,HFv1 HellaSwag,84.7,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_raw_2301,HFv1 MMLU,74.89,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_raw_2301,HFv1 TruthfulQA,56.89,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_raw_2301,HFv1 Winogrande,81.14,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_raw_2901,HF OpenLLM v1,69.59,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_raw_2901,HFv1 ARC,64.93,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_raw_2901,HFv1 GSM8K,59.51,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_raw_2901,HFv1 HellaSwag,84.98,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_raw_2901,HFv1 MMLU,73.7,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_raw_2901,HFv1 TruthfulQA,55.09,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_raw_2901,HFv1 Winogrande,79.32,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_v2,HF OpenLLM v1,71.0,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_v2,HFv1 ARC,67.92,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_v2,HFv1 GSM8K,58.91,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_v2,HFv1 HellaSwag,85.61,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_v2,HFv1 MMLU,75.22,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_v2,HFv1 TruthfulQA,56.74,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_v2,HFv1 Winogrande,81.61,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_xlctx_v3,HF OpenLLM v1,64.39,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_xlctx_v3,HFv1 ARC,64.85,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_xlctx_v3,HFv1 GSM8K,44.05,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_xlctx_v3,HFv1 HellaSwag,84.76,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_xlctx_v3,HFv1 MMLU,74.48,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_xlctx_v3,HFv1 TruthfulQA,37.14,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_aezakmi_xlctx_v3,HFv1 Winogrande,81.06,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_dare_merge_v5,HF OpenLLM v1,71.98,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_dare_merge_v5,HFv1 ARC,66.47,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_dare_merge_v5,HFv1 GSM8K,62.93,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_dare_merge_v5,HFv1 HellaSwag,85.54,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_dare_merge_v5,HFv1 MMLU,77.22,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_dare_merge_v5,HFv1 TruthfulQA,57.46,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_dare_merge_v5,HFv1 Winogrande,82.24,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_rawrr_dpo_1,HF OpenLLM v1,70.97,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_rawrr_dpo_1,HFv1 ARC,65.44,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_rawrr_dpo_1,HFv1 GSM8K,61.79,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_rawrr_dpo_1,HFv1 HellaSwag,85.69,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_rawrr_dpo_1,HFv1 MMLU,76.09,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_rawrr_dpo_1,HFv1 TruthfulQA,54.0,,hf_open_llm_v1_240829_frozen.csv yi_34b_200k_rawrr_dpo_1,HFv1 Winogrande,82.79,,hf_open_llm_v1_240829_frozen.csv yi_34b_aezakmi_v1,HF OpenLLM v1,68.67,,hf_open_llm_v1_240829_frozen.csv yi_34b_aezakmi_v1,HFv1 ARC,64.33,,hf_open_llm_v1_240829_frozen.csv yi_34b_aezakmi_v1,HFv1 GSM8K,52.92,,hf_open_llm_v1_240829_frozen.csv 
yi_34b_aezakmi_v1,HFv1 HellaSwag,84.31,,hf_open_llm_v1_240829_frozen.csv yi_34b_aezakmi_v1,HFv1 MMLU,73.91,,hf_open_llm_v1_240829_frozen.csv yi_34b_aezakmi_v1,HFv1 TruthfulQA,55.73,,hf_open_llm_v1_240829_frozen.csv yi_34b_aezakmi_v1,HFv1 Winogrande,80.82,,hf_open_llm_v1_240829_frozen.csv yi_34b_chat,HF OpenLLM v1,65.32,,hf_open_llm_v1_240829_frozen.csv yi_34b_chat,HFv1 ARC,65.44,,hf_open_llm_v1_240829_frozen.csv yi_34b_chat,HFv1 GSM8K,31.92,,hf_open_llm_v1_240829_frozen.csv yi_34b_chat,HFv1 HellaSwag,84.16,,hf_open_llm_v1_240829_frozen.csv yi_34b_chat,HFv1 MMLU,74.9,,hf_open_llm_v1_240829_frozen.csv yi_34b_chat,HFv1 TruthfulQA,55.41,,hf_open_llm_v1_240829_frozen.csv yi_34b_chat,HFv1 Winogrande,80.11,,hf_open_llm_v1_240829_frozen.csv yi_34b_llama,HF OpenLLM v1,70.95,,hf_open_llm_v1_240829_frozen.csv yi_34b_llama,HFv1 ARC,64.59,,hf_open_llm_v1_240829_frozen.csv yi_34b_llama,HFv1 GSM8K,60.8,,hf_open_llm_v1_240829_frozen.csv yi_34b_llama,HFv1 HellaSwag,85.63,,hf_open_llm_v1_240829_frozen.csv yi_34b_llama,HFv1 MMLU,76.31,,hf_open_llm_v1_240829_frozen.csv yi_34b_llama,HFv1 TruthfulQA,55.6,,hf_open_llm_v1_240829_frozen.csv yi_34b_llama,HFv1 Winogrande,82.79,,hf_open_llm_v1_240829_frozen.csv yi_34b_v2,HF OpenLLM v1,72.12,,hf_open_llm_v1_240829_frozen.csv yi_34b_v2,HFv1 ARC,66.13,,hf_open_llm_v1_240829_frozen.csv yi_34b_v2,HFv1 GSM8K,64.97,,hf_open_llm_v1_240829_frozen.csv yi_34b_v2,HFv1 HellaSwag,85.0,,hf_open_llm_v1_240829_frozen.csv yi_34b_v2,HFv1 MMLU,75.64,,hf_open_llm_v1_240829_frozen.csv yi_34b_v2,HFv1 TruthfulQA,57.34,,hf_open_llm_v1_240829_frozen.csv yi_34b_v2,HFv1 Winogrande,83.66,,hf_open_llm_v1_240829_frozen.csv yi_34b_v3,HF OpenLLM v1,72.26,,hf_open_llm_v1_240829_frozen.csv yi_34b_v3,HFv1 ARC,67.06,,hf_open_llm_v1_240829_frozen.csv yi_34b_v3,HFv1 GSM8K,64.52,,hf_open_llm_v1_240829_frozen.csv yi_34b_v3,HFv1 HellaSwag,85.11,,hf_open_llm_v1_240829_frozen.csv yi_34b_v3,HFv1 MMLU,75.8,,hf_open_llm_v1_240829_frozen.csv yi_34b_v3,HFv1 TruthfulQA,57.54,,hf_open_llm_v1_240829_frozen.csv yi_34b_v3,HFv1 Winogrande,83.5,,hf_open_llm_v1_240829_frozen.csv yi_6b,HF OpenLLM v1,54.08,,hf_open_llm_v1_240829_frozen.csv yi_6b,HFv1 ARC,55.55,,hf_open_llm_v1_240829_frozen.csv yi_6b,HFv1 GSM8K,12.66,,hf_open_llm_v1_240829_frozen.csv yi_6b,HFv1 HellaSwag,76.57,,hf_open_llm_v1_240829_frozen.csv yi_6b,HFv1 MMLU,64.11,,hf_open_llm_v1_240829_frozen.csv yi_6b,HFv1 TruthfulQA,41.96,,hf_open_llm_v1_240829_frozen.csv yi_6b,HFv1 Winogrande,74.19,,hf_open_llm_v1_240829_frozen.csv yi_6b_200k,HF OpenLLM v1,56.69,,hf_open_llm_v1_240829_frozen.csv yi_6b_200k,HFv1 ARC,53.58,,hf_open_llm_v1_240829_frozen.csv yi_6b_200k,HFv1 GSM8K,30.33,,hf_open_llm_v1_240829_frozen.csv yi_6b_200k,HFv1 HellaSwag,75.58,,hf_open_llm_v1_240829_frozen.csv yi_6b_200k,HFv1 MMLU,64.65,,hf_open_llm_v1_240829_frozen.csv yi_6b_200k,HFv1 TruthfulQA,41.74,,hf_open_llm_v1_240829_frozen.csv yi_6b_200k,HFv1 Winogrande,74.27,,hf_open_llm_v1_240829_frozen.csv yi_6b_200k_dpo,HF OpenLLM v1,51.93,,hf_open_llm_v1_240829_frozen.csv yi_6b_200k_dpo,HFv1 ARC,43.09,,hf_open_llm_v1_240829_frozen.csv yi_6b_200k_dpo,HFv1 GSM8K,11.37,,hf_open_llm_v1_240829_frozen.csv yi_6b_200k_dpo,HFv1 HellaSwag,74.53,,hf_open_llm_v1_240829_frozen.csv yi_6b_200k_dpo,HFv1 MMLU,64.0,,hf_open_llm_v1_240829_frozen.csv yi_6b_200k_dpo,HFv1 TruthfulQA,45.51,,hf_open_llm_v1_240829_frozen.csv yi_6b_200k_dpo,HFv1 Winogrande,73.09,,hf_open_llm_v1_240829_frozen.csv yi_7b_dpo,HF OpenLLM v1,51.93,,hf_open_llm_v1_240829_frozen.csv yi_7b_dpo,HFv1 ARC,43.09,,hf_open_llm_v1_240829_frozen.csv 
yi_7b_dpo,HFv1 GSM8K,11.37,,hf_open_llm_v1_240829_frozen.csv yi_7b_dpo,HFv1 HellaSwag,74.53,,hf_open_llm_v1_240829_frozen.csv yi_7b_dpo,HFv1 MMLU,64.0,,hf_open_llm_v1_240829_frozen.csv yi_7b_dpo,HFv1 TruthfulQA,45.51,,hf_open_llm_v1_240829_frozen.csv yi_7b_dpo,HFv1 Winogrande,73.09,,hf_open_llm_v1_240829_frozen.csv yi_9b_forest_dpo_v1_0,HF OpenLLM v1,64.11,,hf_open_llm_v1_240829_frozen.csv yi_9b_forest_dpo_v1_0,HFv1 ARC,59.81,,hf_open_llm_v1_240829_frozen.csv yi_9b_forest_dpo_v1_0,HFv1 GSM8K,48.37,,hf_open_llm_v1_240829_frozen.csv yi_9b_forest_dpo_v1_0,HFv1 HellaSwag,78.6,,hf_open_llm_v1_240829_frozen.csv yi_9b_forest_dpo_v1_0,HFv1 MMLU,70.02,,hf_open_llm_v1_240829_frozen.csv yi_9b_forest_dpo_v1_0,HFv1 TruthfulQA,50.98,,hf_open_llm_v1_240829_frozen.csv yi_9b_forest_dpo_v1_0,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv yi_bagel_2x34b_moe,HF OpenLLM v1,74.93,,hf_open_llm_v1_240829_frozen.csv yi_bagel_2x34b_moe,HFv1 ARC,72.7,,hf_open_llm_v1_240829_frozen.csv yi_bagel_2x34b_moe,HFv1 GSM8K,60.73,,hf_open_llm_v1_240829_frozen.csv yi_bagel_2x34b_moe,HFv1 HellaSwag,85.44,,hf_open_llm_v1_240829_frozen.csv yi_bagel_2x34b_moe,HFv1 MMLU,76.6,,hf_open_llm_v1_240829_frozen.csv yi_bagel_2x34b_moe,HFv1 TruthfulQA,71.42,,hf_open_llm_v1_240829_frozen.csv yi_bagel_2x34b_moe,HFv1 Winogrande,82.72,,hf_open_llm_v1_240829_frozen.csv youri_7b_chat,HF OpenLLM v1,48.51,,hf_open_llm_v1_240829_frozen.csv youri_7b_chat,HFv1 ARC,51.19,,hf_open_llm_v1_240829_frozen.csv youri_7b_chat,HFv1 GSM8K,1.52,,hf_open_llm_v1_240829_frozen.csv youri_7b_chat,HFv1 HellaSwag,76.09,,hf_open_llm_v1_240829_frozen.csv youri_7b_chat,HFv1 MMLU,46.06,,hf_open_llm_v1_240829_frozen.csv youri_7b_chat,HFv1 TruthfulQA,41.17,,hf_open_llm_v1_240829_frozen.csv youri_7b_chat,HFv1 Winogrande,75.06,,hf_open_llm_v1_240829_frozen.csv yousei_22b,HF OpenLLM v1,51.56,,hf_open_llm_v1_240829_frozen.csv yousei_22b,HFv1 ARC,55.89,,hf_open_llm_v1_240829_frozen.csv yousei_22b,HFv1 GSM8K,0.45,,hf_open_llm_v1_240829_frozen.csv yousei_22b,HFv1 HellaSwag,78.55,,hf_open_llm_v1_240829_frozen.csv yousei_22b,HFv1 MMLU,52.31,,hf_open_llm_v1_240829_frozen.csv yousei_22b,HFv1 TruthfulQA,50.68,,hf_open_llm_v1_240829_frozen.csv yousei_22b,HFv1 Winogrande,71.51,,hf_open_llm_v1_240829_frozen.csv ypotryll_22b_epoch2_qlora,HF OpenLLM v1,52.75,,hf_open_llm_v1_240829_frozen.csv ypotryll_22b_epoch2_qlora,HFv1 ARC,59.22,,hf_open_llm_v1_240829_frozen.csv ypotryll_22b_epoch2_qlora,HFv1 GSM8K,5.38,,hf_open_llm_v1_240829_frozen.csv ypotryll_22b_epoch2_qlora,HFv1 HellaSwag,80.66,,hf_open_llm_v1_240829_frozen.csv ypotryll_22b_epoch2_qlora,HFv1 MMLU,54.52,,hf_open_llm_v1_240829_frozen.csv ypotryll_22b_epoch2_qlora,HFv1 TruthfulQA,40.42,,hf_open_llm_v1_240829_frozen.csv ypotryll_22b_epoch2_qlora,HFv1 Winogrande,76.32,,hf_open_llm_v1_240829_frozen.csv yugogpt,HF OpenLLM v1,57.35,,hf_open_llm_v1_240829_frozen.csv yugogpt,HFv1 ARC,58.11,,hf_open_llm_v1_240829_frozen.csv yugogpt,HFv1 GSM8K,30.71,,hf_open_llm_v1_240829_frozen.csv yugogpt,HFv1 HellaSwag,81.45,,hf_open_llm_v1_240829_frozen.csv yugogpt,HFv1 MMLU,60.68,,hf_open_llm_v1_240829_frozen.csv yugogpt,HFv1 TruthfulQA,36.6,,hf_open_llm_v1_240829_frozen.csv yugogpt,HFv1 Winogrande,76.56,,hf_open_llm_v1_240829_frozen.csv yulan_chat_2_13b_fp16,HF OpenLLM v1,57.01,,hf_open_llm_v1_240829_frozen.csv yulan_chat_2_13b_fp16,HFv1 ARC,59.04,,hf_open_llm_v1_240829_frozen.csv yulan_chat_2_13b_fp16,HFv1 GSM8K,13.8,,hf_open_llm_v1_240829_frozen.csv yulan_chat_2_13b_fp16,HFv1 HellaSwag,80.66,,hf_open_llm_v1_240829_frozen.csv 
yulan_chat_2_13b_fp16,HFv1 MMLU,56.72,,hf_open_llm_v1_240829_frozen.csv yulan_chat_2_13b_fp16,HFv1 TruthfulQA,52.18,,hf_open_llm_v1_240829_frozen.csv yulan_chat_2_13b_fp16,HFv1 Winogrande,79.64,,hf_open_llm_v1_240829_frozen.csv zephyr_0_1,HF OpenLLM v1,58.26,,hf_open_llm_v1_240829_frozen.csv zephyr_0_1,HFv1 ARC,59.47,,hf_open_llm_v1_240829_frozen.csv zephyr_0_1,HFv1 GSM8K,28.96,,hf_open_llm_v1_240829_frozen.csv zephyr_0_1,HFv1 HellaSwag,81.59,,hf_open_llm_v1_240829_frozen.csv zephyr_0_1,HFv1 MMLU,60.25,,hf_open_llm_v1_240829_frozen.csv zephyr_0_1,HFv1 TruthfulQA,42.87,,hf_open_llm_v1_240829_frozen.csv zephyr_0_1,HFv1 Winogrande,76.4,,hf_open_llm_v1_240829_frozen.csv zephyr_0_2,HF OpenLLM v1,58.94,,hf_open_llm_v1_240829_frozen.csv zephyr_0_2,HFv1 ARC,61.09,,hf_open_llm_v1_240829_frozen.csv zephyr_0_2,HFv1 GSM8K,28.35,,hf_open_llm_v1_240829_frozen.csv zephyr_0_2,HFv1 HellaSwag,82.53,,hf_open_llm_v1_240829_frozen.csv zephyr_0_2,HFv1 MMLU,60.37,,hf_open_llm_v1_240829_frozen.csv zephyr_0_2,HFv1 TruthfulQA,44.34,,hf_open_llm_v1_240829_frozen.csv zephyr_0_2,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv zephyr_0_2_a2_5,HF OpenLLM v1,56.31,,hf_open_llm_v1_240829_frozen.csv zephyr_0_2_a2_5,HFv1 ARC,61.77,,hf_open_llm_v1_240829_frozen.csv zephyr_0_2_a2_5,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv zephyr_0_2_a2_5,HFv1 HellaSwag,83.85,,hf_open_llm_v1_240829_frozen.csv zephyr_0_2_a2_5,HFv1 MMLU,58.33,,hf_open_llm_v1_240829_frozen.csv zephyr_0_2_a2_5,HFv1 TruthfulQA,60.19,,hf_open_llm_v1_240829_frozen.csv zephyr_0_2_a2_5,HFv1 Winogrande,73.4,,hf_open_llm_v1_240829_frozen.csv zephyr_1b_olmo_sft_qlora,HF OpenLLM v1,37.47,,hf_open_llm_v1_240829_frozen.csv zephyr_1b_olmo_sft_qlora,HFv1 ARC,36.26,,hf_open_llm_v1_240829_frozen.csv zephyr_1b_olmo_sft_qlora,HFv1 GSM8K,2.58,,hf_open_llm_v1_240829_frozen.csv zephyr_1b_olmo_sft_qlora,HFv1 HellaSwag,63.48,,hf_open_llm_v1_240829_frozen.csv zephyr_1b_olmo_sft_qlora,HFv1 MMLU,27.28,,hf_open_llm_v1_240829_frozen.csv zephyr_1b_olmo_sft_qlora,HFv1 TruthfulQA,35.05,,hf_open_llm_v1_240829_frozen.csv zephyr_1b_olmo_sft_qlora,HFv1 Winogrande,60.14,,hf_open_llm_v1_240829_frozen.csv zephyr_220m_dpo_full,HF OpenLLM v1,29.33,,hf_open_llm_v1_240829_frozen.csv zephyr_220m_dpo_full,HFv1 ARC,25.43,,hf_open_llm_v1_240829_frozen.csv zephyr_220m_dpo_full,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv zephyr_220m_dpo_full,HFv1 HellaSwag,29.15,,hf_open_llm_v1_240829_frozen.csv zephyr_220m_dpo_full,HFv1 MMLU,26.43,,hf_open_llm_v1_240829_frozen.csv zephyr_220m_dpo_full,HFv1 TruthfulQA,43.44,,hf_open_llm_v1_240829_frozen.csv zephyr_220m_dpo_full,HFv1 Winogrande,50.99,,hf_open_llm_v1_240829_frozen.csv zephyr_220m_sft_full,HF OpenLLM v1,29.33,,hf_open_llm_v1_240829_frozen.csv zephyr_220m_sft_full,HFv1 ARC,25.26,,hf_open_llm_v1_240829_frozen.csv zephyr_220m_sft_full,HFv1 GSM8K,0.38,,hf_open_llm_v1_240829_frozen.csv zephyr_220m_sft_full,HFv1 HellaSwag,29.03,,hf_open_llm_v1_240829_frozen.csv zephyr_220m_sft_full,HFv1 MMLU,26.45,,hf_open_llm_v1_240829_frozen.csv zephyr_220m_sft_full,HFv1 TruthfulQA,43.23,,hf_open_llm_v1_240829_frozen.csv zephyr_220m_sft_full,HFv1 Winogrande,51.62,,hf_open_llm_v1_240829_frozen.csv zephyr_2b_gemma_sft_qlora,HF OpenLLM v1,47.26,,hf_open_llm_v1_240829_frozen.csv zephyr_2b_gemma_sft_qlora,HFv1 ARC,49.15,,hf_open_llm_v1_240829_frozen.csv zephyr_2b_gemma_sft_qlora,HFv1 GSM8K,18.2,,hf_open_llm_v1_240829_frozen.csv zephyr_2b_gemma_sft_qlora,HFv1 HellaSwag,71.94,,hf_open_llm_v1_240829_frozen.csv zephyr_2b_gemma_sft_qlora,HFv1 
MMLU,41.88,,hf_open_llm_v1_240829_frozen.csv zephyr_2b_gemma_sft_qlora,HFv1 TruthfulQA,35.77,,hf_open_llm_v1_240829_frozen.csv zephyr_2b_gemma_sft_qlora,HFv1 Winogrande,66.61,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_alpha_expo,HF OpenLLM v1,62.15,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_alpha_expo,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_alpha_expo,HFv1 GSM8K,28.28,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_alpha_expo,HFv1 HellaSwag,84.25,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_alpha_expo,HFv1 MMLU,60.6,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_alpha_expo,HFv1 TruthfulQA,60.89,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_alpha_expo,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_beta_expo,HF OpenLLM v1,61.84,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_beta_expo,HFv1 ARC,62.29,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_beta_expo,HFv1 GSM8K,27.29,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_beta_expo,HFv1 HellaSwag,84.5,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_beta_expo,HFv1 MMLU,60.97,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_beta_expo,HFv1 TruthfulQA,58.34,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_beta_expo,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_full,HF OpenLLM v1,58.25,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_full,HFv1 ARC,62.88,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_full,HFv1 GSM8K,18.57,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_full,HFv1 HellaSwag,84.45,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_full,HFv1 MMLU,59.56,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_full,HFv1 TruthfulQA,47.41,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_full,HFv1 Winogrande,76.64,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_full_beta_0_2,HF OpenLLM v1,61.55,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_full_beta_0_2,HFv1 ARC,61.86,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_full_beta_0_2,HFv1 GSM8K,30.02,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_full_beta_0_2,HFv1 HellaSwag,84.04,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_full_beta_0_2,HFv1 MMLU,61.85,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_full_beta_0_2,HFv1 TruthfulQA,54.78,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_full_beta_0_2,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_full_expo,HF OpenLLM v1,57.73,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_full_expo,HFv1 ARC,62.29,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_full_expo,HFv1 GSM8K,13.95,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_full_expo,HFv1 HellaSwag,84.9,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_full_expo,HFv1 MMLU,59.01,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_full_expo,HFv1 TruthfulQA,50.08,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_full_expo,HFv1 Winogrande,76.16,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_qlora,HF OpenLLM v1,63.51,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_qlora,HFv1 ARC,63.82,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_qlora,HFv1 GSM8K,42.08,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_qlora,HFv1 HellaSwag,85.35,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_qlora,HFv1 MMLU,63.82,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_qlora,HFv1 TruthfulQA,47.14,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_qlora,HFv1 Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_qlora_no_sft,HF OpenLLM v1,62.67,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_qlora_no_sft,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_qlora_no_sft,HFv1 
GSM8K,41.62,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_qlora_no_sft,HFv1 HellaSwag,84.5,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_qlora_no_sft,HFv1 MMLU,64.02,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_qlora_no_sft,HFv1 TruthfulQA,44.25,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_qlora_no_sft,HFv1 Winogrande,79.16,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_qlora_v1,HF OpenLLM v1,64.43,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_qlora_v1,HFv1 ARC,67.83,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_qlora_v1,HFv1 GSM8K,34.42,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_qlora_v1,HFv1 HellaSwag,86.64,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_qlora_v1,HFv1 MMLU,63.55,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_qlora_v1,HFv1 TruthfulQA,53.8,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_dpo_qlora_v1,HFv1 Winogrande,80.35,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_update3_i0,HF OpenLLM v1,63.31,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_update3_i0,HFv1 ARC,65.19,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_update3_i0,HFv1 GSM8K,35.1,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_update3_i0,HFv1 HellaSwag,85.37,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_update3_i0,HFv1 MMLU,62.5,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_update3_i0,HFv1 TruthfulQA,51.85,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_update3_i0,HFv1 Winogrande,79.87,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_update4_i0,HF OpenLLM v1,63.17,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_update4_i0,HFv1 ARC,63.82,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_update4_i0,HFv1 GSM8K,40.71,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_update4_i0,HFv1 HellaSwag,84.21,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_update4_i0,HFv1 MMLU,63.33,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_update4_i0,HFv1 TruthfulQA,47.18,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_update4_i0,HFv1 Winogrande,79.79,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_v5_i1,HF OpenLLM v1,62.68,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_v5_i1,HFv1 ARC,65.44,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_v5_i1,HFv1 GSM8K,31.61,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_v5_i1,HFv1 HellaSwag,85.52,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_v5_i1,HFv1 MMLU,62.14,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_v5_i1,HFv1 TruthfulQA,51.82,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_v5_i1,HFv1 Winogrande,79.56,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_v6_i1,HF OpenLLM v1,63.66,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_v6_i1,HFv1 ARC,65.61,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_v6_i1,HFv1 GSM8K,33.06,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_v6_i1,HFv1 HellaSwag,85.83,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_v6_i1,HFv1 MMLU,62.96,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_v6_i1,HFv1 TruthfulQA,56.14,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_gpo_v6_i1,HFv1 Winogrande,79.64,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_ipo_qlora_v0,HF OpenLLM v1,62.67,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_ipo_qlora_v0,HFv1 ARC,63.14,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_ipo_qlora_v0,HFv1 GSM8K,40.03,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_ipo_qlora_v0,HFv1 HellaSwag,84.37,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_ipo_qlora_v0,HFv1 MMLU,63.54,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_ipo_qlora_v0,HFv1 TruthfulQA,45.35,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_ipo_qlora_v0,HFv1 Winogrande,79.56,,hf_open_llm_v1_240829_frozen.csv 
zephyr_7b_lgpo_v1_i1,HF OpenLLM v1,62.54,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_lgpo_v1_i1,HFv1 ARC,65.96,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_lgpo_v1_i1,HFv1 GSM8K,31.61,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_lgpo_v1_i1,HFv1 HellaSwag,85.2,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_lgpo_v1_i1,HFv1 MMLU,61.88,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_lgpo_v1_i1,HFv1 TruthfulQA,51.1,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_lgpo_v1_i1,HFv1 Winogrande,79.48,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_norobots,HF OpenLLM v1,55.16,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_norobots,HFv1 ARC,56.48,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_norobots,HFv1 GSM8K,20.62,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_norobots,HFv1 HellaSwag,79.64,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_norobots,HFv1 MMLU,55.52,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_norobots,HFv1 TruthfulQA,44.6,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_norobots,HFv1 Winogrande,74.11,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_sft_full_spin_iter3,HF OpenLLM v1,63.7,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_sft_full_spin_iter3,HFv1 ARC,66.13,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_sft_full_spin_iter3,HFv1 GSM8K,34.19,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_sft_full_spin_iter3,HFv1 HellaSwag,85.85,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_sft_full_spin_iter3,HFv1 MMLU,61.51,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_sft_full_spin_iter3,HFv1 TruthfulQA,57.89,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_sft_full_spin_iter3,HFv1 Winogrande,76.64,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_truthy,HF OpenLLM v1,61.93,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_truthy,HFv1 ARC,60.75,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_truthy,HFv1 GSM8K,25.47,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_truthy,HFv1 HellaSwag,84.64,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_truthy,HFv1 MMLU,59.53,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_truthy,HFv1 TruthfulQA,63.31,,hf_open_llm_v1_240829_frozen.csv zephyr_7b_truthy,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv zephyr_alpha_nebula_v2_7b,HF OpenLLM v1,59.01,,hf_open_llm_v1_240829_frozen.csv zephyr_alpha_nebula_v2_7b,HFv1 ARC,58.62,,hf_open_llm_v1_240829_frozen.csv zephyr_alpha_nebula_v2_7b,HFv1 GSM8K,23.88,,hf_open_llm_v1_240829_frozen.csv zephyr_alpha_nebula_v2_7b,HFv1 HellaSwag,83.05,,hf_open_llm_v1_240829_frozen.csv zephyr_alpha_nebula_v2_7b,HFv1 MMLU,56.68,,hf_open_llm_v1_240829_frozen.csv zephyr_alpha_nebula_v2_7b,HFv1 TruthfulQA,58.28,,hf_open_llm_v1_240829_frozen.csv zephyr_alpha_nebula_v2_7b,HFv1 Winogrande,73.56,,hf_open_llm_v1_240829_frozen.csv zephyr_danube2_sft_qlora,HF OpenLLM v1,48.28,,hf_open_llm_v1_240829_frozen.csv zephyr_danube2_sft_qlora,HFv1 ARC,42.49,,hf_open_llm_v1_240829_frozen.csv zephyr_danube2_sft_qlora,HFv1 GSM8K,28.58,,hf_open_llm_v1_240829_frozen.csv zephyr_danube2_sft_qlora,HFv1 HellaSwag,72.93,,hf_open_llm_v1_240829_frozen.csv zephyr_danube2_sft_qlora,HFv1 MMLU,40.19,,hf_open_llm_v1_240829_frozen.csv zephyr_danube2_sft_qlora,HFv1 TruthfulQA,37.89,,hf_open_llm_v1_240829_frozen.csv zephyr_danube2_sft_qlora,HFv1 Winogrande,67.56,,hf_open_llm_v1_240829_frozen.csv zephyr_danube_sft_qlora,HF OpenLLM v1,40.11,,hf_open_llm_v1_240829_frozen.csv zephyr_danube_sft_qlora,HFv1 ARC,40.44,,hf_open_llm_v1_240829_frozen.csv zephyr_danube_sft_qlora,HFv1 GSM8K,2.05,,hf_open_llm_v1_240829_frozen.csv zephyr_danube_sft_qlora,HFv1 HellaSwag,69.4,,hf_open_llm_v1_240829_frozen.csv zephyr_danube_sft_qlora,HFv1 
MMLU,27.0,,hf_open_llm_v1_240829_frozen.csv zephyr_danube_sft_qlora,HFv1 TruthfulQA,37.08,,hf_open_llm_v1_240829_frozen.csv zephyr_danube_sft_qlora,HFv1 Winogrande,64.72,,hf_open_llm_v1_240829_frozen.csv zephyr_gemma_rpo,HF OpenLLM v1,60.93,,hf_open_llm_v1_240829_frozen.csv zephyr_gemma_rpo,HFv1 ARC,56.91,,hf_open_llm_v1_240829_frozen.csv zephyr_gemma_rpo,HFv1 GSM8K,42.23,,hf_open_llm_v1_240829_frozen.csv zephyr_gemma_rpo,HFv1 HellaSwag,83.5,,hf_open_llm_v1_240829_frozen.csv zephyr_gemma_rpo,HFv1 MMLU,59.27,,hf_open_llm_v1_240829_frozen.csv zephyr_gemma_rpo,HFv1 TruthfulQA,49.72,,hf_open_llm_v1_240829_frozen.csv zephyr_gemma_rpo,HFv1 Winogrande,73.95,,hf_open_llm_v1_240829_frozen.csv zephyr_phi_1_5_sft_qlora,HF OpenLLM v1,50.14,,hf_open_llm_v1_240829_frozen.csv zephyr_phi_1_5_sft_qlora,HFv1 ARC,51.96,,hf_open_llm_v1_240829_frozen.csv zephyr_phi_1_5_sft_qlora,HFv1 GSM8K,27.6,,hf_open_llm_v1_240829_frozen.csv zephyr_phi_1_5_sft_qlora,HFv1 HellaSwag,62.22,,hf_open_llm_v1_240829_frozen.csv zephyr_phi_1_5_sft_qlora,HFv1 MMLU,43.09,,hf_open_llm_v1_240829_frozen.csv zephyr_phi_1_5_sft_qlora,HFv1 TruthfulQA,42.87,,hf_open_llm_v1_240829_frozen.csv zephyr_phi_1_5_sft_qlora,HFv1 Winogrande,73.09,,hf_open_llm_v1_240829_frozen.csv zephyr_python_ru,HF OpenLLM v1,60.08,,hf_open_llm_v1_240829_frozen.csv zephyr_python_ru,HFv1 ARC,56.14,,hf_open_llm_v1_240829_frozen.csv zephyr_python_ru,HFv1 GSM8K,32.52,,hf_open_llm_v1_240829_frozen.csv zephyr_python_ru,HFv1 HellaSwag,82.03,,hf_open_llm_v1_240829_frozen.csv zephyr_python_ru,HFv1 MMLU,60.18,,hf_open_llm_v1_240829_frozen.csv zephyr_python_ru,HFv1 TruthfulQA,52.8,,hf_open_llm_v1_240829_frozen.csv zephyr_python_ru,HFv1 Winogrande,76.8,,hf_open_llm_v1_240829_frozen.csv zephyr_smol_llama_100m_dpo_full,HF OpenLLM v1,29.37,,hf_open_llm_v1_240829_frozen.csv zephyr_smol_llama_100m_dpo_full,HFv1 ARC,25.0,,hf_open_llm_v1_240829_frozen.csv zephyr_smol_llama_100m_dpo_full,HFv1 GSM8K,0.68,,hf_open_llm_v1_240829_frozen.csv zephyr_smol_llama_100m_dpo_full,HFv1 HellaSwag,28.54,,hf_open_llm_v1_240829_frozen.csv zephyr_smol_llama_100m_dpo_full,HFv1 MMLU,25.18,,hf_open_llm_v1_240829_frozen.csv zephyr_smol_llama_100m_dpo_full,HFv1 TruthfulQA,45.75,,hf_open_llm_v1_240829_frozen.csv zephyr_smol_llama_100m_dpo_full,HFv1 Winogrande,51.07,,hf_open_llm_v1_240829_frozen.csv zephyr_tiny_dpo_qlora,HF OpenLLM v1,37.35,,hf_open_llm_v1_240829_frozen.csv zephyr_tiny_dpo_qlora,HFv1 ARC,36.6,,hf_open_llm_v1_240829_frozen.csv zephyr_tiny_dpo_qlora,HFv1 GSM8K,2.12,,hf_open_llm_v1_240829_frozen.csv zephyr_tiny_dpo_qlora,HFv1 HellaSwag,61.66,,hf_open_llm_v1_240829_frozen.csv zephyr_tiny_dpo_qlora,HFv1 MMLU,25.78,,hf_open_llm_v1_240829_frozen.csv zephyr_tiny_dpo_qlora,HFv1 TruthfulQA,36.4,,hf_open_llm_v1_240829_frozen.csv zephyr_tiny_dpo_qlora,HFv1 Winogrande,61.56,,hf_open_llm_v1_240829_frozen.csv zephyr_tinyllama_sft_qlora,HF OpenLLM v1,36.64,,hf_open_llm_v1_240829_frozen.csv zephyr_tinyllama_sft_qlora,HFv1 ARC,34.64,,hf_open_llm_v1_240829_frozen.csv zephyr_tinyllama_sft_qlora,HFv1 GSM8K,1.74,,hf_open_llm_v1_240829_frozen.csv zephyr_tinyllama_sft_qlora,HFv1 HellaSwag,59.84,,hf_open_llm_v1_240829_frozen.csv zephyr_tinyllama_sft_qlora,HFv1 MMLU,25.85,,hf_open_llm_v1_240829_frozen.csv zephyr_tinyllama_sft_qlora,HFv1 TruthfulQA,36.57,,hf_open_llm_v1_240829_frozen.csv zephyr_tinyllama_sft_qlora,HFv1 Winogrande,61.17,,hf_open_llm_v1_240829_frozen.csv zephyrnotus_11b_alpha,HF OpenLLM v1,59.26,,hf_open_llm_v1_240829_frozen.csv zephyrnotus_11b_alpha,HFv1 ARC,61.35,,hf_open_llm_v1_240829_frozen.csv 
zephyrnotus_11b_alpha,HFv1 GSM8K,17.13,,hf_open_llm_v1_240829_frozen.csv zephyrnotus_11b_alpha,HFv1 HellaSwag,82.8,,hf_open_llm_v1_240829_frozen.csv zephyrnotus_11b_alpha,HFv1 MMLU,60.67,,hf_open_llm_v1_240829_frozen.csv zephyrnotus_11b_alpha,HFv1 TruthfulQA,57.22,,hf_open_llm_v1_240829_frozen.csv zephyrnotus_11b_alpha,HFv1 Winogrande,76.4,,hf_open_llm_v1_240829_frozen.csv ziya2_13b_base,HF OpenLLM v1,62.04,,hf_open_llm_v1_240829_frozen.csv ziya2_13b_base,HFv1 ARC,54.01,,hf_open_llm_v1_240829_frozen.csv ziya2_13b_base,HFv1 GSM8K,60.42,,hf_open_llm_v1_240829_frozen.csv ziya2_13b_base,HFv1 HellaSwag,78.9,,hf_open_llm_v1_240829_frozen.csv ziya2_13b_base,HFv1 MMLU,61.32,,hf_open_llm_v1_240829_frozen.csv ziya2_13b_base,HFv1 TruthfulQA,42.74,,hf_open_llm_v1_240829_frozen.csv ziya2_13b_base,HFv1 Winogrande,74.82,,hf_open_llm_v1_240829_frozen.csv zysec_7b,HF OpenLLM v1,58.41,,hf_open_llm_v1_240829_frozen.csv zysec_7b,HFv1 ARC,57.51,,hf_open_llm_v1_240829_frozen.csv zysec_7b,HFv1 GSM8K,28.96,,hf_open_llm_v1_240829_frozen.csv zysec_7b,HFv1 HellaSwag,79.73,,hf_open_llm_v1_240829_frozen.csv zysec_7b,HFv1 MMLU,58.65,,hf_open_llm_v1_240829_frozen.csv zysec_7b,HFv1 TruthfulQA,51.11,,hf_open_llm_v1_240829_frozen.csv zysec_7b,HFv1 Winogrande,74.51,,hf_open_llm_v1_240829_frozen.csv zysec_7b_v2,HF OpenLLM v1,54.63,,hf_open_llm_v1_240829_frozen.csv zysec_7b_v2,HFv1 ARC,53.07,,hf_open_llm_v1_240829_frozen.csv zysec_7b_v2,HFv1 GSM8K,28.05,,hf_open_llm_v1_240829_frozen.csv zysec_7b_v2,HFv1 HellaSwag,76.3,,hf_open_llm_v1_240829_frozen.csv zysec_7b_v2,HFv1 MMLU,54.55,,hf_open_llm_v1_240829_frozen.csv zysec_7b_v2,HFv1 TruthfulQA,47.05,,hf_open_llm_v1_240829_frozen.csv zysec_7b_v2,HFv1 Winogrande,68.75,,hf_open_llm_v1_240829_frozen.csv zysec_8b_v2,HF OpenLLM v1,54.63,,hf_open_llm_v1_240829_frozen.csv zysec_8b_v2,HFv1 ARC,53.07,,hf_open_llm_v1_240829_frozen.csv zysec_8b_v2,HFv1 GSM8K,28.05,,hf_open_llm_v1_240829_frozen.csv zysec_8b_v2,HFv1 HellaSwag,76.3,,hf_open_llm_v1_240829_frozen.csv zysec_8b_v2,HFv1 MMLU,54.55,,hf_open_llm_v1_240829_frozen.csv zysec_8b_v2,HFv1 TruthfulQA,47.05,,hf_open_llm_v1_240829_frozen.csv zysec_8b_v2,HFv1 Winogrande,68.75,,hf_open_llm_v1_240829_frozen.csv zyte_1b,HF OpenLLM v1,38.23,,hf_open_llm_v1_240829_frozen.csv zyte_1b,HFv1 ARC,37.88,,hf_open_llm_v1_240829_frozen.csv zyte_1b,HFv1 GSM8K,1.44,,hf_open_llm_v1_240829_frozen.csv zyte_1b,HFv1 HellaSwag,61.37,,hf_open_llm_v1_240829_frozen.csv zyte_1b,HFv1 MMLU,24.61,,hf_open_llm_v1_240829_frozen.csv zyte_1b,HFv1 TruthfulQA,42.14,,hf_open_llm_v1_240829_frozen.csv zyte_1b,HFv1 Winogrande,61.96,,hf_open_llm_v1_240829_frozen.csv claude_2_1,BFCL,74.57,,bfcl_240906.csv claude_3_5_sonnet_20240620,BFCL,76.29,,bfcl_240906.csv claude_3_haiku_20240307,BFCL,60.34,,bfcl_240906.csv claude_3_opus_20240229,BFCL,80.88,,bfcl_240906.csv claude_3_sonnet_20240229,BFCL,77.92,,bfcl_240906.csv claude_instant_1_2,BFCL,47.95,,bfcl_240906.csv command_r_plus_original,BFCL,74.11,,bfcl_240906.csv dbrx_instructruct,BFCL,69.55,,bfcl_240906.csv deepseek_v1_5,BFCL,11.18,,bfcl_240906.csv firefunction_v1,BFCL,48.11,,bfcl_240906.csv firefunction_v2,BFCL,77.45,,bfcl_240906.csv functionary_medium_v3_1,BFCL,82.55,,bfcl_240906.csv functionary_small_v3_1,BFCL,80.21,,bfcl_240906.csv functionary_small_v3_2,BFCL,78.96,,bfcl_240906.csv gemini_1_0_pro_001,BFCL,57.81,,bfcl_240906.csv gemini_1_5_flash_preview_0514,BFCL,70.75,,bfcl_240906.csv gemini_1_5_pro_preview_0409,BFCL,74.56,,bfcl_240906.csv gemini_1_5_pro_preview_0514,BFCL,74.75,,bfcl_240906.csv gemma_7b_it,BFCL,10.3,,bfcl_240906.csv 
gorilla_openfunctions_v2,BFCL,79.1,,bfcl_240906.csv gpt_3_5_turbo_0125,BFCL,75.41,,bfcl_240906.csv gpt_4_0125_preview,BFCL,85.79,,bfcl_240906.csv gpt_4_0613,BFCL,84.74,,bfcl_240906.csv gpt_4_1106_preview,BFCL,85.0,,bfcl_240906.csv gpt_4_turbo_2024_04_09,BFCL,83.89,,bfcl_240906.csv gpt_4o_2024_05_13,BFCL,83.13,,bfcl_240906.csv gpt_4o_2024_08_06,BFCL,78.87,,bfcl_240906.csv gpt_4o_mini_2024_07_18,BFCL,83.35,,bfcl_240906.csv granite_20b_functioncalling,BFCL,76.63,,bfcl_240906.csv hermes_2_pro_llama3_70b,BFCL,74.78,,bfcl_240906.csv hermes_2_pro_llama3_8b,BFCL,66.18,,bfcl_240906.csv hermes_2_pro_mistral_7b,BFCL,65.44,,bfcl_240906.csv hermes_2_theta_llama3_70b,BFCL,10.0,,bfcl_240906.csv hermes_2_theta_llama3_8b,BFCL,64.83,,bfcl_240906.csv llama3_70b_instruct,BFCL,81.59,,bfcl_240906.csv llama3_8b_instruct,BFCL,62.7,,bfcl_240906.csv mistral_large_2407,BFCL,79.66,,bfcl_240906.csv mistral_medium_2312,BFCL,72.19,,bfcl_240906.csv mistral_small_2402,BFCL,55.36,,bfcl_240906.csv mistral_tiny_2312,BFCL,21.17,,bfcl_240906.csv nemotron_4_340b_instruct,BFCL,80.23,,bfcl_240906.csv open_mistral_nemo_2407,BFCL,76.31,,bfcl_240906.csv open_mixtral_8x22b,BFCL,79.14,,bfcl_240906.csv open_mixtral_8x7b,BFCL,60.82,,bfcl_240906.csv snowflake_arctic_instruct,BFCL,42.46,,bfcl_240906.csv xlam_1b_fc_r,BFCL,74.9,,bfcl_240906.csv xlam_7b_fc_r,BFCL,79.41,,bfcl_240906.csv llama3_1_405b_instruct,eq_bench,83.0,[],eqbench_240912.csv claude_3_5_sonnet_20240620,eq_bench,86.36,[],eqbench_240912.csv gpt_4o,eq_bench,83.51,[],eqbench_240912.csv gpt_4_turbo_2024_04_09,eq_bench,86.35,[],eqbench_240912.csv rys_xlarge_base,eq_bench,85.05,[],eqbench_240912.csv gpt_4_0613,eq_bench,84.79,[],eqbench_240912.csv gpt_4_0314,eq_bench,85.73,[],eqbench_240912.csv rys_xlarge,eq_bench,84.55,[],eqbench_240912.csv gpt_4_1106_preview,eq_bench,86.05,[],eqbench_240912.csv gpt_4_0125_preview,eq_bench,83.87,[],eqbench_240912.csv claude_3_opus_20240229,eq_bench,82.19,[],eqbench_240912.csv mistral_large_2407,eq_bench,85.05,[],eqbench_240912.csv qwen2_72b_instruct,eq_bench,81.35,[],eqbench_240912.csv mistral_large_2402,eq_bench,85.17,[],eqbench_240912.csv llama3_70b_instruct,eq_bench,82.13,[],eqbench_240912.csv qwen1_5_110b_chat,eq_bench,83.68,[],eqbench_240912.csv solar_pro_preview_instruct,eq_bench,78.52,[],eqbench_240912.csv senku_70b_full,eq_bench,84.89,[],eqbench_240912.csv smaug_llama3_70b_instruct,eq_bench,80.69,[],eqbench_240912.csv ece_tw3_jrgl_v1,eq_bench,83.07,[],eqbench_240912.csv miiqu_f16,eq_bench,83.17,[],eqbench_240912.csv qwen1_5_72b_chat,eq_bench,82.81,[],eqbench_240912.csv miqu_1_70b,eq_bench,82.91,[],eqbench_240912.csv mistral_medium,eq_bench,82.57,[],eqbench_240912.csv gemma_2_27b_it,eq_bench,80.55,[],eqbench_240912.csv gpt_4o_mini,eq_bench,76.93,[],eqbench_240912.csv 🆕phi_3_5_moe_instruct,eq_bench,76.97,[],eqbench_240912.csv deepseek_v2_chat_0628,eq_bench,83.18,[],eqbench_240912.csv miquella_120b,eq_bench,82.15,[],eqbench_240912.csv phi_3_medium_4k_instruct,eq_bench,76.34,[],eqbench_240912.csv claude_3_sonnet_20240229,eq_bench,80.45,[],eqbench_240912.csv tess_72b_v1_5b,eq_bench,81.78,[],eqbench_240912.csv mixtral_8x22b_instruct_v0_1,eq_bench,78.79,[],eqbench_240912.csv qwen_72b_chat,eq_bench,80.7,[],eqbench_240912.csv smaug_72b_v0_1,eq_bench,79.75,[],eqbench_240912.csv gemma_2_9b_it,eq_bench,80.46,[],eqbench_240912.csv yi_1_5_34b_chat,eq_bench,72.93,[],eqbench_240912.csv mixtral_34bx2_moe_60b,eq_bench,72.69,[],eqbench_240912.csv phi_3_small_8k_instruct,eq_bench,73.49,[],eqbench_240912.csv 
wizardlm_2_8x22b,eq_bench,77.91,[],eqbench_240912.csv miquliz_120b_v2_0,eq_bench,82.21,[],eqbench_240912.csv quyen_pro_max_v0_1,eq_bench,77.16,[],eqbench_240912.csv qwen1_5_32b_chat,eq_bench,75.59,[],eqbench_240912.csv 🆕gemma_2_ifable_9b,eq_bench,79.93,[],eqbench_240912.csv dolphin_2_2_yi_34b,eq_bench,75.52,[],eqbench_240912.csv nous_hermes_2_yi_34b,eq_bench,72.68,[],eqbench_240912.csv megadolphin_120b,eq_bench,80.21,[],eqbench_240912.csv dbrx_instructruct,eq_bench,76.82,[],eqbench_240912.csv llama3_8b_instruct,eq_bench,68.88,[],eqbench_240912.csv discolm_120b,eq_bench,78.48,[],eqbench_240912.csv mistral_small_2402,eq_bench,80.36,[],eqbench_240912.csv dolphin_2_2_70b,eq_bench,79.6,[],eqbench_240912.csv yi_34b_chat,eq_bench,71.62,[],eqbench_240912.csv tulu_2_dpo_70b,eq_bench,76.63,[],eqbench_240912.csv tess_xl_v1_0,eq_bench,78.46,[],eqbench_240912.csv yi_1_5_9b_chat,eq_bench,70.37,[],eqbench_240912.csv goliath_120b,eq_bench,76.09,[],eqbench_240912.csv c4ai_command_r_plus,eq_bench,76.11,[],eqbench_240912.csv samantha_120b,eq_bench,76.44,[],eqbench_240912.csv nous_hermes_2_mixtral_8x7b_sft,eq_bench,72.91,[],eqbench_240912.csv qwen1_5_14b_chat,eq_bench,74.99,[],eqbench_240912.csv synthia_70b_v1_5,eq_bench,73.71,[],eqbench_240912.csv gemini_pro,eq_bench,75.08,[],eqbench_240912.csv mistral_nemo_instruct_2407,eq_bench,77.13,[],eqbench_240912.csv mixtral_8x7b_instruct_v0_1,eq_bench,72.37,[],eqbench_240912.csv quyen_pro_v0_1,eq_bench,70.75,[],eqbench_240912.csv gpt_3_5_turbo_0301,eq_bench,70.67,[],eqbench_240912.csv midnight_miqu_70b_v1_0,eq_bench,75.9,[],eqbench_240912.csv meow,eq_bench,73.94,[],eqbench_240912.csv lmcocktail_10_7b_v1,eq_bench,73.67,[],eqbench_240912.csv experiment26_7b,eq_bench,77.21,[],eqbench_240912.csv beyonder_4x7b_v3,eq_bench,77.01,[],eqbench_240912.csv sauerkrautlm_una_solar_instruct,eq_bench,73.56,[],eqbench_240912.csv neuralbeagle14_7b,eq_bench,74.79,[],eqbench_240912.csv neuralmonarch_7b,eq_bench,76.26,[],eqbench_240912.csv solar_10_7b_instruct_dpo,eq_bench,73.21,[],eqbench_240912.csv beagle14_7b,eq_bench,74.45,[],eqbench_240912.csv monarch_7b,eq_bench,75.8,[],eqbench_240912.csv westlake_7b_v2,eq_bench,78.7,[],eqbench_240912.csv alphamonarch_7b,eq_bench,76.08,[],eqbench_240912.csv gml_mistral_merged_v1,eq_bench,74.01,[],eqbench_240912.csv gpt_3_5_turbo_1106,eq_bench,71.74,[],eqbench_240912.csv starling_lm_7b_beta,eq_bench,73.82,[],eqbench_240912.csv solar_10_7b_instruct_v1_0,eq_bench,73.53,[],eqbench_240912.csv phi_3_mini_4k_instruct,eq_bench,58.15,[],eqbench_240912.csv claude_3_haiku_20240307,eq_bench,63.65,[],eqbench_240912.csv openchat_3_5_1210,eq_bench,72.52,[],eqbench_240912.csv neuralmarcoro14_7b,eq_bench,74.15,[],eqbench_240912.csv wizardlm_70b_v1_0,eq_bench,71.28,[],eqbench_240912.csv starling_lm_7b_alpha,eq_bench,73.9,[],eqbench_240912.csv gpt_3_5_turbo_0613,eq_bench,69.35,[],eqbench_240912.csv openchat_3_5,eq_bench,72.18,[],eqbench_240912.csv 🆕exaone_3_0_7_8b_instruct,eq_bench,66.72,[],eqbench_240912.csv laserxtral,eq_bench,71.96,[],eqbench_240912.csv llama_2_70b_chat,eq_bench,73.59,[],eqbench_240912.csv marcoroni_7b_v3_safetensor,eq_bench,71.68,[],eqbench_240912.csv 🆕trillama_8b,eq_bench,66.63,[],eqbench_240912.csv 🆕phi_3_5_mini_instruct,eq_bench,54.74,[],eqbench_240912.csv gpt_3_5_turbo_0125,eq_bench,64.97,[],eqbench_240912.csv beyonder_4x7b_v2,eq_bench,69.23,[],eqbench_240912.csv firefly_mixtral_8x7b,eq_bench,64.36,[],eqbench_240912.csv yi_1_5_6b_chat,eq_bench,59.45,[],eqbench_240912.csv marcoroni_neural_chat_7b_v2,eq_bench,68.54,[],eqbench_240912.csv 
wizardlm_2_7b,eq_bench,69.31,[],eqbench_240912.csv openhermes_2_5_mistral_7b,eq_bench,66.89,[],eqbench_240912.csv neuralhermes_2_5_mistral_7b,eq_bench,65.86,[],eqbench_240912.csv snorkel_mistral_pairrm_dpo,eq_bench,65.83,[],eqbench_240912.csv qwen_14b_chat,eq_bench,63.47,[],eqbench_240912.csv dolphin_2_2_1_mistral_7b,eq_bench,69.92,[],eqbench_240912.csv mistral_7b_instruct_v0_2,eq_bench,68.18,[],eqbench_240912.csv mistral_7b_openorca,eq_bench,66.55,[],eqbench_240912.csv neural_chat_7b_v3_1,eq_bench,64.77,[],eqbench_240912.csv internlm2_chat_7b,eq_bench,62.61,[],eqbench_240912.csv yi_6b_chat,eq_bench,61.79,[],eqbench_240912.csv orion_14b_chat,eq_bench,59.71,[],eqbench_240912.csv una_cybertron_7b_v2_bf16,eq_bench,62.83,[],eqbench_240912.csv c4ai_command_r_v0_1,eq_bench,56.05,[],eqbench_240912.csv mistral_7b_instruct_v0_3,eq_bench,63.15,[],eqbench_240912.csv vicuna_33b_v1_3,eq_bench,67.07,[],eqbench_240912.csv nanbeige2_8b_chat,eq_bench,65.17,[],eqbench_240912.csv gemma_1_1_7b_it,eq_bench,59.17,[],eqbench_240912.csv qwen1_5_moe_a2_7b_chat,eq_bench,58.07,[],eqbench_240912.csv vicuna_13b_v1_5,eq_bench,67.39,[],eqbench_240912.csv gemma_2_2b_it,eq_bench,60.86,[],eqbench_240912.csv qwen1_5_7b_chat,eq_bench,54.41,[],eqbench_240912.csv sparsetral_16x7b_v2,eq_bench,59.9,[],eqbench_240912.csv zephyr_7b_beta,eq_bench,58.33,[],eqbench_240912.csv wizardlm_13b_v1_2,eq_bench,63.71,[],eqbench_240912.csv zephyr_7b_alpha,eq_bench,56.82,[],eqbench_240912.csv phi_2_orange,eq_bench,56.94,[],eqbench_240912.csv phi_2_psy,eq_bench,56.44,[],eqbench_240912.csv gemma_7b_it,eq_bench,61.72,[],eqbench_240912.csv phi_2_dpo,eq_bench,54.42,[],eqbench_240912.csv phixtral_2x2_8,eq_bench,54.58,[],eqbench_240912.csv qwen_7b_chat,eq_bench,50.11,[],eqbench_240912.csv mistral_7b_instruct_v0_1,eq_bench,52.15,[],eqbench_240912.csv llama_2_13b_chat,eq_bench,49.12,[],eqbench_240912.csv guanaco_33b_merged,eq_bench,36.11,[],eqbench_240912.csv nous_capybara_7b_v1,eq_bench,34.37,[],eqbench_240912.csv llama_2_7b_chat,eq_bench,36.32,[],eqbench_240912.csv qwen1_5_4b_chat,eq_bench,28.75,[],eqbench_240912.csv qwen_1_8b_chat,eq_bench,30.0,[],eqbench_240912.csv phi_2,eq_bench,27.6,[],eqbench_240912.csv qwen1_5_1_8b_chat,eq_bench,24.12,[],eqbench_240912.csv vicuna_7b_v1_1,eq_bench,26.12,[],eqbench_240912.csv gemma_2b_it,eq_bench,23.26,[],eqbench_240912.csv koala_7b,eq_bench,21.54,[],eqbench_240912.csv stablelm_2_zephyr_1_6b,eq_bench,15.04,[],eqbench_240912.csv random_baseline,eq_bench,0.0,[],eqbench_240912.csv falcon_180b_chat,eq_bench,56.82,[],eqbench_240912.csv claude_instant_1_2,eq_bench,69.04,[],eqbench_240912.csv claude_2_1,eq_bench,73.96,[],eqbench_240912.csv claude_1,eq_bench,76.83,[],eqbench_240912.csv claude_2_0,eq_bench,72.89,[],eqbench_240912.csv pplx_70b_online,eq_bench,62.79,[],eqbench_240912.csv pplx_7b_online,eq_bench,48.91,[],eqbench_240912.csv theprofessor_155b,eq_bench,78.82,[],eqbench_240912.csv llama3_1_405b_instruct,magi_hard,83.81,[],eqbench_240912.csv claude_3_5_sonnet_20240620,magi_hard,78.8,[],eqbench_240912.csv gpt_4o,magi_hard,80.86,[],eqbench_240912.csv gpt_4_turbo_2024_04_09,magi_hard,77.74,[],eqbench_240912.csv rys_xlarge_base,magi_hard,78.3,[],eqbench_240912.csv gpt_4_0613,magi_hard,77.85,[],eqbench_240912.csv gpt_4_0314,magi_hard,75.67,[],eqbench_240912.csv rys_xlarge,magi_hard,76.83,[],eqbench_240912.csv gpt_4_1106_preview,magi_hard,74.96,[],eqbench_240912.csv gpt_4_0125_preview,magi_hard,76.83,[],eqbench_240912.csv claude_3_opus_20240229,magi_hard,76.55,[],eqbench_240912.csv 
mistral_large_2407,magi_hard,72.37,[],eqbench_240912.csv qwen2_72b_instruct,magi_hard,75.74,[],eqbench_240912.csv mistral_large_2402,magi_hard,67.69,[],eqbench_240912.csv llama3_70b_instruct,magi_hard,67.97,[],eqbench_240912.csv qwen1_5_110b_chat,magi_hard,66.09,[],eqbench_240912.csv solar_pro_preview_instruct,magi_hard,70.84,[],eqbench_240912.csv senku_70b_full,magi_hard,63.94,[],eqbench_240912.csv smaug_llama3_70b_instruct,magi_hard,67.25,[],eqbench_240912.csv ece_tw3_jrgl_v1,magi_hard,63.56,[],eqbench_240912.csv miiqu_f16,magi_hard,63.28,[],eqbench_240912.csv qwen1_5_72b_chat,magi_hard,63.47,[],eqbench_240912.csv miqu_1_70b,magi_hard,63.22,[],eqbench_240912.csv mistral_medium,magi_hard,62.15,[],eqbench_240912.csv gemma_2_27b_it,magi_hard,64.1,[],eqbench_240912.csv gpt_4o_mini,magi_hard,67.5,[],eqbench_240912.csv 🆕phi_3_5_moe_instruct,magi_hard,67.25,[],eqbench_240912.csv deepseek_v2_chat_0628,magi_hard,60.63,[],eqbench_240912.csv miquella_120b,magi_hard,60.69,[],eqbench_240912.csv phi_3_medium_4k_instruct,magi_hard,66.38,[],eqbench_240912.csv claude_3_sonnet_20240229,magi_hard,61.01,[],eqbench_240912.csv tess_72b_v1_5b,magi_hard,59.57,[],eqbench_240912.csv mixtral_8x22b_instruct_v0_1,magi_hard,62.41,[],eqbench_240912.csv qwen_72b_chat,magi_hard,60.38,[],eqbench_240912.csv smaug_72b_v0_1,magi_hard,60.22,[],eqbench_240912.csv gemma_2_9b_it,magi_hard,57.98,[],eqbench_240912.csv yi_1_5_34b_chat,magi_hard,64.85,[],eqbench_240912.csv mixtral_34bx2_moe_60b,magi_hard,65.06,[],eqbench_240912.csv phi_3_small_8k_instruct,magi_hard,64.16,[],eqbench_240912.csv wizardlm_2_8x22b,magi_hard,59.16,[],eqbench_240912.csv miquliz_120b_v2_0,magi_hard,54.57,[],eqbench_240912.csv quyen_pro_max_v0_1,magi_hard,59.29,[],eqbench_240912.csv qwen1_5_32b_chat,magi_hard,60.72,[],eqbench_240912.csv 🆕gemma_2_ifable_9b,magi_hard,56.35,[],eqbench_240912.csv dolphin_2_2_yi_34b,magi_hard,60.66,[],eqbench_240912.csv nous_hermes_2_yi_34b,magi_hard,63.03,[],eqbench_240912.csv megadolphin_120b,magi_hard,54.45,[],eqbench_240912.csv dbrx_instructruct,magi_hard,57.13,[],eqbench_240912.csv llama3_8b_instruct,magi_hard,63.84,[],eqbench_240912.csv discolm_120b,magi_hard,54.01,[],eqbench_240912.csv mistral_small_2402,magi_hard,51.9,[],eqbench_240912.csv dolphin_2_2_70b,magi_hard,49.73,[],eqbench_240912.csv yi_34b_chat,magi_hard,57.1,[],eqbench_240912.csv tulu_2_dpo_70b,magi_hard,50.23,[],eqbench_240912.csv tess_xl_v1_0,magi_hard,48.08,[],eqbench_240912.csv yi_1_5_9b_chat,magi_hard,56.13,[],eqbench_240912.csv goliath_120b,magi_hard,50.36,[],eqbench_240912.csv c4ai_command_r_plus,magi_hard,49.7,[],eqbench_240912.csv samantha_120b,magi_hard,48.58,[],eqbench_240912.csv nous_hermes_2_mixtral_8x7b_sft,magi_hard,51.83,[],eqbench_240912.csv qwen1_5_14b_chat,magi_hard,49.27,[],eqbench_240912.csv synthia_70b_v1_5,magi_hard,48.92,[],eqbench_240912.csv gemini_pro,magi_hard,46.87,[],eqbench_240912.csv mistral_nemo_instruct_2407,magi_hard,43.65,[],eqbench_240912.csv mixtral_8x7b_instruct_v0_1,magi_hard,45.74,[],eqbench_240912.csv quyen_pro_v0_1,magi_hard,47.3,[],eqbench_240912.csv gpt_3_5_turbo_0301,magi_hard,46.66,[],eqbench_240912.csv midnight_miqu_70b_v1_0,magi_hard,40.74,[],eqbench_240912.csv meow,magi_hard,42.68,[],eqbench_240912.csv lmcocktail_10_7b_v1,magi_hard,42.65,[],eqbench_240912.csv experiment26_7b,magi_hard,38.93,[],eqbench_240912.csv beyonder_4x7b_v3,magi_hard,39.03,[],eqbench_240912.csv sauerkrautlm_una_solar_instruct,magi_hard,42.43,[],eqbench_240912.csv neuralbeagle14_7b,magi_hard,41.06,[],eqbench_240912.csv 
neuralmonarch_7b,magi_hard,39.59,[],eqbench_240912.csv solar_10_7b_instruct_dpo,magi_hard,42.37,[],eqbench_240912.csv beagle14_7b,magi_hard,41.02,[],eqbench_240912.csv monarch_7b,magi_hard,39.56,[],eqbench_240912.csv westlake_7b_v2,magi_hard,36.59,[],eqbench_240912.csv alphamonarch_7b,magi_hard,39.12,[],eqbench_240912.csv gml_mistral_merged_v1,magi_hard,41.18,[],eqbench_240912.csv gpt_3_5_turbo_1106,magi_hard,43.17,[],eqbench_240912.csv starling_lm_7b_beta,magi_hard,40.12,[],eqbench_240912.csv solar_10_7b_instruct_v1_0,magi_hard,39.62,[],eqbench_240912.csv phi_3_mini_4k_instruct,magi_hard,53.26,[],eqbench_240912.csv claude_3_haiku_20240307,magi_hard,47.71,[],eqbench_240912.csv openchat_3_5_1210,magi_hard,38.81,[],eqbench_240912.csv neuralmarcoro14_7b,magi_hard,37.12,[],eqbench_240912.csv wizardlm_70b_v1_0,magi_hard,39.87,[],eqbench_240912.csv starling_lm_7b_alpha,magi_hard,37.06,[],eqbench_240912.csv gpt_3_5_turbo_0613,magi_hard,40.55,[],eqbench_240912.csv openchat_3_5,magi_hard,37.34,[],eqbench_240912.csv 🆕exaone_3_0_7_8b_instruct,magi_hard,42.8,[],eqbench_240912.csv laserxtral,magi_hard,37.46,[],eqbench_240912.csv llama_2_70b_chat,magi_hard,35.4,[],eqbench_240912.csv marcoroni_7b_v3_safetensor,magi_hard,37.06,[],eqbench_240912.csv 🆕trillama_8b,magi_hard,41.9,[],eqbench_240912.csv 🆕phi_3_5_mini_instruct,magi_hard,52.92,[],eqbench_240912.csv gpt_3_5_turbo_0125,magi_hard,42.65,[],eqbench_240912.csv beyonder_4x7b_v2,magi_hard,38.03,[],eqbench_240912.csv firefly_mixtral_8x7b,magi_hard,42.46,[],eqbench_240912.csv yi_1_5_6b_chat,magi_hard,46.18,[],eqbench_240912.csv marcoroni_neural_chat_7b_v2,magi_hard,36.31,[],eqbench_240912.csv wizardlm_2_7b,magi_hard,35.4,[],eqbench_240912.csv openhermes_2_5_mistral_7b,magi_hard,37.31,[],eqbench_240912.csv neuralhermes_2_5_mistral_7b,magi_hard,37.56,[],eqbench_240912.csv snorkel_mistral_pairrm_dpo,magi_hard,37.53,[],eqbench_240912.csv qwen_14b_chat,magi_hard,39.74,[],eqbench_240912.csv dolphin_2_2_1_mistral_7b,magi_hard,33.16,[],eqbench_240912.csv mistral_7b_instruct_v0_2,magi_hard,34.69,[],eqbench_240912.csv mistral_7b_openorca,magi_hard,35.78,[],eqbench_240912.csv neural_chat_7b_v3_1,magi_hard,36.65,[],eqbench_240912.csv internlm2_chat_7b,magi_hard,38.43,[],eqbench_240912.csv yi_6b_chat,magi_hard,38.74,[],eqbench_240912.csv orion_14b_chat,magi_hard,40.74,[],eqbench_240912.csv una_cybertron_7b_v2_bf16,magi_hard,37.5,[],eqbench_240912.csv c4ai_command_r_v0_1,magi_hard,43.27,[],eqbench_240912.csv mistral_7b_instruct_v0_3,magi_hard,36.0,[],eqbench_240912.csv vicuna_33b_v1_3,magi_hard,31.66,[],eqbench_240912.csv nanbeige2_8b_chat,magi_hard,33.03,[],eqbench_240912.csv gemma_1_1_7b_it,magi_hard,38.43,[],eqbench_240912.csv qwen1_5_moe_a2_7b_chat,magi_hard,38.34,[],eqbench_240912.csv vicuna_13b_v1_5,magi_hard,28.75,[],eqbench_240912.csv gemma_2_2b_it,magi_hard,35.22,[],eqbench_240912.csv qwen1_5_7b_chat,magi_hard,41.59,[],eqbench_240912.csv sparsetral_16x7b_v2,magi_hard,34.97,[],eqbench_240912.csv zephyr_7b_beta,magi_hard,35.97,[],eqbench_240912.csv wizardlm_13b_v1_2,magi_hard,29.1,[],eqbench_240912.csv zephyr_7b_alpha,magi_hard,35.15,[],eqbench_240912.csv phi_2_orange,magi_hard,32.03,[],eqbench_240912.csv phi_2_psy,magi_hard,32.03,[],eqbench_240912.csv gemma_7b_it,magi_hard,24.85,[],eqbench_240912.csv phi_2_dpo,magi_hard,31.85,[],eqbench_240912.csv phixtral_2x2_8,magi_hard,30.44,[],eqbench_240912.csv qwen_7b_chat,magi_hard,33.44,[],eqbench_240912.csv mistral_7b_instruct_v0_1,magi_hard,30.69,[],eqbench_240912.csv 
llama_2_13b_chat,magi_hard,28.2,[],eqbench_240912.csv guanaco_33b_merged,magi_hard,31.78,[],eqbench_240912.csv nous_capybara_7b_v1,magi_hard,30.16,[],eqbench_240912.csv llama_2_7b_chat,magi_hard,27.5,[],eqbench_240912.csv qwen1_5_4b_chat,magi_hard,32.66,[],eqbench_240912.csv qwen_1_8b_chat,magi_hard,29.19,[],eqbench_240912.csv phi_2,magi_hard,30.57,[],eqbench_240912.csv qwen1_5_1_8b_chat,magi_hard,31.56,[],eqbench_240912.csv vicuna_7b_v1_1,magi_hard,27.38,[],eqbench_240912.csv gemma_2b_it,magi_hard,24.16,[],eqbench_240912.csv koala_7b,magi_hard,23.7,[],eqbench_240912.csv stablelm_2_zephyr_1_6b,magi_hard,27.54,[],eqbench_240912.csv random_baseline,magi_hard,25.0,[],eqbench_240912.csv gpt_4_1106_preview,BIGGEN,4.22,[],biggen_240829.csv gpt_4_0125_preview,BIGGEN,4.19,[],biggen_240829.csv gpt_4o_2024_05_13,BIGGEN,4.141,[],biggen_240829.csv gpt_4_turbo_2024_04_09,BIGGEN,4.132,[],biggen_240829.csv claude_3_opus_20240229,BIGGEN,4.103,[],biggen_240829.csv llama3_70b_instruct,BIGGEN,4.012,[],biggen_240829.csv claude_3_sonnet_20240229,BIGGEN,4.011,[],biggen_240829.csv qwen_110b_chat,BIGGEN,3.979,[],biggen_240829.csv claude_3_haiku_20240307,BIGGEN,3.954,[],biggen_240829.csv gemini_pro_1_5,BIGGEN,3.953,[],biggen_240829.csv mixtral_8x22b_instruct_v0_1_awq,BIGGEN,3.936,[],biggen_240829.csv mistral_medium,BIGGEN,3.935,[],biggen_240829.csv mistral_large,BIGGEN,3.927,[],biggen_240829.csv gemini_flash_1_5,BIGGEN,3.899,[],biggen_240829.csv c4ai_command_r_plus_gptq,BIGGEN,3.839,[],biggen_240829.csv qwen1_5_72b_chat,BIGGEN,3.832,[],biggen_240829.csv phi_3_mini_4k_instruct,BIGGEN,3.821,[],biggen_240829.csv qwen1_5_32b_chat,BIGGEN,3.813,[],biggen_240829.csv starling_lm_7b_beta,BIGGEN,3.756,[],biggen_240829.csv llama3_8b_instruct,BIGGEN,3.753,[],biggen_240829.csv nous_hermes_2_mixtral_8x7b_dpo,BIGGEN,3.737,[],biggen_240829.csv yi_34b_chat,BIGGEN,3.701,[],biggen_240829.csv mixtral_8x7b_instruct_v0_1,BIGGEN,3.695,[],biggen_240829.csv gpt_3_5_turbo_0125,BIGGEN,3.689,[],biggen_240829.csv tulu_2_dpo_70b,BIGGEN,3.683,[],biggen_240829.csv phi_3_mini_128k_instruct,BIGGEN,3.679,[],biggen_240829.csv gpt_3_5_turbo_1106,BIGGEN,3.678,[],biggen_240829.csv c4ai_command_r_v0_1,BIGGEN,3.677,[],biggen_240829.csv solar_10_7b_instruct_v1_0,BIGGEN,3.672,[],biggen_240829.csv llama_2_70b_chat,BIGGEN,3.668,[],biggen_240829.csv gemini_1_0_pro,BIGGEN,3.64,[],biggen_240829.csv mistral_7b_instruct_v0_2,BIGGEN,3.619,[],biggen_240829.csv mixtral_8x22b_v0_1_awq,BIGGEN,3.606,[],biggen_240829.csv nous_hermes_2_mixtral_8x7b_sft,BIGGEN,3.596,[],biggen_240829.csv openchat_3_5_0106,BIGGEN,3.581,[],biggen_240829.csv zephyr_orpo_141b_a35b_v0_1_awq,BIGGEN,3.573,[],biggen_240829.csv qwen1_5_14b_chat,BIGGEN,3.573,[],biggen_240829.csv qwen1_5_7b_chat,BIGGEN,3.556,[],biggen_240829.csv starling_lm_7b_alpha,BIGGEN,3.537,[],biggen_240829.csv zephyr_7b_beta,BIGGEN,3.522,[],biggen_240829.csv nous_hermes_2_mistral_7b_dpo,BIGGEN,3.493,[],biggen_240829.csv nous_hermes_2_yi_34b,BIGGEN,3.476,[],biggen_240829.csv mistral_orpo_beta,BIGGEN,3.473,[],biggen_240829.csv llama_2_13b_chat,BIGGEN,3.467,[],biggen_240829.csv openhermes_2_5_mistral_7b,BIGGEN,3.462,[],biggen_240829.csv mixtral_8x7b_v0_1,BIGGEN,3.445,[],biggen_240829.csv mistral_orpo_alpha,BIGGEN,3.441,[],biggen_240829.csv tulu_2_dpo_13b,BIGGEN,3.423,[],biggen_240829.csv qwen1_5_72b,BIGGEN,3.422,[],biggen_240829.csv codetulu_2_34b,BIGGEN,3.421,[],biggen_240829.csv gemma_1_1_7b_it,BIGGEN,3.407,[],biggen_240829.csv openhermes_2_mistral_7b,BIGGEN,3.394,[],biggen_240829.csv 
codellama34b_instruct,BIGGEN,3.363,[],biggen_240829.csv yi_34b,BIGGEN,3.322,[],biggen_240829.csv llama_2_70b,BIGGEN,3.317,[],biggen_240829.csv qwen1_5_32b,BIGGEN,3.312,[],biggen_240829.csv llama_2_7b_chat,BIGGEN,3.307,[],biggen_240829.csv tulu_2_dpo_7b,BIGGEN,3.28,[],biggen_240829.csv codetulu_2_13b,BIGGEN,3.254,[],biggen_240829.csv solar_10_7b_v1_0,BIGGEN,3.248,[],biggen_240829.csv tulu_2_13b,BIGGEN,3.211,[],biggen_240829.csv codellama_13b_instruct,BIGGEN,3.206,[],biggen_240829.csv yi_6b_chat,BIGGEN,3.204,[],biggen_240829.csv codellama_7b_instruct,BIGGEN,3.14,[],biggen_240829.csv gemma_7b_it,BIGGEN,3.132,[],biggen_240829.csv llama3_70b,BIGGEN,3.122,[],biggen_240829.csv qwen1_5_14b,BIGGEN,3.106,[],biggen_240829.csv gemma_1_1_2b_it,BIGGEN,3.072,[],biggen_240829.csv codetulu_2_7b,BIGGEN,3.07,[],biggen_240829.csv tulu_2_7b,BIGGEN,3.041,[],biggen_240829.csv mistral_7b_v0_2,BIGGEN,3.024,[],biggen_240829.csv mistral_7b_v0_1,BIGGEN,3.006,[],biggen_240829.csv qwen1_5_4b_chat,BIGGEN,2.976,[],biggen_240829.csv olmo_7b_instruct,BIGGEN,2.974,[],biggen_240829.csv gemma_2b_it,BIGGEN,2.932,[],biggen_240829.csv qwen1_5_7b,BIGGEN,2.872,[],biggen_240829.csv phi_2,BIGGEN,2.859,[],biggen_240829.csv olmo_7b_sft,BIGGEN,2.827,[],biggen_240829.csv codellama_70b_instruct,BIGGEN,2.805,[],biggen_240829.csv llemma_34b,BIGGEN,2.771,[],biggen_240829.csv llama3_8b,BIGGEN,2.743,[],biggen_240829.csv qwen1_5_1_8b_chat,BIGGEN,2.741,[],biggen_240829.csv qwen1_5_4b,BIGGEN,2.708,[],biggen_240829.csv llama_2_13b,BIGGEN,2.703,[],biggen_240829.csv yi_6b,BIGGEN,2.635,[],biggen_240829.csv codellama_70b,BIGGEN,2.593,[],biggen_240829.csv codellama34b,BIGGEN,2.509,[],biggen_240829.csv phi_1_5,BIGGEN,2.497,[],biggen_240829.csv orca_2_13b,BIGGEN,2.489,[],biggen_240829.csv llama_2_7b,BIGGEN,2.457,[],biggen_240829.csv qwen1_5_1_8b,BIGGEN,2.364,[],biggen_240829.csv llemma_7b,BIGGEN,2.27,[],biggen_240829.csv gemma_2b,BIGGEN,2.262,[],biggen_240829.csv codellama_13b,BIGGEN,2.134,[],biggen_240829.csv qwen1_5_0_5b_chat,BIGGEN,2.108,[],biggen_240829.csv orca_2_7b,BIGGEN,2.083,[],biggen_240829.csv olmo_7b,BIGGEN,2.081,[],biggen_240829.csv codellama_7b,BIGGEN,1.954,[],biggen_240829.csv qwen1_5_0_5b,BIGGEN,1.834,[],biggen_240829.csv olmo_1b,BIGGEN,1.648,[],biggen_240829.csv aya_101,BIGGEN,1.447,[],biggen_240829.csv gemma_7b,BIGGEN,1.411,[],biggen_240829.csv phi_1,BIGGEN,1.135,[],biggen_240829.csv gpt_4_1106_preview,BIGGEN Grounding,4.288,[],biggen_240829.csv gpt_4_0125_preview,BIGGEN Grounding,4.3,[],biggen_240829.csv gpt_4o_2024_05_13,BIGGEN Grounding,4.238,[],biggen_240829.csv gpt_4_turbo_2024_04_09,BIGGEN Grounding,4.312,[],biggen_240829.csv claude_3_opus_20240229,BIGGEN Grounding,4.288,[],biggen_240829.csv llama3_70b_instruct,BIGGEN Grounding,4.125,[],biggen_240829.csv claude_3_sonnet_20240229,BIGGEN Grounding,4.25,[],biggen_240829.csv qwen_110b_chat,BIGGEN Grounding,4.15,[],biggen_240829.csv claude_3_haiku_20240307,BIGGEN Grounding,4.138,[],biggen_240829.csv gemini_pro_1_5,BIGGEN Grounding,4.05,[],biggen_240829.csv mixtral_8x22b_instruct_v0_1_awq,BIGGEN Grounding,4.012,[],biggen_240829.csv mistral_medium,BIGGEN Grounding,3.962,[],biggen_240829.csv mistral_large,BIGGEN Grounding,4.025,[],biggen_240829.csv gemini_flash_1_5,BIGGEN Grounding,4.138,[],biggen_240829.csv c4ai_command_r_plus_gptq,BIGGEN Grounding,3.988,[],biggen_240829.csv qwen1_5_72b_chat,BIGGEN Grounding,3.888,[],biggen_240829.csv phi_3_mini_4k_instruct,BIGGEN Grounding,3.725,[],biggen_240829.csv qwen1_5_32b_chat,BIGGEN Grounding,3.788,[],biggen_240829.csv starling_lm_7b_beta,BIGGEN 
Grounding,3.8,[],biggen_240829.csv llama3_8b_instruct,BIGGEN Grounding,4.125,[],biggen_240829.csv nous_hermes_2_mixtral_8x7b_dpo,BIGGEN Grounding,3.812,[],biggen_240829.csv yi_34b_chat,BIGGEN Grounding,3.738,[],biggen_240829.csv mixtral_8x7b_instruct_v0_1,BIGGEN Grounding,3.9,[],biggen_240829.csv gpt_3_5_turbo_0125,BIGGEN Grounding,3.925,[],biggen_240829.csv tulu_2_dpo_70b,BIGGEN Grounding,3.7,[],biggen_240829.csv phi_3_mini_128k_instruct,BIGGEN Grounding,3.712,[],biggen_240829.csv gpt_3_5_turbo_1106,BIGGEN Grounding,4.025,[],biggen_240829.csv c4ai_command_r_v0_1,BIGGEN Grounding,3.812,[],biggen_240829.csv solar_10_7b_instruct_v1_0,BIGGEN Grounding,3.812,[],biggen_240829.csv llama_2_70b_chat,BIGGEN Grounding,3.662,[],biggen_240829.csv gemini_1_0_pro,BIGGEN Grounding,3.6,[],biggen_240829.csv mistral_7b_instruct_v0_2,BIGGEN Grounding,3.7,[],biggen_240829.csv mixtral_8x22b_v0_1_awq,BIGGEN Grounding,3.688,[],biggen_240829.csv nous_hermes_2_mixtral_8x7b_sft,BIGGEN Grounding,3.65,[],biggen_240829.csv openchat_3_5_0106,BIGGEN Grounding,3.638,[],biggen_240829.csv zephyr_orpo_141b_a35b_v0_1_awq,BIGGEN Grounding,3.55,[],biggen_240829.csv qwen1_5_14b_chat,BIGGEN Grounding,3.625,[],biggen_240829.csv qwen1_5_7b_chat,BIGGEN Grounding,3.588,[],biggen_240829.csv starling_lm_7b_alpha,BIGGEN Grounding,3.712,[],biggen_240829.csv zephyr_7b_beta,BIGGEN Grounding,3.55,[],biggen_240829.csv nous_hermes_2_mistral_7b_dpo,BIGGEN Grounding,3.662,[],biggen_240829.csv nous_hermes_2_yi_34b,BIGGEN Grounding,3.338,[],biggen_240829.csv mistral_orpo_beta,BIGGEN Grounding,3.612,[],biggen_240829.csv llama_2_13b_chat,BIGGEN Grounding,3.662,[],biggen_240829.csv openhermes_2_5_mistral_7b,BIGGEN Grounding,3.688,[],biggen_240829.csv mixtral_8x7b_v0_1,BIGGEN Grounding,3.712,[],biggen_240829.csv mistral_orpo_alpha,BIGGEN Grounding,3.525,[],biggen_240829.csv tulu_2_dpo_13b,BIGGEN Grounding,3.45,[],biggen_240829.csv qwen1_5_72b,BIGGEN Grounding,3.488,[],biggen_240829.csv codetulu_2_34b,BIGGEN Grounding,3.45,[],biggen_240829.csv gemma_1_1_7b_it,BIGGEN Grounding,3.588,[],biggen_240829.csv openhermes_2_mistral_7b,BIGGEN Grounding,3.525,[],biggen_240829.csv codellama34b_instruct,BIGGEN Grounding,3.5,[],biggen_240829.csv yi_34b,BIGGEN Grounding,3.512,[],biggen_240829.csv llama_2_70b,BIGGEN Grounding,3.425,[],biggen_240829.csv qwen1_5_32b,BIGGEN Grounding,3.325,[],biggen_240829.csv llama_2_7b_chat,BIGGEN Grounding,3.388,[],biggen_240829.csv tulu_2_dpo_7b,BIGGEN Grounding,3.238,[],biggen_240829.csv codetulu_2_13b,BIGGEN Grounding,3.225,[],biggen_240829.csv solar_10_7b_v1_0,BIGGEN Grounding,3.25,[],biggen_240829.csv tulu_2_13b,BIGGEN Grounding,3.15,[],biggen_240829.csv codellama_13b_instruct,BIGGEN Grounding,3.262,[],biggen_240829.csv yi_6b_chat,BIGGEN Grounding,3.275,[],biggen_240829.csv codellama_7b_instruct,BIGGEN Grounding,3.212,[],biggen_240829.csv gemma_7b_it,BIGGEN Grounding,3.312,[],biggen_240829.csv llama3_70b,BIGGEN Grounding,3.35,[],biggen_240829.csv qwen1_5_14b,BIGGEN Grounding,3.538,[],biggen_240829.csv gemma_1_1_2b_it,BIGGEN Grounding,2.9,[],biggen_240829.csv codetulu_2_7b,BIGGEN Grounding,3.112,[],biggen_240829.csv tulu_2_7b,BIGGEN Grounding,2.862,[],biggen_240829.csv mistral_7b_v0_2,BIGGEN Grounding,3.15,[],biggen_240829.csv mistral_7b_v0_1,BIGGEN Grounding,3.225,[],biggen_240829.csv qwen1_5_4b_chat,BIGGEN Grounding,2.9,[],biggen_240829.csv olmo_7b_instruct,BIGGEN Grounding,3.112,[],biggen_240829.csv gemma_2b_it,BIGGEN Grounding,2.875,[],biggen_240829.csv qwen1_5_7b,BIGGEN Grounding,2.988,[],biggen_240829.csv phi_2,BIGGEN 
Grounding,3.138,[],biggen_240829.csv olmo_7b_sft,BIGGEN Grounding,2.95,[],biggen_240829.csv codellama_70b_instruct,BIGGEN Grounding,2.85,[],biggen_240829.csv llemma_34b,BIGGEN Grounding,2.988,[],biggen_240829.csv llama3_8b,BIGGEN Grounding,3.262,[],biggen_240829.csv qwen1_5_1_8b_chat,BIGGEN Grounding,2.812,[],biggen_240829.csv qwen1_5_4b,BIGGEN Grounding,2.888,[],biggen_240829.csv llama_2_13b,BIGGEN Grounding,2.85,[],biggen_240829.csv yi_6b,BIGGEN Grounding,2.938,[],biggen_240829.csv codellama_70b,BIGGEN Grounding,2.938,[],biggen_240829.csv codellama34b,BIGGEN Grounding,2.812,[],biggen_240829.csv phi_1_5,BIGGEN Grounding,2.475,[],biggen_240829.csv orca_2_13b,BIGGEN Grounding,2.938,[],biggen_240829.csv llama_2_7b,BIGGEN Grounding,2.612,[],biggen_240829.csv qwen1_5_1_8b,BIGGEN Grounding,2.538,[],biggen_240829.csv llemma_7b,BIGGEN Grounding,2.412,[],biggen_240829.csv gemma_2b,BIGGEN Grounding,2.338,[],biggen_240829.csv codellama_13b,BIGGEN Grounding,2.3,[],biggen_240829.csv qwen1_5_0_5b_chat,BIGGEN Grounding,2.2,[],biggen_240829.csv orca_2_7b,BIGGEN Grounding,2.425,[],biggen_240829.csv olmo_7b,BIGGEN Grounding,2.388,[],biggen_240829.csv codellama_7b,BIGGEN Grounding,1.962,[],biggen_240829.csv qwen1_5_0_5b,BIGGEN Grounding,2.025,[],biggen_240829.csv olmo_1b,BIGGEN Grounding,1.762,[],biggen_240829.csv aya_101,BIGGEN Grounding,1.288,[],biggen_240829.csv gemma_7b,BIGGEN Grounding,1.325,[],biggen_240829.csv phi_1,BIGGEN Grounding,1.112,[],biggen_240829.csv gpt_4_1106_preview,BIGGEN Instruction Following,4.23,[],biggen_240829.csv gpt_4_0125_preview,BIGGEN Instruction Following,4.2,[],biggen_240829.csv gpt_4o_2024_05_13,BIGGEN Instruction Following,4.26,[],biggen_240829.csv gpt_4_turbo_2024_04_09,BIGGEN Instruction Following,4.13,[],biggen_240829.csv claude_3_opus_20240229,BIGGEN Instruction Following,4.06,[],biggen_240829.csv llama3_70b_instruct,BIGGEN Instruction Following,4.18,[],biggen_240829.csv claude_3_sonnet_20240229,BIGGEN Instruction Following,3.92,[],biggen_240829.csv qwen_110b_chat,BIGGEN Instruction Following,4.01,[],biggen_240829.csv claude_3_haiku_20240307,BIGGEN Instruction Following,4.01,[],biggen_240829.csv gemini_pro_1_5,BIGGEN Instruction Following,4.04,[],biggen_240829.csv mixtral_8x22b_instruct_v0_1_awq,BIGGEN Instruction Following,4.0,[],biggen_240829.csv mistral_medium,BIGGEN Instruction Following,3.94,[],biggen_240829.csv mistral_large,BIGGEN Instruction Following,3.99,[],biggen_240829.csv gemini_flash_1_5,BIGGEN Instruction Following,3.91,[],biggen_240829.csv c4ai_command_r_plus_gptq,BIGGEN Instruction Following,4.0,[],biggen_240829.csv qwen1_5_72b_chat,BIGGEN Instruction Following,3.99,[],biggen_240829.csv phi_3_mini_4k_instruct,BIGGEN Instruction Following,3.88,[],biggen_240829.csv qwen1_5_32b_chat,BIGGEN Instruction Following,3.85,[],biggen_240829.csv starling_lm_7b_beta,BIGGEN Instruction Following,3.84,[],biggen_240829.csv llama3_8b_instruct,BIGGEN Instruction Following,3.94,[],biggen_240829.csv nous_hermes_2_mixtral_8x7b_dpo,BIGGEN Instruction Following,4.06,[],biggen_240829.csv yi_34b_chat,BIGGEN Instruction Following,3.83,[],biggen_240829.csv mixtral_8x7b_instruct_v0_1,BIGGEN Instruction Following,3.88,[],biggen_240829.csv gpt_3_5_turbo_0125,BIGGEN Instruction Following,3.85,[],biggen_240829.csv tulu_2_dpo_70b,BIGGEN Instruction Following,3.89,[],biggen_240829.csv phi_3_mini_128k_instruct,BIGGEN Instruction Following,3.8,[],biggen_240829.csv gpt_3_5_turbo_1106,BIGGEN Instruction Following,3.79,[],biggen_240829.csv c4ai_command_r_v0_1,BIGGEN Instruction 
Following,3.88,[],biggen_240829.csv solar_10_7b_instruct_v1_0,BIGGEN Instruction Following,3.77,[],biggen_240829.csv llama_2_70b_chat,BIGGEN Instruction Following,3.88,[],biggen_240829.csv gemini_1_0_pro,BIGGEN Instruction Following,3.84,[],biggen_240829.csv mistral_7b_instruct_v0_2,BIGGEN Instruction Following,3.87,[],biggen_240829.csv mixtral_8x22b_v0_1_awq,BIGGEN Instruction Following,3.7,[],biggen_240829.csv nous_hermes_2_mixtral_8x7b_sft,BIGGEN Instruction Following,3.78,[],biggen_240829.csv openchat_3_5_0106,BIGGEN Instruction Following,3.84,[],biggen_240829.csv zephyr_orpo_141b_a35b_v0_1_awq,BIGGEN Instruction Following,3.62,[],biggen_240829.csv qwen1_5_14b_chat,BIGGEN Instruction Following,3.9,[],biggen_240829.csv qwen1_5_7b_chat,BIGGEN Instruction Following,3.88,[],biggen_240829.csv starling_lm_7b_alpha,BIGGEN Instruction Following,3.72,[],biggen_240829.csv zephyr_7b_beta,BIGGEN Instruction Following,3.72,[],biggen_240829.csv nous_hermes_2_mistral_7b_dpo,BIGGEN Instruction Following,3.74,[],biggen_240829.csv nous_hermes_2_yi_34b,BIGGEN Instruction Following,3.65,[],biggen_240829.csv mistral_orpo_beta,BIGGEN Instruction Following,3.8,[],biggen_240829.csv llama_2_13b_chat,BIGGEN Instruction Following,3.92,[],biggen_240829.csv openhermes_2_5_mistral_7b,BIGGEN Instruction Following,3.66,[],biggen_240829.csv mixtral_8x7b_v0_1,BIGGEN Instruction Following,3.58,[],biggen_240829.csv mistral_orpo_alpha,BIGGEN Instruction Following,3.7,[],biggen_240829.csv tulu_2_dpo_13b,BIGGEN Instruction Following,3.77,[],biggen_240829.csv qwen1_5_72b,BIGGEN Instruction Following,3.6,[],biggen_240829.csv codetulu_2_34b,BIGGEN Instruction Following,3.51,[],biggen_240829.csv gemma_1_1_7b_it,BIGGEN Instruction Following,3.53,[],biggen_240829.csv openhermes_2_mistral_7b,BIGGEN Instruction Following,3.66,[],biggen_240829.csv codellama34b_instruct,BIGGEN Instruction Following,3.5,[],biggen_240829.csv yi_34b,BIGGEN Instruction Following,3.54,[],biggen_240829.csv llama_2_70b,BIGGEN Instruction Following,3.56,[],biggen_240829.csv qwen1_5_32b,BIGGEN Instruction Following,3.64,[],biggen_240829.csv llama_2_7b_chat,BIGGEN Instruction Following,3.58,[],biggen_240829.csv tulu_2_dpo_7b,BIGGEN Instruction Following,3.76,[],biggen_240829.csv codetulu_2_13b,BIGGEN Instruction Following,3.5,[],biggen_240829.csv solar_10_7b_v1_0,BIGGEN Instruction Following,3.56,[],biggen_240829.csv tulu_2_13b,BIGGEN Instruction Following,3.38,[],biggen_240829.csv codellama_13b_instruct,BIGGEN Instruction Following,3.34,[],biggen_240829.csv yi_6b_chat,BIGGEN Instruction Following,3.52,[],biggen_240829.csv codellama_7b_instruct,BIGGEN Instruction Following,3.36,[],biggen_240829.csv gemma_7b_it,BIGGEN Instruction Following,3.43,[],biggen_240829.csv llama3_70b,BIGGEN Instruction Following,3.33,[],biggen_240829.csv qwen1_5_14b,BIGGEN Instruction Following,3.41,[],biggen_240829.csv gemma_1_1_2b_it,BIGGEN Instruction Following,3.34,[],biggen_240829.csv codetulu_2_7b,BIGGEN Instruction Following,3.41,[],biggen_240829.csv tulu_2_7b,BIGGEN Instruction Following,3.34,[],biggen_240829.csv mistral_7b_v0_2,BIGGEN Instruction Following,3.33,[],biggen_240829.csv mistral_7b_v0_1,BIGGEN Instruction Following,3.3,[],biggen_240829.csv qwen1_5_4b_chat,BIGGEN Instruction Following,3.19,[],biggen_240829.csv olmo_7b_instruct,BIGGEN Instruction Following,3.54,[],biggen_240829.csv gemma_2b_it,BIGGEN Instruction Following,3.24,[],biggen_240829.csv qwen1_5_7b,BIGGEN Instruction Following,3.14,[],biggen_240829.csv phi_2,BIGGEN Instruction 
Following,2.92,[],biggen_240829.csv olmo_7b_sft,BIGGEN Instruction Following,3.27,[],biggen_240829.csv codellama_70b_instruct,BIGGEN Instruction Following,2.7,[],biggen_240829.csv llemma_34b,BIGGEN Instruction Following,2.97,[],biggen_240829.csv llama3_8b,BIGGEN Instruction Following,2.94,[],biggen_240829.csv qwen1_5_1_8b_chat,BIGGEN Instruction Following,3.27,[],biggen_240829.csv qwen1_5_4b,BIGGEN Instruction Following,2.94,[],biggen_240829.csv llama_2_13b,BIGGEN Instruction Following,3.09,[],biggen_240829.csv yi_6b,BIGGEN Instruction Following,2.97,[],biggen_240829.csv codellama_70b,BIGGEN Instruction Following,2.62,[],biggen_240829.csv codellama34b,BIGGEN Instruction Following,2.66,[],biggen_240829.csv phi_1_5,BIGGEN Instruction Following,2.89,[],biggen_240829.csv orca_2_13b,BIGGEN Instruction Following,2.49,[],biggen_240829.csv llama_2_7b,BIGGEN Instruction Following,2.87,[],biggen_240829.csv qwen1_5_1_8b,BIGGEN Instruction Following,2.85,[],biggen_240829.csv llemma_7b,BIGGEN Instruction Following,2.57,[],biggen_240829.csv gemma_2b,BIGGEN Instruction Following,2.72,[],biggen_240829.csv codellama_13b,BIGGEN Instruction Following,2.3,[],biggen_240829.csv qwen1_5_0_5b_chat,BIGGEN Instruction Following,2.61,[],biggen_240829.csv orca_2_7b,BIGGEN Instruction Following,2.27,[],biggen_240829.csv olmo_7b,BIGGEN Instruction Following,2.26,[],biggen_240829.csv codellama_7b,BIGGEN Instruction Following,2.25,[],biggen_240829.csv qwen1_5_0_5b,BIGGEN Instruction Following,2.12,[],biggen_240829.csv olmo_1b,BIGGEN Instruction Following,1.8,[],biggen_240829.csv aya_101,BIGGEN Instruction Following,1.45,[],biggen_240829.csv gemma_7b,BIGGEN Instruction Following,1.49,[],biggen_240829.csv phi_1,BIGGEN Instruction Following,1.01,[],biggen_240829.csv gpt_4_1106_preview,BIGGEN Planning,4.271,[],biggen_240829.csv gpt_4_0125_preview,BIGGEN Planning,4.357,[],biggen_240829.csv gpt_4o_2024_05_13,BIGGEN Planning,4.357,[],biggen_240829.csv gpt_4_turbo_2024_04_09,BIGGEN Planning,4.3,[],biggen_240829.csv claude_3_opus_20240229,BIGGEN Planning,4.186,[],biggen_240829.csv llama3_70b_instruct,BIGGEN Planning,4.186,[],biggen_240829.csv claude_3_sonnet_20240229,BIGGEN Planning,4.171,[],biggen_240829.csv qwen_110b_chat,BIGGEN Planning,4.229,[],biggen_240829.csv claude_3_haiku_20240307,BIGGEN Planning,4.129,[],biggen_240829.csv gemini_pro_1_5,BIGGEN Planning,4.129,[],biggen_240829.csv mixtral_8x22b_instruct_v0_1_awq,BIGGEN Planning,4.0,[],biggen_240829.csv mistral_medium,BIGGEN Planning,4.029,[],biggen_240829.csv mistral_large,BIGGEN Planning,4.029,[],biggen_240829.csv gemini_flash_1_5,BIGGEN Planning,3.971,[],biggen_240829.csv c4ai_command_r_plus_gptq,BIGGEN Planning,4.186,[],biggen_240829.csv qwen1_5_72b_chat,BIGGEN Planning,4.029,[],biggen_240829.csv phi_3_mini_4k_instruct,BIGGEN Planning,3.8,[],biggen_240829.csv qwen1_5_32b_chat,BIGGEN Planning,4.029,[],biggen_240829.csv starling_lm_7b_beta,BIGGEN Planning,4.0,[],biggen_240829.csv llama3_8b_instruct,BIGGEN Planning,3.929,[],biggen_240829.csv nous_hermes_2_mixtral_8x7b_dpo,BIGGEN Planning,3.957,[],biggen_240829.csv yi_34b_chat,BIGGEN Planning,3.914,[],biggen_240829.csv mixtral_8x7b_instruct_v0_1,BIGGEN Planning,3.6,[],biggen_240829.csv gpt_3_5_turbo_0125,BIGGEN Planning,3.843,[],biggen_240829.csv tulu_2_dpo_70b,BIGGEN Planning,3.9,[],biggen_240829.csv phi_3_mini_128k_instruct,BIGGEN Planning,3.7,[],biggen_240829.csv gpt_3_5_turbo_1106,BIGGEN Planning,3.829,[],biggen_240829.csv c4ai_command_r_v0_1,BIGGEN Planning,3.9,[],biggen_240829.csv solar_10_7b_instruct_v1_0,BIGGEN 
Planning,3.857,[],biggen_240829.csv llama_2_70b_chat,BIGGEN Planning,3.929,[],biggen_240829.csv gemini_1_0_pro,BIGGEN Planning,3.871,[],biggen_240829.csv mistral_7b_instruct_v0_2,BIGGEN Planning,3.8,[],biggen_240829.csv mixtral_8x22b_v0_1_awq,BIGGEN Planning,3.743,[],biggen_240829.csv nous_hermes_2_mixtral_8x7b_sft,BIGGEN Planning,3.714,[],biggen_240829.csv openchat_3_5_0106,BIGGEN Planning,3.757,[],biggen_240829.csv zephyr_orpo_141b_a35b_v0_1_awq,BIGGEN Planning,3.957,[],biggen_240829.csv qwen1_5_14b_chat,BIGGEN Planning,3.857,[],biggen_240829.csv qwen1_5_7b_chat,BIGGEN Planning,3.714,[],biggen_240829.csv starling_lm_7b_alpha,BIGGEN Planning,3.829,[],biggen_240829.csv zephyr_7b_beta,BIGGEN Planning,3.729,[],biggen_240829.csv nous_hermes_2_mistral_7b_dpo,BIGGEN Planning,3.8,[],biggen_240829.csv nous_hermes_2_yi_34b,BIGGEN Planning,3.643,[],biggen_240829.csv mistral_orpo_beta,BIGGEN Planning,3.686,[],biggen_240829.csv llama_2_13b_chat,BIGGEN Planning,3.686,[],biggen_240829.csv openhermes_2_5_mistral_7b,BIGGEN Planning,3.729,[],biggen_240829.csv mixtral_8x7b_v0_1,BIGGEN Planning,3.5,[],biggen_240829.csv mistral_orpo_alpha,BIGGEN Planning,3.6,[],biggen_240829.csv tulu_2_dpo_13b,BIGGEN Planning,3.6,[],biggen_240829.csv qwen1_5_72b,BIGGEN Planning,3.5,[],biggen_240829.csv codetulu_2_34b,BIGGEN Planning,3.686,[],biggen_240829.csv gemma_1_1_7b_it,BIGGEN Planning,3.371,[],biggen_240829.csv openhermes_2_mistral_7b,BIGGEN Planning,3.8,[],biggen_240829.csv codellama34b_instruct,BIGGEN Planning,3.457,[],biggen_240829.csv yi_34b,BIGGEN Planning,3.529,[],biggen_240829.csv llama_2_70b,BIGGEN Planning,3.386,[],biggen_240829.csv qwen1_5_32b,BIGGEN Planning,3.514,[],biggen_240829.csv llama_2_7b_chat,BIGGEN Planning,3.586,[],biggen_240829.csv tulu_2_dpo_7b,BIGGEN Planning,3.5,[],biggen_240829.csv codetulu_2_13b,BIGGEN Planning,3.4,[],biggen_240829.csv solar_10_7b_v1_0,BIGGEN Planning,3.371,[],biggen_240829.csv tulu_2_13b,BIGGEN Planning,3.4,[],biggen_240829.csv codellama_13b_instruct,BIGGEN Planning,3.357,[],biggen_240829.csv yi_6b_chat,BIGGEN Planning,3.414,[],biggen_240829.csv codellama_7b_instruct,BIGGEN Planning,3.286,[],biggen_240829.csv gemma_7b_it,BIGGEN Planning,3.071,[],biggen_240829.csv llama3_70b,BIGGEN Planning,3.114,[],biggen_240829.csv qwen1_5_14b,BIGGEN Planning,3.157,[],biggen_240829.csv gemma_1_1_2b_it,BIGGEN Planning,3.229,[],biggen_240829.csv codetulu_2_7b,BIGGEN Planning,3.114,[],biggen_240829.csv tulu_2_7b,BIGGEN Planning,3.229,[],biggen_240829.csv mistral_7b_v0_2,BIGGEN Planning,3.1,[],biggen_240829.csv mistral_7b_v0_1,BIGGEN Planning,3.243,[],biggen_240829.csv qwen1_5_4b_chat,BIGGEN Planning,3.086,[],biggen_240829.csv olmo_7b_instruct,BIGGEN Planning,3.271,[],biggen_240829.csv gemma_2b_it,BIGGEN Planning,3.114,[],biggen_240829.csv qwen1_5_7b,BIGGEN Planning,3.014,[],biggen_240829.csv phi_2,BIGGEN Planning,2.857,[],biggen_240829.csv olmo_7b_sft,BIGGEN Planning,2.957,[],biggen_240829.csv codellama_70b_instruct,BIGGEN Planning,2.671,[],biggen_240829.csv llemma_34b,BIGGEN Planning,2.743,[],biggen_240829.csv llama3_8b,BIGGEN Planning,2.657,[],biggen_240829.csv qwen1_5_1_8b_chat,BIGGEN Planning,2.914,[],biggen_240829.csv qwen1_5_4b,BIGGEN Planning,2.729,[],biggen_240829.csv llama_2_13b,BIGGEN Planning,2.786,[],biggen_240829.csv yi_6b,BIGGEN Planning,2.657,[],biggen_240829.csv codellama_70b,BIGGEN Planning,2.557,[],biggen_240829.csv codellama34b,BIGGEN Planning,2.486,[],biggen_240829.csv phi_1_5,BIGGEN Planning,2.5,[],biggen_240829.csv orca_2_13b,BIGGEN Planning,1.786,[],biggen_240829.csv 
llama_2_7b,BIGGEN Planning,2.514,[],biggen_240829.csv qwen1_5_1_8b,BIGGEN Planning,2.386,[],biggen_240829.csv llemma_7b,BIGGEN Planning,2.086,[],biggen_240829.csv gemma_2b,BIGGEN Planning,2.357,[],biggen_240829.csv codellama_13b,BIGGEN Planning,1.957,[],biggen_240829.csv qwen1_5_0_5b_chat,BIGGEN Planning,2.057,[],biggen_240829.csv orca_2_7b,BIGGEN Planning,1.371,[],biggen_240829.csv olmo_7b,BIGGEN Planning,1.929,[],biggen_240829.csv codellama_7b,BIGGEN Planning,1.771,[],biggen_240829.csv qwen1_5_0_5b,BIGGEN Planning,1.7,[],biggen_240829.csv olmo_1b,BIGGEN Planning,1.443,[],biggen_240829.csv aya_101,BIGGEN Planning,1.471,[],biggen_240829.csv gemma_7b,BIGGEN Planning,1.186,[],biggen_240829.csv phi_1,BIGGEN Planning,1.0,[],biggen_240829.csv gpt_4_1106_preview,BIGGEN Reasoning,4.22,[],biggen_240829.csv gpt_4_0125_preview,BIGGEN Reasoning,4.16,[],biggen_240829.csv gpt_4o_2024_05_13,BIGGEN Reasoning,4.21,[],biggen_240829.csv gpt_4_turbo_2024_04_09,BIGGEN Reasoning,4.2,[],biggen_240829.csv claude_3_opus_20240229,BIGGEN Reasoning,3.97,[],biggen_240829.csv llama3_70b_instruct,BIGGEN Reasoning,3.87,[],biggen_240829.csv claude_3_sonnet_20240229,BIGGEN Reasoning,3.91,[],biggen_240829.csv qwen_110b_chat,BIGGEN Reasoning,3.94,[],biggen_240829.csv claude_3_haiku_20240307,BIGGEN Reasoning,3.69,[],biggen_240829.csv gemini_pro_1_5,BIGGEN Reasoning,4.06,[],biggen_240829.csv mixtral_8x22b_instruct_v0_1_awq,BIGGEN Reasoning,3.96,[],biggen_240829.csv mistral_medium,BIGGEN Reasoning,3.95,[],biggen_240829.csv mistral_large,BIGGEN Reasoning,3.93,[],biggen_240829.csv gemini_flash_1_5,BIGGEN Reasoning,3.92,[],biggen_240829.csv c4ai_command_r_plus_gptq,BIGGEN Reasoning,3.64,[],biggen_240829.csv qwen1_5_72b_chat,BIGGEN Reasoning,3.68,[],biggen_240829.csv phi_3_mini_4k_instruct,BIGGEN Reasoning,3.81,[],biggen_240829.csv qwen1_5_32b_chat,BIGGEN Reasoning,3.62,[],biggen_240829.csv starling_lm_7b_beta,BIGGEN Reasoning,3.56,[],biggen_240829.csv llama3_8b_instruct,BIGGEN Reasoning,3.47,[],biggen_240829.csv nous_hermes_2_mixtral_8x7b_dpo,BIGGEN Reasoning,3.53,[],biggen_240829.csv yi_34b_chat,BIGGEN Reasoning,3.57,[],biggen_240829.csv mixtral_8x7b_instruct_v0_1,BIGGEN Reasoning,3.71,[],biggen_240829.csv gpt_3_5_turbo_0125,BIGGEN Reasoning,3.65,[],biggen_240829.csv tulu_2_dpo_70b,BIGGEN Reasoning,3.36,[],biggen_240829.csv phi_3_mini_128k_instruct,BIGGEN Reasoning,3.82,[],biggen_240829.csv gpt_3_5_turbo_1106,BIGGEN Reasoning,3.51,[],biggen_240829.csv c4ai_command_r_v0_1,BIGGEN Reasoning,3.39,[],biggen_240829.csv solar_10_7b_instruct_v1_0,BIGGEN Reasoning,3.42,[],biggen_240829.csv llama_2_70b_chat,BIGGEN Reasoning,3.22,[],biggen_240829.csv gemini_1_0_pro,BIGGEN Reasoning,3.62,[],biggen_240829.csv mistral_7b_instruct_v0_2,BIGGEN Reasoning,3.18,[],biggen_240829.csv mixtral_8x22b_v0_1_awq,BIGGEN Reasoning,3.5,[],biggen_240829.csv nous_hermes_2_mixtral_8x7b_sft,BIGGEN Reasoning,3.39,[],biggen_240829.csv openchat_3_5_0106,BIGGEN Reasoning,3.34,[],biggen_240829.csv zephyr_orpo_141b_a35b_v0_1_awq,BIGGEN Reasoning,3.52,[],biggen_240829.csv qwen1_5_14b_chat,BIGGEN Reasoning,3.36,[],biggen_240829.csv qwen1_5_7b_chat,BIGGEN Reasoning,3.3,[],biggen_240829.csv starling_lm_7b_alpha,BIGGEN Reasoning,3.33,[],biggen_240829.csv zephyr_7b_beta,BIGGEN Reasoning,3.23,[],biggen_240829.csv nous_hermes_2_mistral_7b_dpo,BIGGEN Reasoning,3.26,[],biggen_240829.csv nous_hermes_2_yi_34b,BIGGEN Reasoning,3.53,[],biggen_240829.csv mistral_orpo_beta,BIGGEN Reasoning,3.12,[],biggen_240829.csv llama_2_13b_chat,BIGGEN Reasoning,2.76,[],biggen_240829.csv 
openhermes_2_5_mistral_7b,BIGGEN Reasoning,3.28,[],biggen_240829.csv mixtral_8x7b_v0_1,BIGGEN Reasoning,3.3,[],biggen_240829.csv mistral_orpo_alpha,BIGGEN Reasoning,3.11,[],biggen_240829.csv tulu_2_dpo_13b,BIGGEN Reasoning,2.9,[],biggen_240829.csv qwen1_5_72b,BIGGEN Reasoning,3.25,[],biggen_240829.csv codetulu_2_34b,BIGGEN Reasoning,3.01,[],biggen_240829.csv gemma_1_1_7b_it,BIGGEN Reasoning,3.25,[],biggen_240829.csv openhermes_2_mistral_7b,BIGGEN Reasoning,3.28,[],biggen_240829.csv codellama34b_instruct,BIGGEN Reasoning,3.04,[],biggen_240829.csv yi_34b,BIGGEN Reasoning,3.27,[],biggen_240829.csv llama_2_70b,BIGGEN Reasoning,3.06,[],biggen_240829.csv qwen1_5_32b,BIGGEN Reasoning,3.31,[],biggen_240829.csv llama_2_7b_chat,BIGGEN Reasoning,2.85,[],biggen_240829.csv tulu_2_dpo_7b,BIGGEN Reasoning,2.79,[],biggen_240829.csv codetulu_2_13b,BIGGEN Reasoning,2.8,[],biggen_240829.csv solar_10_7b_v1_0,BIGGEN Reasoning,2.96,[],biggen_240829.csv tulu_2_13b,BIGGEN Reasoning,2.8,[],biggen_240829.csv codellama_13b_instruct,BIGGEN Reasoning,2.77,[],biggen_240829.csv yi_6b_chat,BIGGEN Reasoning,2.85,[],biggen_240829.csv codellama_7b_instruct,BIGGEN Reasoning,2.75,[],biggen_240829.csv gemma_7b_it,BIGGEN Reasoning,2.97,[],biggen_240829.csv llama3_70b,BIGGEN Reasoning,3.04,[],biggen_240829.csv qwen1_5_14b,BIGGEN Reasoning,3.0,[],biggen_240829.csv gemma_1_1_2b_it,BIGGEN Reasoning,2.74,[],biggen_240829.csv codetulu_2_7b,BIGGEN Reasoning,2.73,[],biggen_240829.csv tulu_2_7b,BIGGEN Reasoning,2.81,[],biggen_240829.csv mistral_7b_v0_2,BIGGEN Reasoning,2.78,[],biggen_240829.csv mistral_7b_v0_1,BIGGEN Reasoning,2.86,[],biggen_240829.csv qwen1_5_4b_chat,BIGGEN Reasoning,2.83,[],biggen_240829.csv olmo_7b_instruct,BIGGEN Reasoning,2.47,[],biggen_240829.csv gemma_2b_it,BIGGEN Reasoning,2.48,[],biggen_240829.csv qwen1_5_7b,BIGGEN Reasoning,2.65,[],biggen_240829.csv phi_2,BIGGEN Reasoning,2.8,[],biggen_240829.csv olmo_7b_sft,BIGGEN Reasoning,2.4,[],biggen_240829.csv codellama_70b_instruct,BIGGEN Reasoning,2.83,[],biggen_240829.csv llemma_34b,BIGGEN Reasoning,2.75,[],biggen_240829.csv llama3_8b,BIGGEN Reasoning,2.39,[],biggen_240829.csv qwen1_5_1_8b_chat,BIGGEN Reasoning,2.28,[],biggen_240829.csv qwen1_5_4b,BIGGEN Reasoning,2.45,[],biggen_240829.csv llama_2_13b,BIGGEN Reasoning,2.28,[],biggen_240829.csv yi_6b,BIGGEN Reasoning,2.36,[],biggen_240829.csv codellama_70b,BIGGEN Reasoning,2.44,[],biggen_240829.csv codellama34b,BIGGEN Reasoning,2.17,[],biggen_240829.csv phi_1_5,BIGGEN Reasoning,2.24,[],biggen_240829.csv orca_2_13b,BIGGEN Reasoning,2.24,[],biggen_240829.csv llama_2_7b,BIGGEN Reasoning,2.18,[],biggen_240829.csv qwen1_5_1_8b,BIGGEN Reasoning,1.98,[],biggen_240829.csv llemma_7b,BIGGEN Reasoning,2.24,[],biggen_240829.csv gemma_2b,BIGGEN Reasoning,2.16,[],biggen_240829.csv codellama_13b,BIGGEN Reasoning,2.01,[],biggen_240829.csv qwen1_5_0_5b_chat,BIGGEN Reasoning,1.76,[],biggen_240829.csv orca_2_7b,BIGGEN Reasoning,1.85,[],biggen_240829.csv olmo_7b,BIGGEN Reasoning,1.84,[],biggen_240829.csv codellama_7b,BIGGEN Reasoning,1.72,[],biggen_240829.csv qwen1_5_0_5b,BIGGEN Reasoning,1.58,[],biggen_240829.csv olmo_1b,BIGGEN Reasoning,1.33,[],biggen_240829.csv aya_101,BIGGEN Reasoning,1.25,[],biggen_240829.csv gemma_7b,BIGGEN Reasoning,1.34,[],biggen_240829.csv phi_1,BIGGEN Reasoning,1.0,[],biggen_240829.csv gpt_4_1106_preview,BIGGEN Refinement,4.171,[],biggen_240829.csv gpt_4_0125_preview,BIGGEN Refinement,4.145,[],biggen_240829.csv gpt_4o_2024_05_13,BIGGEN Refinement,4.079,[],biggen_240829.csv gpt_4_turbo_2024_04_09,BIGGEN 
Refinement,4.105,[],biggen_240829.csv claude_3_opus_20240229,BIGGEN Refinement,3.908,[],biggen_240829.csv llama3_70b_instruct,BIGGEN Refinement,3.907,[],biggen_240829.csv claude_3_sonnet_20240229,BIGGEN Refinement,3.724,[],biggen_240829.csv qwen_110b_chat,BIGGEN Refinement,3.882,[],biggen_240829.csv claude_3_haiku_20240307,BIGGEN Refinement,3.632,[],biggen_240829.csv gemini_pro_1_5,BIGGEN Refinement,3.671,[],biggen_240829.csv mixtral_8x22b_instruct_v0_1_awq,BIGGEN Refinement,3.842,[],biggen_240829.csv mistral_medium,BIGGEN Refinement,3.776,[],biggen_240829.csv mistral_large,BIGGEN Refinement,3.776,[],biggen_240829.csv gemini_flash_1_5,BIGGEN Refinement,3.453,[],biggen_240829.csv c4ai_command_r_plus_gptq,BIGGEN Refinement,3.461,[],biggen_240829.csv qwen1_5_72b_chat,BIGGEN Refinement,3.632,[],biggen_240829.csv phi_3_mini_4k_instruct,BIGGEN Refinement,3.974,[],biggen_240829.csv qwen1_5_32b_chat,BIGGEN Refinement,3.395,[],biggen_240829.csv starling_lm_7b_beta,BIGGEN Refinement,3.547,[],biggen_240829.csv llama3_8b_instruct,BIGGEN Refinement,3.507,[],biggen_240829.csv nous_hermes_2_mixtral_8x7b_dpo,BIGGEN Refinement,3.342,[],biggen_240829.csv yi_34b_chat,BIGGEN Refinement,3.676,[],biggen_240829.csv mixtral_8x7b_instruct_v0_1,BIGGEN Refinement,3.434,[],biggen_240829.csv gpt_3_5_turbo_0125,BIGGEN Refinement,3.434,[],biggen_240829.csv tulu_2_dpo_70b,BIGGEN Refinement,3.421,[],biggen_240829.csv phi_3_mini_128k_instruct,BIGGEN Refinement,3.513,[],biggen_240829.csv gpt_3_5_turbo_1106,BIGGEN Refinement,3.434,[],biggen_240829.csv c4ai_command_r_v0_1,BIGGEN Refinement,3.447,[],biggen_240829.csv solar_10_7b_instruct_v1_0,BIGGEN Refinement,3.382,[],biggen_240829.csv llama_2_70b_chat,BIGGEN Refinement,3.36,[],biggen_240829.csv gemini_1_0_pro,BIGGEN Refinement,3.373,[],biggen_240829.csv mistral_7b_instruct_v0_2,BIGGEN Refinement,3.447,[],biggen_240829.csv mixtral_8x22b_v0_1_awq,BIGGEN Refinement,3.539,[],biggen_240829.csv nous_hermes_2_mixtral_8x7b_sft,BIGGEN Refinement,3.461,[],biggen_240829.csv openchat_3_5_0106,BIGGEN Refinement,3.566,[],biggen_240829.csv zephyr_orpo_141b_a35b_v0_1_awq,BIGGEN Refinement,3.618,[],biggen_240829.csv qwen1_5_14b_chat,BIGGEN Refinement,3.263,[],biggen_240829.csv qwen1_5_7b_chat,BIGGEN Refinement,3.395,[],biggen_240829.csv starling_lm_7b_alpha,BIGGEN Refinement,3.224,[],biggen_240829.csv zephyr_7b_beta,BIGGEN Refinement,3.382,[],biggen_240829.csv nous_hermes_2_mistral_7b_dpo,BIGGEN Refinement,3.355,[],biggen_240829.csv nous_hermes_2_yi_34b,BIGGEN Refinement,3.373,[],biggen_240829.csv mistral_orpo_beta,BIGGEN Refinement,3.263,[],biggen_240829.csv llama_2_13b_chat,BIGGEN Refinement,3.079,[],biggen_240829.csv openhermes_2_5_mistral_7b,BIGGEN Refinement,3.276,[],biggen_240829.csv mixtral_8x7b_v0_1,BIGGEN Refinement,3.237,[],biggen_240829.csv mistral_orpo_alpha,BIGGEN Refinement,3.171,[],biggen_240829.csv tulu_2_dpo_13b,BIGGEN Refinement,3.184,[],biggen_240829.csv qwen1_5_72b,BIGGEN Refinement,3.227,[],biggen_240829.csv codetulu_2_34b,BIGGEN Refinement,3.211,[],biggen_240829.csv gemma_1_1_7b_it,BIGGEN Refinement,3.25,[],biggen_240829.csv openhermes_2_mistral_7b,BIGGEN Refinement,3.28,[],biggen_240829.csv codellama34b_instruct,BIGGEN Refinement,3.079,[],biggen_240829.csv yi_34b,BIGGEN Refinement,3.24,[],biggen_240829.csv llama_2_70b,BIGGEN Refinement,3.133,[],biggen_240829.csv qwen1_5_32b,BIGGEN Refinement,3.118,[],biggen_240829.csv llama_2_7b_chat,BIGGEN Refinement,2.961,[],biggen_240829.csv tulu_2_dpo_7b,BIGGEN Refinement,3.079,[],biggen_240829.csv codetulu_2_13b,BIGGEN 
Refinement,3.197,[],biggen_240829.csv solar_10_7b_v1_0,BIGGEN Refinement,3.197,[],biggen_240829.csv tulu_2_13b,BIGGEN Refinement,3.027,[],biggen_240829.csv codellama_13b_instruct,BIGGEN Refinement,2.895,[],biggen_240829.csv yi_6b_chat,BIGGEN Refinement,3.08,[],biggen_240829.csv codellama_7b_instruct,BIGGEN Refinement,2.961,[],biggen_240829.csv gemma_7b_it,BIGGEN Refinement,3.026,[],biggen_240829.csv llama3_70b,BIGGEN Refinement,3.342,[],biggen_240829.csv qwen1_5_14b,BIGGEN Refinement,3.092,[],biggen_240829.csv gemma_1_1_2b_it,BIGGEN Refinement,3.053,[],biggen_240829.csv codetulu_2_7b,BIGGEN Refinement,2.908,[],biggen_240829.csv tulu_2_7b,BIGGEN Refinement,2.974,[],biggen_240829.csv mistral_7b_v0_2,BIGGEN Refinement,2.892,[],biggen_240829.csv mistral_7b_v0_1,BIGGEN Refinement,2.763,[],biggen_240829.csv qwen1_5_4b_chat,BIGGEN Refinement,3.0,[],biggen_240829.csv olmo_7b_instruct,BIGGEN Refinement,2.776,[],biggen_240829.csv gemma_2b_it,BIGGEN Refinement,2.882,[],biggen_240829.csv qwen1_5_7b,BIGGEN Refinement,2.827,[],biggen_240829.csv phi_2,BIGGEN Refinement,2.763,[],biggen_240829.csv olmo_7b_sft,BIGGEN Refinement,2.684,[],biggen_240829.csv codellama_70b_instruct,BIGGEN Refinement,2.747,[],biggen_240829.csv llemma_34b,BIGGEN Refinement,2.816,[],biggen_240829.csv llama3_8b,BIGGEN Refinement,3.039,[],biggen_240829.csv qwen1_5_1_8b_chat,BIGGEN Refinement,2.855,[],biggen_240829.csv qwen1_5_4b,BIGGEN Refinement,2.697,[],biggen_240829.csv llama_2_13b,BIGGEN Refinement,2.579,[],biggen_240829.csv yi_6b,BIGGEN Refinement,2.487,[],biggen_240829.csv codellama_70b,BIGGEN Refinement,2.507,[],biggen_240829.csv codellama34b,BIGGEN Refinement,2.566,[],biggen_240829.csv phi_1_5,BIGGEN Refinement,2.526,[],biggen_240829.csv orca_2_13b,BIGGEN Refinement,2.487,[],biggen_240829.csv llama_2_7b,BIGGEN Refinement,2.211,[],biggen_240829.csv qwen1_5_1_8b,BIGGEN Refinement,2.605,[],biggen_240829.csv llemma_7b,BIGGEN Refinement,2.303,[],biggen_240829.csv gemma_2b,BIGGEN Refinement,2.093,[],biggen_240829.csv codellama_13b,BIGGEN Refinement,2.092,[],biggen_240829.csv qwen1_5_0_5b_chat,BIGGEN Refinement,2.0,[],biggen_240829.csv orca_2_7b,BIGGEN Refinement,2.316,[],biggen_240829.csv olmo_7b,BIGGEN Refinement,2.105,[],biggen_240829.csv codellama_7b,BIGGEN Refinement,2.118,[],biggen_240829.csv qwen1_5_0_5b,BIGGEN Refinement,2.158,[],biggen_240829.csv olmo_1b,BIGGEN Refinement,1.947,[],biggen_240829.csv aya_101,BIGGEN Refinement,1.908,[],biggen_240829.csv gemma_7b,BIGGEN Refinement,1.579,[],biggen_240829.csv phi_1,BIGGEN Refinement,1.434,[],biggen_240829.csv gpt_4_1106_preview,BIGGEN Safety,4.565,[],biggen_240829.csv gpt_4_0125_preview,BIGGEN Safety,4.174,[],biggen_240829.csv gpt_4o_2024_05_13,BIGGEN Safety,4.058,[],biggen_240829.csv gpt_4_turbo_2024_04_09,BIGGEN Safety,4.087,[],biggen_240829.csv claude_3_opus_20240229,BIGGEN Safety,4.536,[],biggen_240829.csv llama3_70b_instruct,BIGGEN Safety,4.014,[],biggen_240829.csv claude_3_sonnet_20240229,BIGGEN Safety,4.362,[],biggen_240829.csv qwen_110b_chat,BIGGEN Safety,4.043,[],biggen_240829.csv claude_3_haiku_20240307,BIGGEN Safety,4.304,[],biggen_240829.csv gemini_pro_1_5,BIGGEN Safety,4.116,[],biggen_240829.csv mixtral_8x22b_instruct_v0_1_awq,BIGGEN Safety,4.087,[],biggen_240829.csv mistral_medium,BIGGEN Safety,4.058,[],biggen_240829.csv mistral_large,BIGGEN Safety,3.913,[],biggen_240829.csv gemini_flash_1_5,BIGGEN Safety,4.217,[],biggen_240829.csv c4ai_command_r_plus_gptq,BIGGEN Safety,3.971,[],biggen_240829.csv qwen1_5_72b_chat,BIGGEN Safety,3.957,[],biggen_240829.csv 
phi_3_mini_4k_instruct,BIGGEN Safety,4.145,[],biggen_240829.csv qwen1_5_32b_chat,BIGGEN Safety,4.217,[],biggen_240829.csv starling_lm_7b_beta,BIGGEN Safety,3.87,[],biggen_240829.csv llama3_8b_instruct,BIGGEN Safety,3.725,[],biggen_240829.csv nous_hermes_2_mixtral_8x7b_dpo,BIGGEN Safety,3.739,[],biggen_240829.csv yi_34b_chat,BIGGEN Safety,3.884,[],biggen_240829.csv mixtral_8x7b_instruct_v0_1,BIGGEN Safety,3.812,[],biggen_240829.csv gpt_3_5_turbo_0125,BIGGEN Safety,3.884,[],biggen_240829.csv tulu_2_dpo_70b,BIGGEN Safety,3.754,[],biggen_240829.csv phi_3_mini_128k_instruct,BIGGEN Safety,3.957,[],biggen_240829.csv gpt_3_5_turbo_1106,BIGGEN Safety,4.0,[],biggen_240829.csv c4ai_command_r_v0_1,BIGGEN Safety,3.899,[],biggen_240829.csv solar_10_7b_instruct_v1_0,BIGGEN Safety,3.826,[],biggen_240829.csv llama_2_70b_chat,BIGGEN Safety,4.377,[],biggen_240829.csv gemini_1_0_pro,BIGGEN Safety,3.942,[],biggen_240829.csv mistral_7b_instruct_v0_2,BIGGEN Safety,3.826,[],biggen_240829.csv mixtral_8x22b_v0_1_awq,BIGGEN Safety,4.0,[],biggen_240829.csv nous_hermes_2_mixtral_8x7b_sft,BIGGEN Safety,3.609,[],biggen_240829.csv openchat_3_5_0106,BIGGEN Safety,3.725,[],biggen_240829.csv zephyr_orpo_141b_a35b_v0_1_awq,BIGGEN Safety,3.449,[],biggen_240829.csv qwen1_5_14b_chat,BIGGEN Safety,3.855,[],biggen_240829.csv qwen1_5_7b_chat,BIGGEN Safety,3.725,[],biggen_240829.csv starling_lm_7b_alpha,BIGGEN Safety,3.913,[],biggen_240829.csv zephyr_7b_beta,BIGGEN Safety,3.551,[],biggen_240829.csv nous_hermes_2_mistral_7b_dpo,BIGGEN Safety,3.377,[],biggen_240829.csv nous_hermes_2_yi_34b,BIGGEN Safety,3.536,[],biggen_240829.csv mistral_orpo_beta,BIGGEN Safety,3.696,[],biggen_240829.csv llama_2_13b_chat,BIGGEN Safety,4.319,[],biggen_240829.csv openhermes_2_5_mistral_7b,BIGGEN Safety,3.435,[],biggen_240829.csv mixtral_8x7b_v0_1,BIGGEN Safety,3.87,[],biggen_240829.csv mistral_orpo_alpha,BIGGEN Safety,3.971,[],biggen_240829.csv tulu_2_dpo_13b,BIGGEN Safety,3.841,[],biggen_240829.csv qwen1_5_72b,BIGGEN Safety,3.942,[],biggen_240829.csv codetulu_2_34b,BIGGEN Safety,3.652,[],biggen_240829.csv gemma_1_1_7b_it,BIGGEN Safety,4.043,[],biggen_240829.csv openhermes_2_mistral_7b,BIGGEN Safety,3.232,[],biggen_240829.csv codellama34b_instruct,BIGGEN Safety,4.13,[],biggen_240829.csv yi_34b,BIGGEN Safety,3.58,[],biggen_240829.csv llama_2_70b,BIGGEN Safety,3.87,[],biggen_240829.csv qwen1_5_32b,BIGGEN Safety,3.333,[],biggen_240829.csv llama_2_7b_chat,BIGGEN Safety,4.145,[],biggen_240829.csv tulu_2_dpo_7b,BIGGEN Safety,3.754,[],biggen_240829.csv codetulu_2_13b,BIGGEN Safety,3.29,[],biggen_240829.csv solar_10_7b_v1_0,BIGGEN Safety,3.667,[],biggen_240829.csv tulu_2_13b,BIGGEN Safety,3.768,[],biggen_240829.csv codellama_13b_instruct,BIGGEN Safety,4.043,[],biggen_240829.csv yi_6b_chat,BIGGEN Safety,3.478,[],biggen_240829.csv codellama_7b_instruct,BIGGEN Safety,3.754,[],biggen_240829.csv gemma_7b_it,BIGGEN Safety,3.768,[],biggen_240829.csv llama3_70b,BIGGEN Safety,3.261,[],biggen_240829.csv qwen1_5_14b,BIGGEN Safety,2.58,[],biggen_240829.csv gemma_1_1_2b_it,BIGGEN Safety,3.971,[],biggen_240829.csv codetulu_2_7b,BIGGEN Safety,3.246,[],biggen_240829.csv tulu_2_7b,BIGGEN Safety,3.638,[],biggen_240829.csv mistral_7b_v0_2,BIGGEN Safety,3.377,[],biggen_240829.csv mistral_7b_v0_1,BIGGEN Safety,3.406,[],biggen_240829.csv qwen1_5_4b_chat,BIGGEN Safety,3.333,[],biggen_240829.csv olmo_7b_instruct,BIGGEN Safety,3.101,[],biggen_240829.csv gemma_2b_it,BIGGEN Safety,3.754,[],biggen_240829.csv qwen1_5_7b,BIGGEN Safety,3.101,[],biggen_240829.csv phi_2,BIGGEN 
Safety,3.406,[],biggen_240829.csv olmo_7b_sft,BIGGEN Safety,3.333,[],biggen_240829.csv codellama_70b_instruct,BIGGEN Safety,4.101,[],biggen_240829.csv llemma_34b,BIGGEN Safety,2.971,[],biggen_240829.csv llama3_8b,BIGGEN Safety,2.899,[],biggen_240829.csv qwen1_5_1_8b_chat,BIGGEN Safety,2.681,[],biggen_240829.csv qwen1_5_4b,BIGGEN Safety,3.333,[],biggen_240829.csv llama_2_13b,BIGGEN Safety,3.348,[],biggen_240829.csv yi_6b,BIGGEN Safety,3.232,[],biggen_240829.csv codellama_70b,BIGGEN Safety,2.841,[],biggen_240829.csv codellama34b,BIGGEN Safety,2.725,[],biggen_240829.csv phi_1_5,BIGGEN Safety,2.87,[],biggen_240829.csv orca_2_13b,BIGGEN Safety,2.812,[],biggen_240829.csv llama_2_7b,BIGGEN Safety,3.217,[],biggen_240829.csv qwen1_5_1_8b,BIGGEN Safety,2.478,[],biggen_240829.csv llemma_7b,BIGGEN Safety,2.522,[],biggen_240829.csv gemma_2b,BIGGEN Safety,2.623,[],biggen_240829.csv codellama_13b,BIGGEN Safety,2.449,[],biggen_240829.csv qwen1_5_0_5b_chat,BIGGEN Safety,2.391,[],biggen_240829.csv orca_2_7b,BIGGEN Safety,2.594,[],biggen_240829.csv olmo_7b,BIGGEN Safety,2.652,[],biggen_240829.csv codellama_7b,BIGGEN Safety,2.348,[],biggen_240829.csv qwen1_5_0_5b,BIGGEN Safety,2.014,[],biggen_240829.csv olmo_1b,BIGGEN Safety,2.188,[],biggen_240829.csv aya_101,BIGGEN Safety,1.667,[],biggen_240829.csv gemma_7b,BIGGEN Safety,2.159,[],biggen_240829.csv phi_1,BIGGEN Safety,1.507,[],biggen_240829.csv gpt_4_1106_preview,BIGGEN Theory of Mind,4.24,[],biggen_240829.csv gpt_4_0125_preview,BIGGEN Theory of Mind,4.26,[],biggen_240829.csv gpt_4o_2024_05_13,BIGGEN Theory of Mind,4.08,[],biggen_240829.csv gpt_4_turbo_2024_04_09,BIGGEN Theory of Mind,4.12,[],biggen_240829.csv claude_3_opus_20240229,BIGGEN Theory of Mind,4.09,[],biggen_240829.csv llama3_70b_instruct,BIGGEN Theory of Mind,4.04,[],biggen_240829.csv claude_3_sonnet_20240229,BIGGEN Theory of Mind,4.0,[],biggen_240829.csv qwen_110b_chat,BIGGEN Theory of Mind,3.99,[],biggen_240829.csv claude_3_haiku_20240307,BIGGEN Theory of Mind,3.98,[],biggen_240829.csv gemini_pro_1_5,BIGGEN Theory of Mind,4.07,[],biggen_240829.csv mixtral_8x22b_instruct_v0_1_awq,BIGGEN Theory of Mind,3.87,[],biggen_240829.csv mistral_medium,BIGGEN Theory of Mind,3.9,[],biggen_240829.csv mistral_large,BIGGEN Theory of Mind,3.93,[],biggen_240829.csv gemini_flash_1_5,BIGGEN Theory of Mind,3.96,[],biggen_240829.csv c4ai_command_r_plus_gptq,BIGGEN Theory of Mind,3.94,[],biggen_240829.csv qwen1_5_72b_chat,BIGGEN Theory of Mind,3.96,[],biggen_240829.csv phi_3_mini_4k_instruct,BIGGEN Theory of Mind,3.9,[],biggen_240829.csv qwen1_5_32b_chat,BIGGEN Theory of Mind,3.87,[],biggen_240829.csv starling_lm_7b_beta,BIGGEN Theory of Mind,3.87,[],biggen_240829.csv llama3_8b_instruct,BIGGEN Theory of Mind,3.83,[],biggen_240829.csv nous_hermes_2_mixtral_8x7b_dpo,BIGGEN Theory of Mind,3.79,[],biggen_240829.csv yi_34b_chat,BIGGEN Theory of Mind,3.96,[],biggen_240829.csv mixtral_8x7b_instruct_v0_1,BIGGEN Theory of Mind,3.81,[],biggen_240829.csv gpt_3_5_turbo_0125,BIGGEN Theory of Mind,3.79,[],biggen_240829.csv tulu_2_dpo_70b,BIGGEN Theory of Mind,3.83,[],biggen_240829.csv phi_3_mini_128k_instruct,BIGGEN Theory of Mind,3.83,[],biggen_240829.csv gpt_3_5_turbo_1106,BIGGEN Theory of Mind,3.67,[],biggen_240829.csv c4ai_command_r_v0_1,BIGGEN Theory of Mind,3.9,[],biggen_240829.csv solar_10_7b_instruct_v1_0,BIGGEN Theory of Mind,3.9,[],biggen_240829.csv llama_2_70b_chat,BIGGEN Theory of Mind,3.73,[],biggen_240829.csv gemini_1_0_pro,BIGGEN Theory of Mind,3.75,[],biggen_240829.csv mistral_7b_instruct_v0_2,BIGGEN Theory of 
Mind,3.77,[],biggen_240829.csv mixtral_8x22b_v0_1_awq,BIGGEN Theory of Mind,3.49,[],biggen_240829.csv nous_hermes_2_mixtral_8x7b_sft,BIGGEN Theory of Mind,3.63,[],biggen_240829.csv openchat_3_5_0106,BIGGEN Theory of Mind,3.66,[],biggen_240829.csv zephyr_orpo_141b_a35b_v0_1_awq,BIGGEN Theory of Mind,3.58,[],biggen_240829.csv qwen1_5_14b_chat,BIGGEN Theory of Mind,3.52,[],biggen_240829.csv qwen1_5_7b_chat,BIGGEN Theory of Mind,3.7,[],biggen_240829.csv starling_lm_7b_alpha,BIGGEN Theory of Mind,3.54,[],biggen_240829.csv zephyr_7b_beta,BIGGEN Theory of Mind,3.73,[],biggen_240829.csv nous_hermes_2_mistral_7b_dpo,BIGGEN Theory of Mind,3.69,[],biggen_240829.csv nous_hermes_2_yi_34b,BIGGEN Theory of Mind,3.56,[],biggen_240829.csv mistral_orpo_beta,BIGGEN Theory of Mind,3.58,[],biggen_240829.csv llama_2_13b_chat,BIGGEN Theory of Mind,3.71,[],biggen_240829.csv openhermes_2_5_mistral_7b,BIGGEN Theory of Mind,3.57,[],biggen_240829.csv mixtral_8x7b_v0_1,BIGGEN Theory of Mind,3.59,[],biggen_240829.csv mistral_orpo_alpha,BIGGEN Theory of Mind,3.5,[],biggen_240829.csv tulu_2_dpo_13b,BIGGEN Theory of Mind,3.59,[],biggen_240829.csv qwen1_5_72b,BIGGEN Theory of Mind,3.38,[],biggen_240829.csv codetulu_2_34b,BIGGEN Theory of Mind,3.5,[],biggen_240829.csv gemma_1_1_7b_it,BIGGEN Theory of Mind,3.44,[],biggen_240829.csv openhermes_2_mistral_7b,BIGGEN Theory of Mind,3.45,[],biggen_240829.csv codellama34b_instruct,BIGGEN Theory of Mind,3.46,[],biggen_240829.csv yi_34b,BIGGEN Theory of Mind,3.39,[],biggen_240829.csv llama_2_70b,BIGGEN Theory of Mind,3.48,[],biggen_240829.csv qwen1_5_32b,BIGGEN Theory of Mind,3.33,[],biggen_240829.csv llama_2_7b_chat,BIGGEN Theory of Mind,3.65,[],biggen_240829.csv tulu_2_dpo_7b,BIGGEN Theory of Mind,3.68,[],biggen_240829.csv codetulu_2_13b,BIGGEN Theory of Mind,3.38,[],biggen_240829.csv solar_10_7b_v1_0,BIGGEN Theory of Mind,3.42,[],biggen_240829.csv tulu_2_13b,BIGGEN Theory of Mind,3.39,[],biggen_240829.csv codellama_13b_instruct,BIGGEN Theory of Mind,3.38,[],biggen_240829.csv yi_6b_chat,BIGGEN Theory of Mind,3.677,[],biggen_240829.csv codellama_7b_instruct,BIGGEN Theory of Mind,3.22,[],biggen_240829.csv gemma_7b_it,BIGGEN Theory of Mind,3.15,[],biggen_240829.csv llama3_70b,BIGGEN Theory of Mind,3.04,[],biggen_240829.csv qwen1_5_14b,BIGGEN Theory of Mind,3.16,[],biggen_240829.csv gemma_1_1_2b_it,BIGGEN Theory of Mind,3.37,[],biggen_240829.csv codetulu_2_7b,BIGGEN Theory of Mind,3.25,[],biggen_240829.csv tulu_2_7b,BIGGEN Theory of Mind,3.26,[],biggen_240829.csv mistral_7b_v0_2,BIGGEN Theory of Mind,3.29,[],biggen_240829.csv mistral_7b_v0_1,BIGGEN Theory of Mind,3.09,[],biggen_240829.csv qwen1_5_4b_chat,BIGGEN Theory of Mind,3.07,[],biggen_240829.csv olmo_7b_instruct,BIGGEN Theory of Mind,3.31,[],biggen_240829.csv gemma_2b_it,BIGGEN Theory of Mind,3.15,[],biggen_240829.csv qwen1_5_7b,BIGGEN Theory of Mind,2.77,[],biggen_240829.csv phi_2,BIGGEN Theory of Mind,3.2,[],biggen_240829.csv olmo_7b_sft,BIGGEN Theory of Mind,2.93,[],biggen_240829.csv codellama_70b_instruct,BIGGEN Theory of Mind,2.55,[],biggen_240829.csv llemma_34b,BIGGEN Theory of Mind,2.84,[],biggen_240829.csv llama3_8b,BIGGEN Theory of Mind,2.82,[],biggen_240829.csv qwen1_5_1_8b_chat,BIGGEN Theory of Mind,3.13,[],biggen_240829.csv qwen1_5_4b,BIGGEN Theory of Mind,2.73,[],biggen_240829.csv llama_2_13b,BIGGEN Theory of Mind,2.88,[],biggen_240829.csv yi_6b,BIGGEN Theory of Mind,2.89,[],biggen_240829.csv codellama_70b,BIGGEN Theory of Mind,2.44,[],biggen_240829.csv codellama34b,BIGGEN Theory of Mind,2.59,[],biggen_240829.csv 
phi_1_5,BIGGEN Theory of Mind,2.95,[],biggen_240829.csv orca_2_13b,BIGGEN Theory of Mind,2.8,[],biggen_240829.csv llama_2_7b,BIGGEN Theory of Mind,2.6,[],biggen_240829.csv qwen1_5_1_8b,BIGGEN Theory of Mind,2.55,[],biggen_240829.csv llemma_7b,BIGGEN Theory of Mind,2.19,[],biggen_240829.csv gemma_2b,BIGGEN Theory of Mind,2.32,[],biggen_240829.csv codellama_13b,BIGGEN Theory of Mind,2.15,[],biggen_240829.csv qwen1_5_0_5b_chat,BIGGEN Theory of Mind,2.38,[],biggen_240829.csv orca_2_7b,BIGGEN Theory of Mind,2.24,[],biggen_240829.csv olmo_7b,BIGGEN Theory of Mind,2.16,[],biggen_240829.csv codellama_7b,BIGGEN Theory of Mind,1.9,[],biggen_240829.csv qwen1_5_0_5b,BIGGEN Theory of Mind,1.8,[],biggen_240829.csv olmo_1b,BIGGEN Theory of Mind,1.59,[],biggen_240829.csv aya_101,BIGGEN Theory of Mind,1.38,[],biggen_240829.csv gemma_7b,BIGGEN Theory of Mind,1.2,[],biggen_240829.csv phi_1,BIGGEN Theory of Mind,1.0,[],biggen_240829.csv gpt_4_1106_preview,BIGGEN Tool Usage,3.775,[],biggen_240829.csv gpt_4_0125_preview,BIGGEN Tool Usage,3.925,[],biggen_240829.csv gpt_4o_2024_05_13,BIGGEN Tool Usage,3.85,[],biggen_240829.csv gpt_4_turbo_2024_04_09,BIGGEN Tool Usage,3.8,[],biggen_240829.csv claude_3_opus_20240229,BIGGEN Tool Usage,3.788,[],biggen_240829.csv llama3_70b_instruct,BIGGEN Tool Usage,3.775,[],biggen_240829.csv claude_3_sonnet_20240229,BIGGEN Tool Usage,3.75,[],biggen_240829.csv qwen_110b_chat,BIGGEN Tool Usage,3.588,[],biggen_240829.csv claude_3_haiku_20240307,BIGGEN Tool Usage,3.75,[],biggen_240829.csv gemini_pro_1_5,BIGGEN Tool Usage,3.488,[],biggen_240829.csv mixtral_8x22b_instruct_v0_1_awq,BIGGEN Tool Usage,3.712,[],biggen_240829.csv mistral_medium,BIGGEN Tool Usage,3.862,[],biggen_240829.csv mistral_large,BIGGEN Tool Usage,3.825,[],biggen_240829.csv gemini_flash_1_5,BIGGEN Tool Usage,3.625,[],biggen_240829.csv c4ai_command_r_plus_gptq,BIGGEN Tool Usage,3.525,[],biggen_240829.csv qwen1_5_72b_chat,BIGGEN Tool Usage,3.525,[],biggen_240829.csv phi_3_mini_4k_instruct,BIGGEN Tool Usage,3.338,[],biggen_240829.csv qwen1_5_32b_chat,BIGGEN Tool Usage,3.738,[],biggen_240829.csv starling_lm_7b_beta,BIGGEN Tool Usage,3.562,[],biggen_240829.csv llama3_8b_instruct,BIGGEN Tool Usage,3.5,[],biggen_240829.csv nous_hermes_2_mixtral_8x7b_dpo,BIGGEN Tool Usage,3.662,[],biggen_240829.csv yi_34b_chat,BIGGEN Tool Usage,3.038,[],biggen_240829.csv mixtral_8x7b_instruct_v0_1,BIGGEN Tool Usage,3.412,[],biggen_240829.csv gpt_3_5_turbo_0125,BIGGEN Tool Usage,3.138,[],biggen_240829.csv tulu_2_dpo_70b,BIGGEN Tool Usage,3.612,[],biggen_240829.csv phi_3_mini_128k_instruct,BIGGEN Tool Usage,3.1,[],biggen_240829.csv gpt_3_5_turbo_1106,BIGGEN Tool Usage,3.162,[],biggen_240829.csv c4ai_command_r_v0_1,BIGGEN Tool Usage,3.188,[],biggen_240829.csv solar_10_7b_instruct_v1_0,BIGGEN Tool Usage,3.412,[],biggen_240829.csv llama_2_70b_chat,BIGGEN Tool Usage,3.188,[],biggen_240829.csv gemini_1_0_pro,BIGGEN Tool Usage,3.125,[],biggen_240829.csv mistral_7b_instruct_v0_2,BIGGEN Tool Usage,3.362,[],biggen_240829.csv mixtral_8x22b_v0_1_awq,BIGGEN Tool Usage,3.188,[],biggen_240829.csv nous_hermes_2_mixtral_8x7b_sft,BIGGEN Tool Usage,3.538,[],biggen_240829.csv openchat_3_5_0106,BIGGEN Tool Usage,3.125,[],biggen_240829.csv zephyr_orpo_141b_a35b_v0_1_awq,BIGGEN Tool Usage,3.288,[],biggen_240829.csv qwen1_5_14b_chat,BIGGEN Tool Usage,3.2,[],biggen_240829.csv qwen1_5_7b_chat,BIGGEN Tool Usage,3.15,[],biggen_240829.csv starling_lm_7b_alpha,BIGGEN Tool Usage,3.025,[],biggen_240829.csv zephyr_7b_beta,BIGGEN Tool Usage,3.288,[],biggen_240829.csv 
nous_hermes_2_mistral_7b_dpo,BIGGEN Tool Usage,3.062,[],biggen_240829.csv nous_hermes_2_yi_34b,BIGGEN Tool Usage,3.175,[],biggen_240829.csv mistral_orpo_beta,BIGGEN Tool Usage,3.025,[],biggen_240829.csv llama_2_13b_chat,BIGGEN Tool Usage,2.6,[],biggen_240829.csv openhermes_2_5_mistral_7b,BIGGEN Tool Usage,3.062,[],biggen_240829.csv mixtral_8x7b_v0_1,BIGGEN Tool Usage,2.775,[],biggen_240829.csv mistral_orpo_alpha,BIGGEN Tool Usage,2.95,[],biggen_240829.csv tulu_2_dpo_13b,BIGGEN Tool Usage,3.05,[],biggen_240829.csv qwen1_5_72b,BIGGEN Tool Usage,2.988,[],biggen_240829.csv codetulu_2_34b,BIGGEN Tool Usage,3.35,[],biggen_240829.csv gemma_1_1_7b_it,BIGGEN Tool Usage,2.788,[],biggen_240829.csv openhermes_2_mistral_7b,BIGGEN Tool Usage,2.925,[],biggen_240829.csv codellama34b_instruct,BIGGEN Tool Usage,2.738,[],biggen_240829.csv yi_34b,BIGGEN Tool Usage,2.512,[],biggen_240829.csv llama_2_70b,BIGGEN Tool Usage,2.625,[],biggen_240829.csv qwen1_5_32b,BIGGEN Tool Usage,2.925,[],biggen_240829.csv llama_2_7b_chat,BIGGEN Tool Usage,2.3,[],biggen_240829.csv tulu_2_dpo_7b,BIGGEN Tool Usage,2.438,[],biggen_240829.csv codetulu_2_13b,BIGGEN Tool Usage,3.238,[],biggen_240829.csv solar_10_7b_v1_0,BIGGEN Tool Usage,2.562,[],biggen_240829.csv tulu_2_13b,BIGGEN Tool Usage,2.775,[],biggen_240829.csv codellama_13b_instruct,BIGGEN Tool Usage,2.6,[],biggen_240829.csv yi_6b_chat,BIGGEN Tool Usage,2.338,[],biggen_240829.csv codellama_7b_instruct,BIGGEN Tool Usage,2.575,[],biggen_240829.csv gemma_7b_it,BIGGEN Tool Usage,2.325,[],biggen_240829.csv llama3_70b,BIGGEN Tool Usage,2.5,[],biggen_240829.csv qwen1_5_14b,BIGGEN Tool Usage,2.912,[],biggen_240829.csv gemma_1_1_2b_it,BIGGEN Tool Usage,1.975,[],biggen_240829.csv codetulu_2_7b,BIGGEN Tool Usage,2.788,[],biggen_240829.csv tulu_2_7b,BIGGEN Tool Usage,2.212,[],biggen_240829.csv mistral_7b_v0_2,BIGGEN Tool Usage,2.275,[],biggen_240829.csv mistral_7b_v0_1,BIGGEN Tool Usage,2.162,[],biggen_240829.csv qwen1_5_4b_chat,BIGGEN Tool Usage,2.4,[],biggen_240829.csv olmo_7b_instruct,BIGGEN Tool Usage,2.212,[],biggen_240829.csv gemma_2b_it,BIGGEN Tool Usage,1.962,[],biggen_240829.csv qwen1_5_7b,BIGGEN Tool Usage,2.488,[],biggen_240829.csv phi_2,BIGGEN Tool Usage,1.788,[],biggen_240829.csv olmo_7b_sft,BIGGEN Tool Usage,2.088,[],biggen_240829.csv codellama_70b_instruct,BIGGEN Tool Usage,1.988,[],biggen_240829.csv llemma_34b,BIGGEN Tool Usage,2.088,[],biggen_240829.csv llama3_8b,BIGGEN Tool Usage,1.938,[],biggen_240829.csv qwen1_5_1_8b_chat,BIGGEN Tool Usage,1.988,[],biggen_240829.csv qwen1_5_4b,BIGGEN Tool Usage,1.9,[],biggen_240829.csv llama_2_13b,BIGGEN Tool Usage,1.812,[],biggen_240829.csv yi_6b,BIGGEN Tool Usage,1.55,[],biggen_240829.csv codellama_70b,BIGGEN Tool Usage,2.4,[],biggen_240829.csv codellama34b,BIGGEN Tool Usage,2.062,[],biggen_240829.csv phi_1_5,BIGGEN Tool Usage,1.525,[],biggen_240829.csv orca_2_13b,BIGGEN Tool Usage,2.362,[],biggen_240829.csv llama_2_7b,BIGGEN Tool Usage,1.45,[],biggen_240829.csv qwen1_5_1_8b,BIGGEN Tool Usage,1.525,[],biggen_240829.csv llemma_7b,BIGGEN Tool Usage,1.838,[],biggen_240829.csv gemma_2b,BIGGEN Tool Usage,1.488,[],biggen_240829.csv codellama_13b,BIGGEN Tool Usage,1.812,[],biggen_240829.csv qwen1_5_0_5b_chat,BIGGEN Tool Usage,1.462,[],biggen_240829.csv orca_2_7b,BIGGEN Tool Usage,1.6,[],biggen_240829.csv olmo_7b,BIGGEN Tool Usage,1.312,[],biggen_240829.csv codellama_7b,BIGGEN Tool Usage,1.562,[],biggen_240829.csv qwen1_5_0_5b,BIGGEN Tool Usage,1.275,[],biggen_240829.csv olmo_1b,BIGGEN Tool Usage,1.125,[],biggen_240829.csv aya_101,BIGGEN 
Tool Usage,1.162,[],biggen_240829.csv gemma_7b,BIGGEN Tool Usage,1.012,[],biggen_240829.csv phi_1,BIGGEN Tool Usage,1.012,[],biggen_240829.csv gpt_4_1106_preview,BIGGEN Multilingual,3.6,[],biggen_240829.csv gpt_4_0125_preview,BIGGEN Multilingual,3.543,[],biggen_240829.csv gpt_4o_2024_05_13,BIGGEN Multilingual,3.643,[],biggen_240829.csv gpt_4_turbo_2024_04_09,BIGGEN Multilingual,3.471,[],biggen_240829.csv claude_3_opus_20240229,BIGGEN Multilingual,3.571,[],biggen_240829.csv llama3_70b_instruct,BIGGEN Multilingual,3.314,[],biggen_240829.csv claude_3_sonnet_20240229,BIGGEN Multilingual,3.186,[],biggen_240829.csv qwen_110b_chat,BIGGEN Multilingual,2.771,[],biggen_240829.csv claude_3_haiku_20240307,BIGGEN Multilingual,3.071,[],biggen_240829.csv gemini_pro_1_5,BIGGEN Multilingual,3.257,[],biggen_240829.csv mixtral_8x22b_instruct_v0_1_awq,BIGGEN Multilingual,2.714,[],biggen_240829.csv mistral_medium,BIGGEN Multilingual,2.929,[],biggen_240829.csv mistral_large,BIGGEN Multilingual,2.886,[],biggen_240829.csv gemini_flash_1_5,BIGGEN Multilingual,2.671,[],biggen_240829.csv c4ai_command_r_plus_gptq,BIGGEN Multilingual,2.757,[],biggen_240829.csv qwen1_5_72b_chat,BIGGEN Multilingual,2.914,[],biggen_240829.csv phi_3_mini_4k_instruct,BIGGEN Multilingual,1.914,[],biggen_240829.csv qwen1_5_32b_chat,BIGGEN Multilingual,2.714,[],biggen_240829.csv starling_lm_7b_beta,BIGGEN Multilingual,2.271,[],biggen_240829.csv llama3_8b_instruct,BIGGEN Multilingual,2.914,[],biggen_240829.csv nous_hermes_2_mixtral_8x7b_dpo,BIGGEN Multilingual,2.557,[],biggen_240829.csv yi_34b_chat,BIGGEN Multilingual,2.186,[],biggen_240829.csv mixtral_8x7b_instruct_v0_1,BIGGEN Multilingual,2.714,[],biggen_240829.csv gpt_3_5_turbo_0125,BIGGEN Multilingual,2.614,[],biggen_240829.csv tulu_2_dpo_70b,BIGGEN Multilingual,2.314,[],biggen_240829.csv phi_3_mini_128k_instruct,BIGGEN Multilingual,1.829,[],biggen_240829.csv gpt_3_5_turbo_1106,BIGGEN Multilingual,2.557,[],biggen_240829.csv c4ai_command_r_v0_1,BIGGEN Multilingual,2.186,[],biggen_240829.csv solar_10_7b_instruct_v1_0,BIGGEN Multilingual,2.443,[],biggen_240829.csv llama_2_70b_chat,BIGGEN Multilingual,2.386,[],biggen_240829.csv gemini_1_0_pro,BIGGEN Multilingual,3.186,[],biggen_240829.csv mistral_7b_instruct_v0_2,BIGGEN Multilingual,2.286,[],biggen_240829.csv nous_hermes_2_mixtral_8x7b_sft,BIGGEN Multilingual,2.4,[],biggen_240829.csv openchat_3_5_0106,BIGGEN Multilingual,2.157,[],biggen_240829.csv zephyr_orpo_141b_a35b_v0_1_awq,BIGGEN Multilingual,2.586,[],biggen_240829.csv qwen1_5_14b_chat,BIGGEN Multilingual,2.386,[],biggen_240829.csv qwen1_5_7b_chat,BIGGEN Multilingual,2.057,[],biggen_240829.csv starling_lm_7b_alpha,BIGGEN Multilingual,2.229,[],biggen_240829.csv zephyr_7b_beta,BIGGEN Multilingual,1.943,[],biggen_240829.csv nous_hermes_2_mistral_7b_dpo,BIGGEN Multilingual,2.171,[],biggen_240829.csv nous_hermes_2_yi_34b,BIGGEN Multilingual,2.071,[],biggen_240829.csv mistral_orpo_beta,BIGGEN Multilingual,2.1,[],biggen_240829.csv llama_2_13b_chat,BIGGEN Multilingual,2.114,[],biggen_240829.csv openhermes_2_5_mistral_7b,BIGGEN Multilingual,2.1,[],biggen_240829.csv mistral_orpo_alpha,BIGGEN Multilingual,2.086,[],biggen_240829.csv tulu_2_dpo_13b,BIGGEN Multilingual,2.143,[],biggen_240829.csv codetulu_2_34b,BIGGEN Multilingual,2.0,[],biggen_240829.csv gemma_1_1_7b_it,BIGGEN Multilingual,2.0,[],biggen_240829.csv openhermes_2_mistral_7b,BIGGEN Multilingual,1.914,[],biggen_240829.csv codellama34b_instruct,BIGGEN Multilingual,2.114,[],biggen_240829.csv llama_2_7b_chat,BIGGEN 
Multilingual,2.029,[],biggen_240829.csv tulu_2_dpo_7b,BIGGEN Multilingual,1.971,[],biggen_240829.csv codetulu_2_13b,BIGGEN Multilingual,1.886,[],biggen_240829.csv tulu_2_13b,BIGGEN Multilingual,2.029,[],biggen_240829.csv codellama_13b_instruct,BIGGEN Multilingual,1.886,[],biggen_240829.csv yi_6b_chat,BIGGEN Multilingual,1.457,[],biggen_240829.csv codellama_7b_instruct,BIGGEN Multilingual,1.771,[],biggen_240829.csv gemma_7b_it,BIGGEN Multilingual,1.786,[],biggen_240829.csv gemma_1_1_2b_it,BIGGEN Multilingual,1.471,[],biggen_240829.csv codetulu_2_7b,BIGGEN Multilingual,1.8,[],biggen_240829.csv tulu_2_7b,BIGGEN Multilingual,1.714,[],biggen_240829.csv qwen1_5_4b_chat,BIGGEN Multilingual,1.471,[],biggen_240829.csv olmo_7b_instruct,BIGGEN Multilingual,1.414,[],biggen_240829.csv gemma_2b_it,BIGGEN Multilingual,1.657,[],biggen_240829.csv olmo_7b_sft,BIGGEN Multilingual,1.186,[],biggen_240829.csv codellama_70b_instruct,BIGGEN Multilingual,1.929,[],biggen_240829.csv qwen1_5_1_8b_chat,BIGGEN Multilingual,1.3,[],biggen_240829.csv orca_2_13b,BIGGEN Multilingual,2.043,[],biggen_240829.csv qwen1_5_0_5b_chat,BIGGEN Multilingual,1.159,[],biggen_240829.csv orca_2_7b,BIGGEN Multilingual,1.729,[],biggen_240829.csv aya_101,BIGGEN Multilingual,1.129,[],biggen_240829.csv command_r,RULER,85.5,,ruler_bench_241002.csv command_r_0824,RULER,86.0,,ruler_bench_241002.csv command_r_plus,RULER,82.7,,ruler_bench_241002.csv command_r_plus_0824,RULER,83.4,,ruler_bench_241002.csv dbrx,RULER,38.0,,ruler_bench_241002.csv film_7b*,RULER,66.4,,ruler_bench_241002.csv gemini_1_5_pro,RULER,95.5,,ruler_bench_241002.csv glm3,RULER,62.0,,ruler_bench_241002.csv glm4,RULER,88.0,,ruler_bench_241002.csv gpt_4_1106_preview,RULER,89.0,,ruler_bench_241002.csv internlm2_5,RULER,77.8,,ruler_bench_241002.csv jamba_1_5_large,RULER,95.7,,ruler_bench_241002.csv jamba_1_5_mini,RULER,93.1,,ruler_bench_241002.csv llama3,RULER,82.6,,ruler_bench_241002.csv llama3_1,RULER,85.5,,ruler_bench_241002.csv longalpaca,RULER,24.7,,ruler_bench_241002.csv longchat,RULER,33.1,,ruler_bench_241002.csv lwm,RULER,69.9,,ruler_bench_241002.csv megabeam_mistral,RULER,87.3,,ruler_bench_241002.csv mistral,RULER,55.6,,ruler_bench_241002.csv mistral_large,RULER,70.6,,ruler_bench_241002.csv mistral_nemo,RULER,54.7,,ruler_bench_241002.csv mixtral_8x22b,RULER,73.5,,ruler_bench_241002.csv mixtral_8x7b,RULER,72.8,,ruler_bench_241002.csv phi3_medium,RULER,74.8,,ruler_bench_241002.csv phi3_mini,RULER,80.9,,ruler_bench_241002.csv qwen1_5,RULER,37.5,,ruler_bench_241002.csv qwen2,RULER,79.6,,ruler_bench_241002.csv together,RULER,33.8,,ruler_bench_241002.csv yi,RULER,84.8,,ruler_bench_241002.csv zephyr_7b_beta,LiveBench 240624,17.32,[],livebench_240701.csv zephyr_7b_alpha,LiveBench 240624,19.28,[],livebench_240701.csv yi_6b_chat,LiveBench 240624,9.02,[],livebench_240701.csv vicuna_7b_v1_5_16k,LiveBench 240624,14.22,[],livebench_240701.csv vicuna_7b_v1_5,LiveBench 240624,12.31,[],livebench_240701.csv starling_lm_7b_beta,LiveBench 240624,16.62,[],livebench_240701.csv smaug_qwen2_72b_instruct,LiveBench 240624,39.66,[],livebench_240701.csv qwen2_72b_instruct,LiveBench 240624,40.16,[],livebench_240701.csv qwen2_7b_instruct,LiveBench 240624,26.63,[],livebench_240701.csv qwen2_1_5b_instruct,LiveBench 240624,10.42,[],livebench_240701.csv qwen2_0_5b_instruct,LiveBench 240624,7.3,[],livebench_240701.csv qwen1_5_110b_chat,LiveBench 240624,29.07,[],livebench_240701.csv qwen1_5_72b_chat,LiveBench 240624,28.89,[],livebench_240701.csv qwen1_5_7b_chat,LiveBench 240624,17.02,[],livebench_240701.csv 
qwen1_5_4b_chat,LiveBench 240624,11.59,[],livebench_240701.csv qwen1_5_1_8b_chat,LiveBench 240624,6.32,[],livebench_240701.csv qwen1_5_0_5b_chat,LiveBench 240624,5.43,[],livebench_240701.csv phi_3_5_moe_instruct,LiveBench 240624,35.14,[],livebench_240701.csv phi_3_5_mini_instruct,LiveBench 240624,27.81,[],livebench_240701.csv phi_3_small_128k_instruct,LiveBench 240624,29.68,[],livebench_240701.csv phi_3_small_8k_instruct,LiveBench 240624,29.09,[],livebench_240701.csv phi_3_mini_128k_instruct,LiveBench 240624,24.76,[],livebench_240701.csv phi_3_mini_4k_instruct,LiveBench 240624,24.41,[],livebench_240701.csv phi_3_medium_128k_instruct,LiveBench 240624,29.88,[],livebench_240701.csv phi_3_medium_4k_instruct,LiveBench 240624,30.96,[],livebench_240701.csv openhermes_2_5_mistral_7b,LiveBench 240624,23.36,[],livebench_240701.csv open_mistral_nemo,LiveBench 240624,29.02,[],livebench_240701.csv mixtral_8x22b_instruct_v0_1,LiveBench 240624,35.29,[],livebench_240701.csv mixtral_8x7b_instruct_v0_1,LiveBench 240624,22.79,[],livebench_240701.csv mistral_small_2402,LiveBench 240624,33.03,[],livebench_240701.csv mistral_large_2407,LiveBench 240624,48.35,[],livebench_240701.csv mistral_large_2402,LiveBench 240624,38.92,[],livebench_240701.csv mistral_7b_instruct_v0_3,LiveBench 240624,20.09,[],livebench_240701.csv mistral_7b_instruct_v0_2,LiveBench 240624,19.51,[],livebench_240701.csv llama3_1_405b_instruct_turbo,LiveBench 240624,55.18,[],livebench_240701.csv llama3_1_70b_instruct_turbo,LiveBench 240624,48.9,[],livebench_240701.csv llama3_1_8b_instruct_turbo,LiveBench 240624,28.11,[],livebench_240701.csv llama3_70b_instruct,LiveBench 240624,37.6,[],livebench_240701.csv llama3_8b_instruct,LiveBench 240624,27.46,[],livebench_240701.csv mathstral_7b_v0_1,LiveBench 240624,24.33,[],livebench_240701.csv llama_2_7b_chat,LiveBench 240624,10.25,[],livebench_240701.csv hermes_3_llama3_1_70b,LiveBench 240624,39.56,[],livebench_240701.csv gpt_4o_mini_2024_07_18,LiveBench 240624,44.57,[],livebench_240701.csv gpt_4o_2024_08_06,LiveBench 240624,56.46,[],livebench_240701.csv gpt_4o_2024_05_13,LiveBench 240624,54.96,[],livebench_240701.csv gpt_4_turbo_2024_04_09,LiveBench 240624,53.0,[],livebench_240701.csv gpt_4_0613,LiveBench 240624,44.94,[],livebench_240701.csv gpt_4_0125_preview,LiveBench 240624,49.39,[],livebench_240701.csv gpt_3_5_turbo_0125,LiveBench 240624,34.66,[],livebench_240701.csv gemma_2_27b_it,LiveBench 240624,41.22,[],livebench_240701.csv gemma_2_9b_it,LiveBench 240624,31.57,[],livebench_240701.csv gemma_1_1_7b_it,LiveBench 240624,18.23,[],livebench_240701.csv gemini_1_5_pro_exp_0827,LiveBench 240624,55.06,[],livebench_240701.csv gemini_1_5_pro_exp_0801,LiveBench 240624,53.63,[],livebench_240701.csv gemini_1_5_pro_api_0514,LiveBench 240624,44.41,[],livebench_240701.csv gemini_1_5_flash_exp_0827,LiveBench 240624,47.51,[],livebench_240701.csv gemini_1_5_flash_api_0514,LiveBench 240624,40.95,[],livebench_240701.csv dracarys_llama3_1_70b_instruct,LiveBench 240624,49.82,[],livebench_240701.csv dracarys_72b_instruct,LiveBench 240624,41.72,[],livebench_240701.csv deepseek_v2_lite_chat,LiveBench 240624,17.49,[],livebench_240701.csv deepseek_coder_v2_lite_instruct,LiveBench 240624,29.21,[],livebench_240701.csv deepseek_coder_v2,LiveBench 240624,46.84,[],livebench_240701.csv deepseek_chat_v2,LiveBench 240624,46.36,[],livebench_240701.csv command_r_plus,LiveBench 240624,32.86,[],livebench_240701.csv command_r,LiveBench 240624,27.23,[],livebench_240701.csv claude_3_sonnet_20240229,LiveBench 
240624,38.08,[],livebench_240701.csv claude_3_opus_20240229,LiveBench 240624,50.75,[],livebench_240701.csv claude_3_haiku_20240307,LiveBench 240624,35.32,[],livebench_240701.csv claude_3_5_sonnet_20240620,LiveBench 240624,61.16,[],livebench_240701.csv chatgpt_4o_latest,LiveBench 240624,55.35,[],livebench_240701.csv zephyr_7b_beta,LiveBench Reasoning Average,16.0,[],livebench_240701.csv zephyr_7b_alpha,LiveBench Reasoning Average,17.0,[],livebench_240701.csv yi_6b_chat,LiveBench Reasoning Average,8.0,[],livebench_240701.csv vicuna_7b_v1_5_16k,LiveBench Reasoning Average,15.0,[],livebench_240701.csv vicuna_7b_v1_5,LiveBench Reasoning Average,12.0,[],livebench_240701.csv starling_lm_7b_beta,LiveBench Reasoning Average,19.0,[],livebench_240701.csv smaug_qwen2_72b_instruct,LiveBench Reasoning Average,37.0,[],livebench_240701.csv qwen2_72b_instruct,LiveBench Reasoning Average,42.0,[],livebench_240701.csv qwen2_7b_instruct,LiveBench Reasoning Average,20.0,[],livebench_240701.csv qwen2_1_5b_instruct,LiveBench Reasoning Average,8.0,[],livebench_240701.csv qwen2_0_5b_instruct,LiveBench Reasoning Average,3.0,[],livebench_240701.csv qwen1_5_110b_chat,LiveBench Reasoning Average,26.0,[],livebench_240701.csv qwen1_5_72b_chat,LiveBench Reasoning Average,21.0,[],livebench_240701.csv qwen1_5_7b_chat,LiveBench Reasoning Average,13.0,[],livebench_240701.csv qwen1_5_4b_chat,LiveBench Reasoning Average,13.0,[],livebench_240701.csv qwen1_5_1_8b_chat,LiveBench Reasoning Average,5.0,[],livebench_240701.csv qwen1_5_0_5b_chat,LiveBench Reasoning Average,4.0,[],livebench_240701.csv phi_3_5_moe_instruct,LiveBench Reasoning Average,41.0,[],livebench_240701.csv phi_3_5_mini_instruct,LiveBench Reasoning Average,31.0,[],livebench_240701.csv phi_3_small_128k_instruct,LiveBench Reasoning Average,28.0,[],livebench_240701.csv phi_3_small_8k_instruct,LiveBench Reasoning Average,29.0,[],livebench_240701.csv phi_3_mini_128k_instruct,LiveBench Reasoning Average,24.0,[],livebench_240701.csv phi_3_mini_4k_instruct,LiveBench Reasoning Average,22.0,[],livebench_240701.csv phi_3_medium_128k_instruct,LiveBench Reasoning Average,31.0,[],livebench_240701.csv phi_3_medium_4k_instruct,LiveBench Reasoning Average,35.0,[],livebench_240701.csv openhermes_2_5_mistral_7b,LiveBench Reasoning Average,17.0,[],livebench_240701.csv open_mistral_nemo,LiveBench Reasoning Average,25.0,[],livebench_240701.csv mixtral_8x22b_instruct_v0_1,LiveBench Reasoning Average,29.0,[],livebench_240701.csv mixtral_8x7b_instruct_v0_1,LiveBench Reasoning Average,18.0,[],livebench_240701.csv mistral_small_2402,LiveBench Reasoning Average,28.0,[],livebench_240701.csv mistral_large_2407,LiveBench Reasoning Average,45.0,[],livebench_240701.csv mistral_large_2402,LiveBench Reasoning Average,35.0,[],livebench_240701.csv mistral_7b_instruct_v0_3,LiveBench Reasoning Average,11.0,[],livebench_240701.csv mistral_7b_instruct_v0_2,LiveBench Reasoning Average,13.0,[],livebench_240701.csv llama3_1_405b_instruct_turbo,LiveBench Reasoning Average,57.0,[],livebench_240701.csv llama3_1_70b_instruct_turbo,LiveBench Reasoning Average,43.0,[],livebench_240701.csv llama3_1_8b_instruct_turbo,LiveBench Reasoning Average,14.0,[],livebench_240701.csv llama3_70b_instruct,LiveBench Reasoning Average,31.0,[],livebench_240701.csv llama3_8b_instruct,LiveBench Reasoning Average,25.0,[],livebench_240701.csv mathstral_7b_v0_1,LiveBench Reasoning Average,16.0,[],livebench_240701.csv llama_2_7b_chat,LiveBench Reasoning Average,5.0,[],livebench_240701.csv hermes_3_llama3_1_70b,LiveBench Reasoning 
Average,32.0,[],livebench_240701.csv gpt_4o_mini_2024_07_18,LiveBench Reasoning Average,37.0,[],livebench_240701.csv gpt_4o_2024_08_06,LiveBench Reasoning Average,54.0,[],livebench_240701.csv gpt_4o_2024_05_13,LiveBench Reasoning Average,55.0,[],livebench_240701.csv gpt_4_turbo_2024_04_09,LiveBench Reasoning Average,54.0,[],livebench_240701.csv gpt_4_0613,LiveBench Reasoning Average,31.0,[],livebench_240701.csv gpt_4_0125_preview,LiveBench Reasoning Average,48.0,[],livebench_240701.csv gpt_3_5_turbo_0125,LiveBench Reasoning Average,26.0,[],livebench_240701.csv gemma_2_27b_it,LiveBench Reasoning Average,31.0,[],livebench_240701.csv gemma_2_9b_it,LiveBench Reasoning Average,19.0,[],livebench_240701.csv gemma_1_1_7b_it,LiveBench Reasoning Average,10.0,[],livebench_240701.csv gemini_1_5_pro_exp_0827,LiveBench Reasoning Average,56.0,[],livebench_240701.csv gemini_1_5_pro_exp_0801,LiveBench Reasoning Average,55.0,[],livebench_240701.csv gemini_1_5_pro_api_0514,LiveBench Reasoning Average,33.0,[],livebench_240701.csv gemini_1_5_flash_exp_0827,LiveBench Reasoning Average,52.0,[],livebench_240701.csv gemini_1_5_flash_api_0514,LiveBench Reasoning Average,30.0,[],livebench_240701.csv dracarys_llama3_1_70b_instruct,LiveBench Reasoning Average,50.0,[],livebench_240701.csv dracarys_72b_instruct,LiveBench Reasoning Average,41.0,[],livebench_240701.csv deepseek_v2_lite_chat,LiveBench Reasoning Average,13.0,[],livebench_240701.csv deepseek_coder_v2_lite_instruct,LiveBench Reasoning Average,22.0,[],livebench_240701.csv deepseek_coder_v2,LiveBench Reasoning Average,49.0,[],livebench_240701.csv deepseek_chat_v2,LiveBench Reasoning Average,41.0,[],livebench_240701.csv command_r_plus,LiveBench Reasoning Average,32.0,[],livebench_240701.csv command_r,LiveBench Reasoning Average,28.0,[],livebench_240701.csv claude_3_sonnet_20240229,LiveBench Reasoning Average,26.0,[],livebench_240701.csv claude_3_opus_20240229,LiveBench Reasoning Average,41.0,[],livebench_240701.csv claude_3_haiku_20240307,LiveBench Reasoning Average,26.0,[],livebench_240701.csv claude_3_5_sonnet_20240620,LiveBench Reasoning Average,64.0,[],livebench_240701.csv chatgpt_4o_latest,LiveBench Reasoning Average,57.0,[],livebench_240701.csv zephyr_7b_beta,LiveBench Coding Average,8.32,[],livebench_240701.csv zephyr_7b_alpha,LiveBench Coding Average,11.32,[],livebench_240701.csv yi_6b_chat,LiveBench Coding Average,1.32,[],livebench_240701.csv vicuna_7b_v1_5_16k,LiveBench Coding Average,1.32,[],livebench_240701.csv vicuna_7b_v1_5,LiveBench Coding Average,1.0,[],livebench_240701.csv starling_lm_7b_beta,LiveBench Coding Average,18.26,[],livebench_240701.csv smaug_qwen2_72b_instruct,LiveBench Coding Average,39.05,[],livebench_240701.csv qwen2_72b_instruct,LiveBench Coding Average,31.79,[],livebench_240701.csv qwen2_7b_instruct,LiveBench Coding Average,29.21,[],livebench_240701.csv qwen2_1_5b_instruct,LiveBench Coding Average,5.63,[],livebench_240701.csv qwen2_0_5b_instruct,LiveBench Coding Average,2.0,[],livebench_240701.csv qwen1_5_110b_chat,LiveBench Coding Average,22.21,[],livebench_240701.csv qwen1_5_72b_chat,LiveBench Coding Average,22.89,[],livebench_240701.csv qwen1_5_7b_chat,LiveBench Coding Average,6.63,[],livebench_240701.csv qwen1_5_4b_chat,LiveBench Coding Average,4.0,[],livebench_240701.csv qwen1_5_1_8b_chat,LiveBench Coding Average,0.0,[],livebench_240701.csv qwen1_5_0_5b_chat,LiveBench Coding Average,0.0,[],livebench_240701.csv phi_3_5_moe_instruct,LiveBench Coding Average,19.26,[],livebench_240701.csv phi_3_5_mini_instruct,LiveBench Coding 
Average,15.26,[],livebench_240701.csv phi_3_small_128k_instruct,LiveBench Coding Average,24.87,[],livebench_240701.csv phi_3_small_8k_instruct,LiveBench Coding Average,21.24,[],livebench_240701.csv phi_3_mini_128k_instruct,LiveBench Coding Average,14.29,[],livebench_240701.csv phi_3_mini_4k_instruct,LiveBench Coding Average,14.79,[],livebench_240701.csv phi_3_medium_128k_instruct,LiveBench Coding Average,21.58,[],livebench_240701.csv phi_3_medium_4k_instruct,LiveBench Coding Average,20.58,[],livebench_240701.csv openhermes_2_5_mistral_7b,LiveBench Coding Average,11.63,[],livebench_240701.csv open_mistral_nemo,LiveBench Coding Average,28.16,[],livebench_240701.csv mixtral_8x22b_instruct_v0_1,LiveBench Coding Average,33.11,[],livebench_240701.csv mixtral_8x7b_instruct_v0_1,LiveBench Coding Average,11.32,[],livebench_240701.csv mistral_small_2402,LiveBench Coding Average,24.21,[],livebench_240701.csv mistral_large_2407,LiveBench Coding Average,46.37,[],livebench_240701.csv mistral_large_2402,LiveBench Coding Average,26.84,[],livebench_240701.csv mistral_7b_instruct_v0_3,LiveBench Coding Average,9.0,[],livebench_240701.csv mistral_7b_instruct_v0_2,LiveBench Coding Average,11.63,[],livebench_240701.csv llama3_1_405b_instruct_turbo,LiveBench Coding Average,45.68,[],livebench_240701.csv llama3_1_70b_instruct_turbo,LiveBench Coding Average,33.11,[],livebench_240701.csv llama3_1_8b_instruct_turbo,LiveBench Coding Average,21.58,[],livebench_240701.csv llama3_70b_instruct,LiveBench Coding Average,20.95,[],livebench_240701.csv llama3_8b_instruct,LiveBench Coding Average,18.26,[],livebench_240701.csv mathstral_7b_v0_1,LiveBench Coding Average,15.63,[],livebench_240701.csv llama_2_7b_chat,LiveBench Coding Average,0.0,[],livebench_240701.csv hermes_3_llama3_1_70b,LiveBench Coding Average,29.79,[],livebench_240701.csv gpt_4o_mini_2024_07_18,LiveBench Coding Average,43.37,[],livebench_240701.csv gpt_4o_2024_08_06,LiveBench Coding Average,50.63,[],livebench_240701.csv gpt_4o_2024_05_13,LiveBench Coding Average,46.37,[],livebench_240701.csv gpt_4_turbo_2024_04_09,LiveBench Coding Average,47.05,[],livebench_240701.csv gpt_4_0613,LiveBench Coding Average,37.05,[],livebench_240701.csv gpt_4_0125_preview,LiveBench Coding Average,44.05,[],livebench_240701.csv gpt_3_5_turbo_0125,LiveBench Coding Average,29.16,[],livebench_240701.csv gemma_2_27b_it,LiveBench Coding Average,36.74,[],livebench_240701.csv gemma_2_9b_it,LiveBench Coding Average,22.21,[],livebench_240701.csv gemma_1_1_7b_it,LiveBench Coding Average,11.0,[],livebench_240701.csv gemini_1_5_pro_exp_0827,LiveBench Coding Average,42.0,[],livebench_240701.csv gemini_1_5_pro_exp_0801,LiveBench Coding Average,43.37,[],livebench_240701.csv gemini_1_5_pro_api_0514,LiveBench Coding Average,32.79,[],livebench_240701.csv gemini_1_5_flash_exp_0827,LiveBench Coding Average,39.74,[],livebench_240701.csv gemini_1_5_flash_api_0514,LiveBench Coding Average,39.05,[],livebench_240701.csv dracarys_llama3_1_70b_instruct,LiveBench Coding Average,36.11,[],livebench_240701.csv dracarys_72b_instruct,LiveBench Coding Average,41.05,[],livebench_240701.csv deepseek_v2_lite_chat,LiveBench Coding Average,8.63,[],livebench_240701.csv deepseek_coder_v2_lite_instruct,LiveBench Coding Average,26.84,[],livebench_240701.csv deepseek_coder_v2,LiveBench Coding Average,41.05,[],livebench_240701.csv deepseek_chat_v2,LiveBench Coding Average,42.05,[],livebench_240701.csv command_r_plus,LiveBench Coding Average,20.26,[],livebench_240701.csv command_r,LiveBench Coding 
Average,14.95,[],livebench_240701.csv claude_3_sonnet_20240229,LiveBench Coding Average,25.21,[],livebench_240701.csv claude_3_opus_20240229,LiveBench Coding Average,40.05,[],livebench_240701.csv claude_3_haiku_20240307,LiveBench Coding Average,24.53,[],livebench_240701.csv claude_3_5_sonnet_20240620,LiveBench Coding Average,63.21,[],livebench_240701.csv chatgpt_4o_latest,LiveBench Coding Average,46.0,[],livebench_240701.csv zephyr_7b_beta,LiveBench Mathematics Average,11.23,[],livebench_240701.csv zephyr_7b_alpha,LiveBench Mathematics Average,9.96,[],livebench_240701.csv yi_6b_chat,LiveBench Mathematics Average,8.53,[],livebench_240701.csv vicuna_7b_v1_5_16k,LiveBench Mathematics Average,9.04,[],livebench_240701.csv vicuna_7b_v1_5,LiveBench Mathematics Average,7.1,[],livebench_240701.csv starling_lm_7b_beta,LiveBench Mathematics Average,14.86,[],livebench_240701.csv smaug_qwen2_72b_instruct,LiveBench Mathematics Average,40.67,[],livebench_240701.csv qwen2_72b_instruct,LiveBench Mathematics Average,43.44,[],livebench_240701.csv qwen2_7b_instruct,LiveBench Mathematics Average,26.87,[],livebench_240701.csv qwen2_1_5b_instruct,LiveBench Mathematics Average,9.94,[],livebench_240701.csv qwen2_0_5b_instruct,LiveBench Mathematics Average,7.35,[],livebench_240701.csv qwen1_5_110b_chat,LiveBench Mathematics Average,26.28,[],livebench_240701.csv qwen1_5_72b_chat,LiveBench Mathematics Average,26.82,[],livebench_240701.csv qwen1_5_7b_chat,LiveBench Mathematics Average,15.29,[],livebench_240701.csv qwen1_5_4b_chat,LiveBench Mathematics Average,9.86,[],livebench_240701.csv qwen1_5_1_8b_chat,LiveBench Mathematics Average,3.53,[],livebench_240701.csv qwen1_5_0_5b_chat,LiveBench Mathematics Average,4.43,[],livebench_240701.csv phi_3_5_moe_instruct,LiveBench Mathematics Average,33.3,[],livebench_240701.csv phi_3_5_mini_instruct,LiveBench Mathematics Average,22.2,[],livebench_240701.csv phi_3_small_128k_instruct,LiveBench Mathematics Average,28.97,[],livebench_240701.csv phi_3_small_8k_instruct,LiveBench Mathematics Average,23.73,[],livebench_240701.csv phi_3_mini_128k_instruct,LiveBench Mathematics Average,17.06,[],livebench_240701.csv phi_3_mini_4k_instruct,LiveBench Mathematics Average,20.84,[],livebench_240701.csv phi_3_medium_128k_instruct,LiveBench Mathematics Average,25.64,[],livebench_240701.csv phi_3_medium_4k_instruct,LiveBench Mathematics Average,31.36,[],livebench_240701.csv openhermes_2_5_mistral_7b,LiveBench Mathematics Average,20.45,[],livebench_240701.csv open_mistral_nemo,LiveBench Mathematics Average,21.66,[],livebench_240701.csv mixtral_8x22b_instruct_v0_1,LiveBench Mathematics Average,28.33,[],livebench_240701.csv mixtral_8x7b_instruct_v0_1,LiveBench Mathematics Average,20.71,[],livebench_240701.csv mistral_small_2402,LiveBench Mathematics Average,28.15,[],livebench_240701.csv mistral_large_2407,LiveBench Mathematics Average,40.48,[],livebench_240701.csv mistral_large_2402,LiveBench Mathematics Average,32.2,[],livebench_240701.csv mistral_7b_instruct_v0_3,LiveBench Mathematics Average,14.56,[],livebench_240701.csv mistral_7b_instruct_v0_2,LiveBench Mathematics Average,17.08,[],livebench_240701.csv llama3_1_405b_instruct_turbo,LiveBench Mathematics Average,46.55,[],livebench_240701.csv llama3_1_70b_instruct_turbo,LiveBench Mathematics Average,45.58,[],livebench_240701.csv llama3_1_8b_instruct_turbo,LiveBench Mathematics Average,24.37,[],livebench_240701.csv llama3_70b_instruct,LiveBench Mathematics Average,32.31,[],livebench_240701.csv llama3_8b_instruct,LiveBench Mathematics 
Average,19.66,[],livebench_240701.csv mathstral_7b_v0_1,LiveBench Mathematics Average,17.84,[],livebench_240701.csv llama_2_7b_chat,LiveBench Mathematics Average,4.78,[],livebench_240701.csv hermes_3_llama3_1_70b,LiveBench Mathematics Average,28.32,[],livebench_240701.csv gpt_4o_mini_2024_07_18,LiveBench Mathematics Average,41.58,[],livebench_240701.csv gpt_4o_2024_08_06,LiveBench Mathematics Average,52.29,[],livebench_240701.csv gpt_4o_2024_05_13,LiveBench Mathematics Average,49.88,[],livebench_240701.csv gpt_4_turbo_2024_04_09,LiveBench Mathematics Average,48.99,[],livebench_240701.csv gpt_4_0613,LiveBench Mathematics Average,36.22,[],livebench_240701.csv gpt_4_0125_preview,LiveBench Mathematics Average,42.75,[],livebench_240701.csv gpt_3_5_turbo_0125,LiveBench Mathematics Average,26.93,[],livebench_240701.csv gemma_2_27b_it,LiveBench Mathematics Average,36.23,[],livebench_240701.csv gemma_2_9b_it,LiveBench Mathematics Average,23.98,[],livebench_240701.csv gemma_1_1_7b_it,LiveBench Mathematics Average,15.21,[],livebench_240701.csv gemini_1_5_pro_exp_0827,LiveBench Mathematics Average,56.28,[],livebench_240701.csv gemini_1_5_pro_exp_0801,LiveBench Mathematics Average,47.46,[],livebench_240701.csv gemini_1_5_pro_api_0514,LiveBench Mathematics Average,42.42,[],livebench_240701.csv gemini_1_5_flash_exp_0827,LiveBench Mathematics Average,36.29,[],livebench_240701.csv gemini_1_5_flash_api_0514,LiveBench Mathematics Average,38.89,[],livebench_240701.csv dracarys_llama3_1_70b_instruct,LiveBench Mathematics Average,45.68,[],livebench_240701.csv dracarys_72b_instruct,LiveBench Mathematics Average,42.77,[],livebench_240701.csv deepseek_v2_lite_chat,LiveBench Mathematics Average,14.08,[],livebench_240701.csv deepseek_coder_v2_lite_instruct,LiveBench Mathematics Average,34.44,[],livebench_240701.csv deepseek_coder_v2,LiveBench Mathematics Average,52.54,[],livebench_240701.csv deepseek_chat_v2,LiveBench Mathematics Average,52.11,[],livebench_240701.csv command_r_plus,LiveBench Mathematics Average,24.85,[],livebench_240701.csv command_r,LiveBench Mathematics Average,16.92,[],livebench_240701.csv claude_3_sonnet_20240229,LiveBench Mathematics Average,29.65,[],livebench_240701.csv claude_3_opus_20240229,LiveBench Mathematics Average,46.54,[],livebench_240701.csv claude_3_haiku_20240307,LiveBench Mathematics Average,25.72,[],livebench_240701.csv claude_3_5_sonnet_20240620,LiveBench Mathematics Average,53.75,[],livebench_240701.csv chatgpt_4o_latest,LiveBench Mathematics Average,52.19,[],livebench_240701.csv zephyr_7b_beta,LiveBench Data Analysis Average,15.75,[],livebench_240701.csv zephyr_7b_alpha,LiveBench Data Analysis Average,17.4,[],livebench_240701.csv yi_6b_chat,LiveBench Data Analysis Average,4.38,[],livebench_240701.csv vicuna_7b_v1_5_16k,LiveBench Data Analysis Average,9.93,[],livebench_240701.csv vicuna_7b_v1_5,LiveBench Data Analysis Average,3.33,[],livebench_240701.csv starling_lm_7b_beta,LiveBench Data Analysis Average,2.0,[],livebench_240701.csv smaug_qwen2_72b_instruct,LiveBench Data Analysis Average,26.19,[],livebench_240701.csv qwen2_72b_instruct,LiveBench Data Analysis Average,26.24,[],livebench_240701.csv qwen2_7b_instruct,LiveBench Data Analysis Average,28.75,[],livebench_240701.csv qwen2_1_5b_instruct,LiveBench Data Analysis Average,10.01,[],livebench_240701.csv qwen2_0_5b_instruct,LiveBench Data Analysis Average,2.0,[],livebench_240701.csv qwen1_5_110b_chat,LiveBench Data Analysis Average,31.45,[],livebench_240701.csv qwen1_5_72b_chat,LiveBench Data Analysis 
Average,32.98,[],livebench_240701.csv qwen1_5_7b_chat,LiveBench Data Analysis Average,16.9,[],livebench_240701.csv qwen1_5_4b_chat,LiveBench Data Analysis Average,9.13,[],livebench_240701.csv qwen1_5_1_8b_chat,LiveBench Data Analysis Average,3.33,[],livebench_240701.csv qwen1_5_0_5b_chat,LiveBench Data Analysis Average,0.0,[],livebench_240701.csv phi_3_5_moe_instruct,LiveBench Data Analysis Average,40.46,[],livebench_240701.csv phi_3_5_mini_instruct,LiveBench Data Analysis Average,30.43,[],livebench_240701.csv phi_3_small_128k_instruct,LiveBench Data Analysis Average,27.26,[],livebench_240701.csv phi_3_small_8k_instruct,LiveBench Data Analysis Average,29.62,[],livebench_240701.csv phi_3_mini_128k_instruct,LiveBench Data Analysis Average,34.02,[],livebench_240701.csv phi_3_mini_4k_instruct,LiveBench Data Analysis Average,29.55,[],livebench_240701.csv phi_3_medium_128k_instruct,LiveBench Data Analysis Average,32.12,[],livebench_240701.csv phi_3_medium_4k_instruct,LiveBench Data Analysis Average,31.63,[],livebench_240701.csv openhermes_2_5_mistral_7b,LiveBench Data Analysis Average,26.92,[],livebench_240701.csv open_mistral_nemo,LiveBench Data Analysis Average,33.35,[],livebench_240701.csv mixtral_8x22b_instruct_v0_1,LiveBench Data Analysis Average,31.67,[],livebench_240701.csv mixtral_8x7b_instruct_v0_1,LiveBench Data Analysis Average,28.13,[],livebench_240701.csv mistral_small_2402,LiveBench Data Analysis Average,31.88,[],livebench_240701.csv mistral_large_2407,LiveBench Data Analysis Average,46.61,[],livebench_240701.csv mistral_large_2402,LiveBench Data Analysis Average,42.55,[],livebench_240701.csv mistral_7b_instruct_v0_3,LiveBench Data Analysis Average,21.77,[],livebench_240701.csv mistral_7b_instruct_v0_2,LiveBench Data Analysis Average,14.62,[],livebench_240701.csv llama3_1_405b_instruct_turbo,LiveBench Data Analysis Average,53.51,[],livebench_240701.csv llama3_1_70b_instruct_turbo,LiveBench Data Analysis Average,50.29,[],livebench_240701.csv llama3_1_8b_instruct_turbo,LiveBench Data Analysis Average,32.15,[],livebench_240701.csv llama3_70b_instruct,LiveBench Data Analysis Average,43.75,[],livebench_240701.csv llama3_8b_instruct,LiveBench Data Analysis Average,26.0,[],livebench_240701.csv mathstral_7b_v0_1,LiveBench Data Analysis Average,27.89,[],livebench_240701.csv llama_2_7b_chat,LiveBench Data Analysis Average,0.0,[],livebench_240701.csv hermes_3_llama3_1_70b,LiveBench Data Analysis Average,48.11,[],livebench_240701.csv gpt_4o_mini_2024_07_18,LiveBench Data Analysis Average,44.52,[],livebench_240701.csv gpt_4o_2024_08_06,LiveBench Data Analysis Average,52.89,[],livebench_240701.csv gpt_4o_2024_05_13,LiveBench Data Analysis Average,52.41,[],livebench_240701.csv gpt_4_turbo_2024_04_09,LiveBench Data Analysis Average,51.32,[],livebench_240701.csv gpt_4_0613,LiveBench Data Analysis Average,44.03,[],livebench_240701.csv gpt_4_0125_preview,LiveBench Data Analysis Average,54.06,[],livebench_240701.csv gpt_3_5_turbo_0125,LiveBench Data Analysis Average,41.21,[],livebench_240701.csv gemma_2_27b_it,LiveBench Data Analysis Average,43.58,[],livebench_240701.csv gemma_2_9b_it,LiveBench Data Analysis Average,35.06,[],livebench_240701.csv gemma_1_1_7b_it,LiveBench Data Analysis Average,18.17,[],livebench_240701.csv gemini_1_5_pro_exp_0827,LiveBench Data Analysis Average,50.83,[],livebench_240701.csv gemini_1_5_pro_exp_0801,LiveBench Data Analysis Average,50.15,[],livebench_240701.csv gemini_1_5_pro_api_0514,LiveBench Data Analysis Average,52.81,[],livebench_240701.csv 
gemini_1_5_flash_exp_0827,LiveBench Data Analysis Average,47.87,[],livebench_240701.csv gemini_1_5_flash_api_0514,LiveBench Data Analysis Average,44.03,[],livebench_240701.csv dracarys_llama3_1_70b_instruct,LiveBench Data Analysis Average,47.99,[],livebench_240701.csv dracarys_72b_instruct,LiveBench Data Analysis Average,26.24,[],livebench_240701.csv deepseek_v2_lite_chat,LiveBench Data Analysis Average,18.19,[],livebench_240701.csv deepseek_coder_v2_lite_instruct,LiveBench Data Analysis Average,33.0,[],livebench_240701.csv deepseek_coder_v2,LiveBench Data Analysis Average,38.25,[],livebench_240701.csv deepseek_chat_v2,LiveBench Data Analysis Average,45.59,[],livebench_240701.csv command_r_plus,LiveBench Data Analysis Average,24.6,[],livebench_240701.csv command_r,LiveBench Data Analysis Average,31.69,[],livebench_240701.csv claude_3_sonnet_20240229,LiveBench Data Analysis Average,44.56,[],livebench_240701.csv claude_3_opus_20240229,LiveBench Data Analysis Average,54.32,[],livebench_240701.csv claude_3_haiku_20240307,LiveBench Data Analysis Average,41.54,[],livebench_240701.csv claude_3_5_sonnet_20240620,LiveBench Data Analysis Average,56.74,[],livebench_240701.csv chatgpt_4o_latest,LiveBench Data Analysis Average,54.43,[],livebench_240701.csv zephyr_7b_beta,LiveBench Language Average,4.28,[],livebench_240701.csv zephyr_7b_alpha,LiveBench Language Average,7.2,[],livebench_240701.csv yi_6b_chat,LiveBench Language Average,4.69,[],livebench_240701.csv vicuna_7b_v1_5_16k,LiveBench Language Average,7.92,[],livebench_240701.csv vicuna_7b_v1_5,LiveBench Language Average,8.66,[],livebench_240701.csv starling_lm_7b_beta,LiveBench Language Average,7.26,[],livebench_240701.csv smaug_qwen2_72b_instruct,LiveBench Language Average,30.03,[],livebench_240701.csv qwen2_72b_instruct,LiveBench Language Average,29.21,[],livebench_240701.csv qwen2_7b_instruct,LiveBench Language Average,10.21,[],livebench_240701.csv qwen2_1_5b_instruct,LiveBench Language Average,3.05,[],livebench_240701.csv qwen2_0_5b_instruct,LiveBench Language Average,2.8,[],livebench_240701.csv qwen1_5_110b_chat,LiveBench Language Average,13.22,[],livebench_240701.csv qwen1_5_72b_chat,LiveBench Language Average,11.37,[],livebench_240701.csv qwen1_5_7b_chat,LiveBench Language Average,6.18,[],livebench_240701.csv qwen1_5_4b_chat,LiveBench Language Average,5.8,[],livebench_240701.csv qwen1_5_1_8b_chat,LiveBench Language Average,3.16,[],livebench_240701.csv qwen1_5_0_5b_chat,LiveBench Language Average,2.88,[],livebench_240701.csv phi_3_5_moe_instruct,LiveBench Language Average,17.07,[],livebench_240701.csv phi_3_5_mini_instruct,LiveBench Language Average,9.67,[],livebench_240701.csv phi_3_small_128k_instruct,LiveBench Language Average,15.53,[],livebench_240701.csv phi_3_small_8k_instruct,LiveBench Language Average,15.13,[],livebench_240701.csv phi_3_mini_128k_instruct,LiveBench Language Average,7.76,[],livebench_240701.csv phi_3_mini_4k_instruct,LiveBench Language Average,8.06,[],livebench_240701.csv phi_3_medium_128k_instruct,LiveBench Language Average,12.76,[],livebench_240701.csv phi_3_medium_4k_instruct,LiveBench Language Average,13.91,[],livebench_240701.csv openhermes_2_5_mistral_7b,LiveBench Language Average,11.37,[],livebench_240701.csv open_mistral_nemo,LiveBench Language Average,14.15,[],livebench_240701.csv mixtral_8x22b_instruct_v0_1,LiveBench Language Average,26.48,[],livebench_240701.csv mixtral_8x7b_instruct_v0_1,LiveBench Language Average,13.76,[],livebench_240701.csv mistral_small_2402,LiveBench Language 
Average,22.06,[],livebench_240701.csv mistral_large_2407,LiveBench Language Average,39.79,[],livebench_240701.csv mistral_large_2402,LiveBench Language Average,28.74,[],livebench_240701.csv mistral_7b_instruct_v0_3,LiveBench Language Average,11.85,[],livebench_240701.csv mistral_7b_instruct_v0_2,LiveBench Language Average,9.05,[],livebench_240701.csv llama3_1_405b_instruct_turbo,LiveBench Language Average,49.85,[],livebench_240701.csv llama3_1_70b_instruct_turbo,LiveBench Language Average,42.36,[],livebench_240701.csv llama3_1_8b_instruct_turbo,LiveBench Language Average,20.05,[],livebench_240701.csv llama3_70b_instruct,LiveBench Language Average,34.11,[],livebench_240701.csv llama3_8b_instruct,LiveBench Language Average,18.72,[],livebench_240701.csv mathstral_7b_v0_1,LiveBench Language Average,15.37,[],livebench_240701.csv llama_2_7b_chat,LiveBench Language Average,6.86,[],livebench_240701.csv hermes_3_llama3_1_70b,LiveBench Language Average,43.77,[],livebench_240701.csv gpt_4o_mini_2024_07_18,LiveBench Language Average,35.28,[],livebench_240701.csv gpt_4o_2024_08_06,LiveBench Language Average,54.37,[],livebench_240701.csv gpt_4o_2024_05_13,LiveBench Language Average,53.94,[],livebench_240701.csv gpt_4_turbo_2024_04_09,LiveBench Language Average,45.26,[],livebench_240701.csv gpt_4_0613,LiveBench Language Average,49.57,[],livebench_240701.csv gpt_4_0125_preview,LiveBench Language Average,43.55,[],livebench_240701.csv gpt_3_5_turbo_0125,LiveBench Language Average,24.22,[],livebench_240701.csv gemma_2_27b_it,LiveBench Language Average,32.4,[],livebench_240701.csv gemma_2_9b_it,LiveBench Language Average,27.64,[],livebench_240701.csv gemma_1_1_7b_it,LiveBench Language Average,10.65,[],livebench_240701.csv gemini_1_5_pro_exp_0827,LiveBench Language Average,49.31,[],livebench_240701.csv gemini_1_5_pro_exp_0801,LiveBench Language Average,46.96,[],livebench_240701.csv gemini_1_5_pro_api_0514,LiveBench Language Average,38.25,[],livebench_240701.csv gemini_1_5_flash_exp_0827,LiveBench Language Average,31.04,[],livebench_240701.csv gemini_1_5_flash_api_0514,LiveBench Language Average,30.69,[],livebench_240701.csv dracarys_llama3_1_70b_instruct,LiveBench Language Average,41.77,[],livebench_240701.csv dracarys_72b_instruct,LiveBench Language Average,31.17,[],livebench_240701.csv deepseek_v2_lite_chat,LiveBench Language Average,9.2,[],livebench_240701.csv deepseek_coder_v2_lite_instruct,LiveBench Language Average,10.64,[],livebench_240701.csv deepseek_coder_v2,LiveBench Language Average,33.04,[],livebench_240701.csv deepseek_chat_v2,LiveBench Language Average,32.77,[],livebench_240701.csv command_r_plus,LiveBench Language Average,23.92,[],livebench_240701.csv command_r,LiveBench Language Average,14.64,[],livebench_240701.csv claude_3_sonnet_20240229,LiveBench Language Average,38.08,[],livebench_240701.csv claude_3_opus_20240229,LiveBench Language Average,51.72,[],livebench_240701.csv claude_3_haiku_20240307,LiveBench Language Average,30.07,[],livebench_240701.csv claude_3_5_sonnet_20240620,LiveBench Language Average,56.94,[],livebench_240701.csv chatgpt_4o_latest,LiveBench Language Average,49.95,[],livebench_240701.csv zephyr_7b_beta,LiveBench Instruction Following Average,48.32,[],livebench_240701.csv zephyr_7b_alpha,LiveBench Instruction Following Average,52.79,[],livebench_240701.csv yi_6b_chat,LiveBench Instruction Following Average,27.22,[],livebench_240701.csv vicuna_7b_v1_5_16k,LiveBench Instruction Following Average,42.12,[],livebench_240701.csv vicuna_7b_v1_5,LiveBench Instruction Following 
Average,41.75,[],livebench_240701.csv starling_lm_7b_beta,LiveBench Instruction Following Average,38.32,[],livebench_240701.csv smaug_qwen2_72b_instruct,LiveBench Instruction Following Average,65.0,[],livebench_240701.csv qwen2_72b_instruct,LiveBench Instruction Following Average,68.27,[],livebench_240701.csv qwen2_7b_instruct,LiveBench Instruction Following Average,44.74,[],livebench_240701.csv qwen2_1_5b_instruct,LiveBench Instruction Following Average,25.9,[],livebench_240701.csv qwen2_0_5b_instruct,LiveBench Instruction Following Average,26.63,[],livebench_240701.csv qwen1_5_110b_chat,LiveBench Instruction Following Average,55.26,[],livebench_240701.csv qwen1_5_72b_chat,LiveBench Instruction Following Average,58.25,[],livebench_240701.csv qwen1_5_7b_chat,LiveBench Instruction Following Average,44.12,[],livebench_240701.csv qwen1_5_4b_chat,LiveBench Instruction Following Average,27.75,[],livebench_240701.csv qwen1_5_1_8b_chat,LiveBench Instruction Following Average,22.9,[],livebench_240701.csv qwen1_5_0_5b_chat,LiveBench Instruction Following Average,21.3,[],livebench_240701.csv phi_3_5_moe_instruct,LiveBench Instruction Following Average,59.73,[],livebench_240701.csv phi_3_5_mini_instruct,LiveBench Instruction Following Average,58.3,[],livebench_240701.csv phi_3_small_128k_instruct,LiveBench Instruction Following Average,53.47,[],livebench_240701.csv phi_3_small_8k_instruct,LiveBench Instruction Following Average,55.81,[],livebench_240701.csv phi_3_mini_128k_instruct,LiveBench Instruction Following Average,51.4,[],livebench_240701.csv phi_3_mini_4k_instruct,LiveBench Instruction Following Average,51.25,[],livebench_240701.csv phi_3_medium_128k_instruct,LiveBench Instruction Following Average,56.15,[],livebench_240701.csv phi_3_medium_4k_instruct,LiveBench Instruction Following Average,53.3,[],livebench_240701.csv openhermes_2_5_mistral_7b,LiveBench Instruction Following Average,52.78,[],livebench_240701.csv open_mistral_nemo,LiveBench Instruction Following Average,51.8,[],livebench_240701.csv mixtral_8x22b_instruct_v0_1,LiveBench Instruction Following Average,63.17,[],livebench_240701.csv mixtral_8x7b_instruct_v0_1,LiveBench Instruction Following Average,44.81,[],livebench_240701.csv mistral_small_2402,LiveBench Instruction Following Average,63.91,[],livebench_240701.csv mistral_large_2407,LiveBench Instruction Following Average,71.85,[],livebench_240701.csv mistral_large_2402,LiveBench Instruction Following Average,68.19,[],livebench_240701.csv mistral_7b_instruct_v0_3,LiveBench Instruction Following Average,52.37,[],livebench_240701.csv mistral_7b_instruct_v0_2,LiveBench Instruction Following Average,51.65,[],livebench_240701.csv llama3_1_405b_instruct_turbo,LiveBench Instruction Following Average,78.47,[],livebench_240701.csv llama3_1_70b_instruct_turbo,LiveBench Instruction Following Average,79.08,[],livebench_240701.csv llama3_1_8b_instruct_turbo,LiveBench Instruction Following Average,56.53,[],livebench_240701.csv llama3_70b_instruct,LiveBench Instruction Following Average,63.5,[],livebench_240701.csv llama3_8b_instruct,LiveBench Instruction Following Average,57.14,[],livebench_240701.csv mathstral_7b_v0_1,LiveBench Instruction Following Average,53.25,[],livebench_240701.csv llama_2_7b_chat,LiveBench Instruction Following Average,44.88,[],livebench_240701.csv hermes_3_llama3_1_70b,LiveBench Instruction Following Average,55.37,[],livebench_240701.csv gpt_4o_mini_2024_07_18,LiveBench Instruction Following Average,65.68,[],livebench_240701.csv gpt_4o_2024_08_06,LiveBench Instruction 
Following Average,74.58,[],livebench_240701.csv gpt_4o_2024_05_13,LiveBench Instruction Following Average,72.17,[],livebench_240701.csv gpt_4_turbo_2024_04_09,LiveBench Instruction Following Average,71.39,[],livebench_240701.csv gpt_4_0613,LiveBench Instruction Following Average,71.79,[],livebench_240701.csv gpt_4_0125_preview,LiveBench Instruction Following Average,63.92,[],livebench_240701.csv gpt_3_5_turbo_0125,LiveBench Instruction Following Average,60.47,[],livebench_240701.csv gemma_2_27b_it,LiveBench Instruction Following Average,67.37,[],livebench_240701.csv gemma_2_9b_it,LiveBench Instruction Following Average,61.55,[],livebench_240701.csv gemma_1_1_7b_it,LiveBench Instruction Following Average,44.34,[],livebench_240701.csv gemini_1_5_pro_exp_0827,LiveBench Instruction Following Average,75.95,[],livebench_240701.csv gemini_1_5_pro_exp_0801,LiveBench Instruction Following Average,78.84,[],livebench_240701.csv gemini_1_5_pro_api_0514,LiveBench Instruction Following Average,67.2,[],livebench_240701.csv gemini_1_5_flash_exp_0827,LiveBench Instruction Following Average,78.11,[],livebench_240701.csv gemini_1_5_flash_api_0514,LiveBench Instruction Following Average,63.01,[],livebench_240701.csv dracarys_llama3_1_70b_instruct,LiveBench Instruction Following Average,77.37,[],livebench_240701.csv dracarys_72b_instruct,LiveBench Instruction Following Average,68.08,[],livebench_240701.csv deepseek_v2_lite_chat,LiveBench Instruction Following Average,41.83,[],livebench_240701.csv deepseek_coder_v2_lite_instruct,LiveBench Instruction Following Average,48.34,[],livebench_240701.csv deepseek_coder_v2,LiveBench Instruction Following Average,67.18,[],livebench_240701.csv deepseek_chat_v2,LiveBench Instruction Following Average,64.61,[],livebench_240701.csv command_r_plus,LiveBench Instruction Following Average,71.51,[],livebench_240701.csv command_r,LiveBench Instruction Following Average,57.16,[],livebench_240701.csv claude_3_sonnet_20240229,LiveBench Instruction Following Average,65.0,[],livebench_240701.csv claude_3_opus_20240229,LiveBench Instruction Following Average,70.87,[],livebench_240701.csv claude_3_haiku_20240307,LiveBench Instruction Following Average,64.03,[],livebench_240701.csv claude_3_5_sonnet_20240620,LiveBench Instruction Following Average,72.3,[],livebench_240701.csv chatgpt_4o_latest,LiveBench Instruction Following Average,72.52,[],livebench_240701.csv abab5_5_chat,Hydrox Integrity,8.09,,hydrox_safety_241001.csv abab5_5_chat,Hydrox Overall Score,6.6,,hydrox_safety_241001.csv abab5_5_chat,Hydrox Privacy,5.13,,hydrox_safety_241001.csv abab5_5_chat,Hydrox Safety,8.32,,hydrox_safety_241001.csv abab5_5_chat,Hydrox Security,4.85,,hydrox_safety_241001.csv abab5_5s_chat,Hydrox Integrity,19.46,,hydrox_safety_241001.csv abab5_5s_chat,Hydrox Overall Score,19.12,,hydrox_safety_241001.csv abab5_5s_chat,Hydrox Privacy,20.63,,hydrox_safety_241001.csv abab5_5s_chat,Hydrox Safety,22.54,,hydrox_safety_241001.csv abab5_5s_chat,Hydrox Security,14.17,,hydrox_safety_241001.csv claude_3_5_sonnet,Hydrox Integrity,95.56,,hydrox_safety_241001.csv claude_3_5_sonnet,Hydrox Overall Score,94.18,,hydrox_safety_241001.csv claude_3_5_sonnet,Hydrox Privacy,93.83,,hydrox_safety_241001.csv claude_3_5_sonnet,Hydrox Safety,94.75,,hydrox_safety_241001.csv claude_3_5_sonnet,Hydrox Security,92.61,,hydrox_safety_241001.csv claude_3_haiku,Hydrox Integrity,89.53,,hydrox_safety_241001.csv claude_3_haiku,Hydrox Overall Score,91.59,,hydrox_safety_241001.csv claude_3_haiku,Hydrox Privacy,93.69,,hydrox_safety_241001.csv 
claude_3_haiku,Hydrox Safety,91.52,,hydrox_safety_241001.csv claude_3_haiku,Hydrox Security,91.39,,hydrox_safety_241001.csv claude_3_opus,Hydrox Integrity,94.08,,hydrox_safety_241001.csv claude_3_opus,Hydrox Overall Score,92.02,,hydrox_safety_241001.csv claude_3_opus,Hydrox Privacy,91.26,,hydrox_safety_241001.csv claude_3_opus,Hydrox Safety,92.5,,hydrox_safety_241001.csv claude_3_opus,Hydrox Security,90.47,,hydrox_safety_241001.csv claude_3_sonnet,Hydrox Integrity,94.14,,hydrox_safety_241001.csv claude_3_sonnet,Hydrox Overall Score,93.62,,hydrox_safety_241001.csv claude_3_sonnet,Hydrox Privacy,94.36,,hydrox_safety_241001.csv claude_3_sonnet,Hydrox Safety,92.33,,hydrox_safety_241001.csv claude_3_sonnet,Hydrox Security,94.62,,hydrox_safety_241001.csv deepseek_v2_chat_0628,Hydrox Integrity,0.0,,hydrox_safety_241001.csv deepseek_v2_chat_0628,Hydrox Overall Score,50.0,,hydrox_safety_241001.csv deepseek_v2_chat_0628,Hydrox Privacy,0.0,,hydrox_safety_241001.csv deepseek_v2_chat_0628,Hydrox Safety,50.0,,hydrox_safety_241001.csv deepseek_v2_chat_0628,Hydrox Security,0.0,,hydrox_safety_241001.csv deepseek_v2_lite_chat,Hydrox Integrity,45.93,,hydrox_safety_241001.csv deepseek_v2_lite_chat,Hydrox Overall Score,44.91,,hydrox_safety_241001.csv deepseek_v2_lite_chat,Hydrox Privacy,48.84,,hydrox_safety_241001.csv deepseek_v2_lite_chat,Hydrox Safety,44.26,,hydrox_safety_241001.csv deepseek_v2_lite_chat,Hydrox Security,41.91,,hydrox_safety_241001.csv dolly_v2_12b,Hydrox Integrity,3.72,,hydrox_safety_241001.csv dolly_v2_12b,Hydrox Overall Score,6.21,,hydrox_safety_241001.csv dolly_v2_12b,Hydrox Privacy,3.48,,hydrox_safety_241001.csv dolly_v2_12b,Hydrox Safety,11.46,,hydrox_safety_241001.csv dolly_v2_12b,Hydrox Security,3.39,,hydrox_safety_241001.csv dolly_v2_3b,Hydrox Integrity,0.18,,hydrox_safety_241001.csv dolly_v2_3b,Hydrox Overall Score,1.81,,hydrox_safety_241001.csv dolly_v2_3b,Hydrox Privacy,1.08,,hydrox_safety_241001.csv dolly_v2_3b,Hydrox Safety,4.08,,hydrox_safety_241001.csv dolly_v2_3b,Hydrox Security,0.55,,hydrox_safety_241001.csv dolly_v2_7b,Hydrox Integrity,8.33,,hydrox_safety_241001.csv dolly_v2_7b,Hydrox Overall Score,7.79,,hydrox_safety_241001.csv dolly_v2_7b,Hydrox Privacy,8.33,,hydrox_safety_241001.csv dolly_v2_7b,Hydrox Safety,9.92,,hydrox_safety_241001.csv dolly_v2_7b,Hydrox Security,4.96,,hydrox_safety_241001.csv falcon_40b,Hydrox Integrity,0.64,,hydrox_safety_241001.csv falcon_40b,Hydrox Overall Score,0.9,,hydrox_safety_241001.csv falcon_40b,Hydrox Privacy,0.25,,hydrox_safety_241001.csv falcon_40b,Hydrox Safety,2.08,,hydrox_safety_241001.csv falcon_40b,Hydrox Security,0.4,,hydrox_safety_241001.csv falcon_40b_instruct,Hydrox Integrity,30.32,,hydrox_safety_241001.csv falcon_40b_instruct,Hydrox Overall Score,27.55,,hydrox_safety_241001.csv falcon_40b_instruct,Hydrox Privacy,30.83,,hydrox_safety_241001.csv falcon_40b_instruct,Hydrox Safety,28.1,,hydrox_safety_241001.csv falcon_40b_instruct,Hydrox Security,22.97,,hydrox_safety_241001.csv falcon_7b,Hydrox Integrity,0.23,,hydrox_safety_241001.csv falcon_7b,Hydrox Overall Score,0.51,,hydrox_safety_241001.csv falcon_7b,Hydrox Privacy,0.11,,hydrox_safety_241001.csv falcon_7b,Hydrox Safety,1.05,,hydrox_safety_241001.csv falcon_7b,Hydrox Security,0.43,,hydrox_safety_241001.csv falcon_7b_instruct,Hydrox Integrity,15.76,,hydrox_safety_241001.csv falcon_7b_instruct,Hydrox Overall Score,14.01,,hydrox_safety_241001.csv falcon_7b_instruct,Hydrox Privacy,11.3,,hydrox_safety_241001.csv falcon_7b_instruct,Hydrox Safety,14.64,,hydrox_safety_241001.csv 
falcon_7b_instruct,Hydrox Security,14.01,,hydrox_safety_241001.csv gemini_1_0_pro,Hydrox Integrity,87.11,,hydrox_safety_241001.csv gemini_1_0_pro,Hydrox Overall Score,77.2,,hydrox_safety_241001.csv gemini_1_0_pro,Hydrox Privacy,90.39,,hydrox_safety_241001.csv gemini_1_0_pro,Hydrox Safety,65.18,,hydrox_safety_241001.csv gemini_1_0_pro,Hydrox Security,79.93,,hydrox_safety_241001.csv gemini_1_0_pro_latest,Hydrox Integrity,88.61,,hydrox_safety_241001.csv gemini_1_0_pro_latest,Hydrox Overall Score,78.29,,hydrox_safety_241001.csv gemini_1_0_pro_latest,Hydrox Privacy,87.82,,hydrox_safety_241001.csv gemini_1_0_pro_latest,Hydrox Safety,69.2,,hydrox_safety_241001.csv gemini_1_0_pro_latest,Hydrox Security,77.91,,hydrox_safety_241001.csv gemini_1_5_flash,Hydrox Integrity,60.0,,hydrox_safety_241001.csv gemini_1_5_flash,Hydrox Overall Score,74.43,,hydrox_safety_241001.csv gemini_1_5_flash,Hydrox Privacy,83.33,,hydrox_safety_241001.csv gemini_1_5_flash,Hydrox Safety,77.61,,hydrox_safety_241001.csv gemini_1_5_flash,Hydrox Security,72.05,,hydrox_safety_241001.csv gemini_1_5_pro,Hydrox Integrity,40.84,,hydrox_safety_241001.csv gemini_1_5_pro,Hydrox Overall Score,43.27,,hydrox_safety_241001.csv gemini_1_5_pro,Hydrox Privacy,40.63,,hydrox_safety_241001.csv gemini_1_5_pro,Hydrox Safety,46.99,,hydrox_safety_241001.csv gemini_1_5_pro,Hydrox Security,41.65,,hydrox_safety_241001.csv gemini_pro,Hydrox Integrity,84.42,,hydrox_safety_241001.csv gemini_pro,Hydrox Overall Score,73.04,,hydrox_safety_241001.csv gemini_pro,Hydrox Privacy,90.6,,hydrox_safety_241001.csv gemini_pro,Hydrox Safety,63.56,,hydrox_safety_241001.csv gemini_pro,Hydrox Security,67.49,,hydrox_safety_241001.csv gemma_2_27b_it,Hydrox Integrity,10.94,,hydrox_safety_241001.csv gemma_2_27b_it,Hydrox Overall Score,9.67,,hydrox_safety_241001.csv gemma_2_27b_it,Hydrox Privacy,11.11,,hydrox_safety_241001.csv gemma_2_27b_it,Hydrox Safety,8.1,,hydrox_safety_241001.csv gemma_2_27b_it,Hydrox Security,10.0,,hydrox_safety_241001.csv gemma_2_2b,Hydrox Integrity,24.88,,hydrox_safety_241001.csv gemma_2_2b,Hydrox Overall Score,25.5,,hydrox_safety_241001.csv gemma_2_2b,Hydrox Privacy,27.04,,hydrox_safety_241001.csv gemma_2_2b,Hydrox Safety,25.61,,hydrox_safety_241001.csv gemma_2_2b,Hydrox Security,24.5,,hydrox_safety_241001.csv gemma_2_2b_it,Hydrox Integrity,93.14,,hydrox_safety_241001.csv gemma_2_2b_it,Hydrox Overall Score,91.66,,hydrox_safety_241001.csv gemma_2_2b_it,Hydrox Privacy,92.43,,hydrox_safety_241001.csv gemma_2_2b_it,Hydrox Safety,92.15,,hydrox_safety_241001.csv gemma_2_2b_it,Hydrox Security,89.22,,hydrox_safety_241001.csv gemma_2b,Hydrox Integrity,6.39,,hydrox_safety_241001.csv gemma_2b,Hydrox Overall Score,7.99,,hydrox_safety_241001.csv gemma_2b,Hydrox Privacy,8.27,,hydrox_safety_241001.csv gemma_2b,Hydrox Safety,8.55,,hydrox_safety_241001.csv gemma_2b,Hydrox Security,8.09,,hydrox_safety_241001.csv gpt_3_5_turbo_0613,Hydrox Integrity,80.84,,hydrox_safety_241001.csv gpt_3_5_turbo_0613,Hydrox Overall Score,72.04,,hydrox_safety_241001.csv gpt_3_5_turbo_0613,Hydrox Privacy,90.0,,hydrox_safety_241001.csv gpt_3_5_turbo_0613,Hydrox Safety,56.94,,hydrox_safety_241001.csv gpt_3_5_turbo_0613,Hydrox Security,93.43,,hydrox_safety_241001.csv gpt_4_0314,Hydrox Integrity,54.0,,hydrox_safety_241001.csv gpt_4_0314,Hydrox Overall Score,62.51,,hydrox_safety_241001.csv gpt_4_0314,Hydrox Privacy,76.67,,hydrox_safety_241001.csv gpt_4_0314,Hydrox Safety,56.36,,hydrox_safety_241001.csv gpt_4_0314,Hydrox Security,72.79,,hydrox_safety_241001.csv gpt_4_0613,Hydrox 
Integrity,96.04,,hydrox_safety_241001.csv gpt_4_0613,Hydrox Overall Score,85.43,,hydrox_safety_241001.csv gpt_4_0613,Hydrox Privacy,91.79,,hydrox_safety_241001.csv gpt_4_0613,Hydrox Safety,79.94,,hydrox_safety_241001.csv gpt_4_0613,Hydrox Security,92.0,,hydrox_safety_241001.csv gpt_4o_2024_05_13,Hydrox Integrity,63.54,,hydrox_safety_241001.csv gpt_4o_2024_05_13,Hydrox Overall Score,65.26,,hydrox_safety_241001.csv gpt_4o_2024_05_13,Hydrox Privacy,68.46,,hydrox_safety_241001.csv gpt_4o_2024_05_13,Hydrox Safety,67.11,,hydrox_safety_241001.csv gpt_4o_2024_05_13,Hydrox Security,60.89,,hydrox_safety_241001.csv gpt_4o_mini_2024_07_18,Hydrox Integrity,81.38,,hydrox_safety_241001.csv gpt_4o_mini_2024_07_18,Hydrox Overall Score,80.43,,hydrox_safety_241001.csv gpt_4o_mini_2024_07_18,Hydrox Privacy,82.32,,hydrox_safety_241001.csv gpt_4o_mini_2024_07_18,Hydrox Safety,80.87,,hydrox_safety_241001.csv gpt_4o_mini_2024_07_18,Hydrox Security,77.55,,hydrox_safety_241001.csv h2ogpt_4096_llama2_70b_chat,Hydrox Integrity,65.75,,hydrox_safety_241001.csv h2ogpt_4096_llama2_70b_chat,Hydrox Overall Score,63.67,,hydrox_safety_241001.csv h2ogpt_4096_llama2_70b_chat,Hydrox Privacy,73.46,,hydrox_safety_241001.csv h2ogpt_4096_llama2_70b_chat,Hydrox Safety,63.64,,hydrox_safety_241001.csv h2ogpt_4096_llama2_70b_chat,Hydrox Security,63.34,,hydrox_safety_241001.csv hydro_safe_dolly_v2_7b_dpo_full,Hydrox Integrity,5.96,,hydrox_safety_241001.csv hydro_safe_dolly_v2_7b_dpo_full,Hydrox Overall Score,7.64,,hydrox_safety_241001.csv hydro_safe_dolly_v2_7b_dpo_full,Hydrox Privacy,6.16,,hydrox_safety_241001.csv hydro_safe_dolly_v2_7b_dpo_full,Hydrox Safety,11.03,,hydrox_safety_241001.csv hydro_safe_dolly_v2_7b_dpo_full,Hydrox Security,5.1,,hydrox_safety_241001.csv hydro_safe_dolly_v2_7b_dpo_full_3_epoch,Hydrox Integrity,35.51,,hydrox_safety_241001.csv hydro_safe_dolly_v2_7b_dpo_full_3_epoch,Hydrox Overall Score,27.81,,hydrox_safety_241001.csv hydro_safe_dolly_v2_7b_dpo_full_3_epoch,Hydrox Privacy,32.34,,hydrox_safety_241001.csv hydro_safe_dolly_v2_7b_dpo_full_3_epoch,Hydrox Safety,22.95,,hydrox_safety_241001.csv hydro_safe_dolly_v2_7b_dpo_full_3_epoch,Hydrox Security,25.64,,hydrox_safety_241001.csv hydro_safe_llama2_7b_chat_dpo_full_3_epoch,Hydrox Integrity,84.27,,hydrox_safety_241001.csv hydro_safe_llama2_7b_chat_dpo_full_3_epoch,Hydrox Overall Score,83.93,,hydrox_safety_241001.csv hydro_safe_llama2_7b_chat_dpo_full_3_epoch,Hydrox Privacy,90.63,,hydrox_safety_241001.csv hydro_safe_llama2_7b_chat_dpo_full_3_epoch,Hydrox Safety,79.83,,hydrox_safety_241001.csv hydro_safe_llama2_7b_chat_dpo_full_3_epoch,Hydrox Security,84.68,,hydrox_safety_241001.csv hydro_safe_mistral_7b_instruct_v0_1_dpo_full_1_epoch,Hydrox Integrity,97.74,,hydrox_safety_241001.csv hydro_safe_mistral_7b_instruct_v0_1_dpo_full_1_epoch,Hydrox Overall Score,91.6,,hydrox_safety_241001.csv hydro_safe_mistral_7b_instruct_v0_1_dpo_full_1_epoch,Hydrox Privacy,96.21,,hydrox_safety_241001.csv hydro_safe_mistral_7b_instruct_v0_1_dpo_full_1_epoch,Hydrox Safety,86.56,,hydrox_safety_241001.csv hydro_safe_mistral_7b_instruct_v0_1_dpo_full_1_epoch,Hydrox Security,91.35,,hydrox_safety_241001.csv hydro_safe_mistral_7b_v0_1_dpo_full,Hydrox Integrity,98.16,,hydrox_safety_241001.csv hydro_safe_mistral_7b_v0_1_dpo_full,Hydrox Overall Score,94.44,,hydrox_safety_241001.csv hydro_safe_mistral_7b_v0_1_dpo_full,Hydrox Privacy,99.62,,hydrox_safety_241001.csv hydro_safe_mistral_7b_v0_1_dpo_full,Hydrox Safety,89.41,,hydrox_safety_241001.csv hydro_safe_mistral_7b_v0_1_dpo_full,Hydrox 
Security,96.66,,hydrox_safety_241001.csv hydro_safe_sheared_llama_1_3b_dpo_full,Hydrox Integrity,35.98,,hydrox_safety_241001.csv hydro_safe_sheared_llama_1_3b_dpo_full,Hydrox Overall Score,31.87,,hydrox_safety_241001.csv hydro_safe_sheared_llama_1_3b_dpo_full,Hydrox Privacy,45.3,,hydrox_safety_241001.csv hydro_safe_sheared_llama_1_3b_dpo_full,Hydrox Safety,26.44,,hydrox_safety_241001.csv hydro_safe_sheared_llama_1_3b_dpo_full,Hydrox Security,27.07,,hydrox_safety_241001.csv hydro_safe_zephyr_td_full,Hydrox Integrity,71.25,,hydrox_safety_241001.csv hydro_safe_zephyr_td_full,Hydrox Overall Score,78.18,,hydrox_safety_241001.csv hydro_safe_zephyr_td_full,Hydrox Privacy,49.7,,hydrox_safety_241001.csv hydro_safe_zephyr_td_full,Hydrox Safety,78.18,,hydrox_safety_241001.csv hydro_safe_zephyr_td_full,Hydrox Security,66.63,,hydrox_safety_241001.csv komt_mistral_7b_v1,Hydrox Integrity,0.0,,hydrox_safety_241001.csv komt_mistral_7b_v1,Hydrox Overall Score,0.13,,hydrox_safety_241001.csv komt_mistral_7b_v1,Hydrox Privacy,0.02,,hydrox_safety_241001.csv komt_mistral_7b_v1,Hydrox Safety,0.65,,hydrox_safety_241001.csv komt_mistral_7b_v1,Hydrox Security,0.0,,hydrox_safety_241001.csv llama3_2_1b_instruct,Hydrox Integrity,76.98,,hydrox_safety_241001.csv llama3_2_1b_instruct,Hydrox Overall Score,75.78,,hydrox_safety_241001.csv llama3_2_1b_instruct,Hydrox Privacy,75.71,,hydrox_safety_241001.csv llama3_2_1b_instruct,Hydrox Safety,76.25,,hydrox_safety_241001.csv llama3_2_1b_instruct,Hydrox Security,74.2,,hydrox_safety_241001.csv llama3_2_3b_instruct,Hydrox Integrity,79.24,,hydrox_safety_241001.csv llama3_2_3b_instruct,Hydrox Overall Score,77.42,,hydrox_safety_241001.csv llama3_2_3b_instruct,Hydrox Privacy,77.9,,hydrox_safety_241001.csv llama3_2_3b_instruct,Hydrox Safety,79.46,,hydrox_safety_241001.csv llama3_2_3b_instruct,Hydrox Security,72.51,,hydrox_safety_241001.csv llama3_70b_instruct,Hydrox Integrity,73.55,,hydrox_safety_241001.csv llama3_70b_instruct,Hydrox Overall Score,74.44,,hydrox_safety_241001.csv llama3_70b_instruct,Hydrox Privacy,80.65,,hydrox_safety_241001.csv llama3_70b_instruct,Hydrox Safety,74.65,,hydrox_safety_241001.csv llama3_70b_instruct,Hydrox Security,70.21,,hydrox_safety_241001.csv llama3_8b_instruct,Hydrox Integrity,80.86,,hydrox_safety_241001.csv llama3_8b_instruct,Hydrox Overall Score,83.72,,hydrox_safety_241001.csv llama3_8b_instruct,Hydrox Privacy,88.61,,hydrox_safety_241001.csv llama3_8b_instruct,Hydrox Safety,83.32,,hydrox_safety_241001.csv llama3_8b_instruct,Hydrox Security,82.51,,hydrox_safety_241001.csv llama_2_13b_chat,Hydrox Integrity,62.67,,hydrox_safety_241001.csv llama_2_13b_chat,Hydrox Overall Score,60.0,,hydrox_safety_241001.csv llama_2_13b_chat,Hydrox Privacy,63.37,,hydrox_safety_241001.csv llama_2_13b_chat,Hydrox Safety,58.6,,hydrox_safety_241001.csv llama_2_13b_chat,Hydrox Security,57.85,,hydrox_safety_241001.csv llama_2_70b_chat,Hydrox Integrity,63.0,,hydrox_safety_241001.csv llama_2_70b_chat,Hydrox Overall Score,62.5,,hydrox_safety_241001.csv llama_2_70b_chat,Hydrox Privacy,68.87,,hydrox_safety_241001.csv llama_2_70b_chat,Hydrox Safety,61.0,,hydrox_safety_241001.csv llama_2_70b_chat,Hydrox Security,59.58,,hydrox_safety_241001.csv llama_2_7b_chat,Hydrox Integrity,51.63,,hydrox_safety_241001.csv llama_2_7b_chat,Hydrox Overall Score,51.26,,hydrox_safety_241001.csv llama_2_7b_chat,Hydrox Privacy,55.3,,hydrox_safety_241001.csv llama_2_7b_chat,Hydrox Safety,52.3,,hydrox_safety_241001.csv llama_2_7b_chat,Hydrox Security,46.71,,hydrox_safety_241001.csv 
mistral_7b_instruct_v0_1,Hydrox Integrity,12.39,,hydrox_safety_241001.csv
mistral_7b_instruct_v0_1,Hydrox Overall Score,16.74,,hydrox_safety_241001.csv
mistral_7b_instruct_v0_1,Hydrox Privacy,12.08,,hydrox_safety_241001.csv
mistral_7b_instruct_v0_1,Hydrox Safety,26.91,,hydrox_safety_241001.csv
mistral_7b_instruct_v0_1,Hydrox Security,10.86,,hydrox_safety_241001.csv
mistral_7b_instruct_v0_2,Hydrox Integrity,32.52,,hydrox_safety_241001.csv
mistral_7b_instruct_v0_2,Hydrox Overall Score,36.82,,hydrox_safety_241001.csv
mistral_7b_instruct_v0_2,Hydrox Privacy,37.18,,hydrox_safety_241001.csv
mistral_7b_instruct_v0_2,Hydrox Safety,41.71,,hydrox_safety_241001.csv
mistral_7b_instruct_v0_2,Hydrox Security,32.24,,hydrox_safety_241001.csv
mistral_7b_v0_1,Hydrox Integrity,8.53,,hydrox_safety_241001.csv
mistral_7b_v0_1,Hydrox Overall Score,7.32,,hydrox_safety_241001.csv
mistral_7b_v0_1,Hydrox Privacy,4.18,,hydrox_safety_241001.csv
mistral_7b_v0_1,Hydrox Safety,11.38,,hydrox_safety_241001.csv
mistral_7b_v0_1,Hydrox Security,4.44,,hydrox_safety_241001.csv
mixtral_8x7b_instruct_v0_1,Hydrox Integrity,21.23,,hydrox_safety_241001.csv
mixtral_8x7b_instruct_v0_1,Hydrox Overall Score,23.75,,hydrox_safety_241001.csv
mixtral_8x7b_instruct_v0_1,Hydrox Privacy,25.04,,hydrox_safety_241001.csv
mixtral_8x7b_instruct_v0_1,Hydrox Safety,27.7,,hydrox_safety_241001.csv
mixtral_8x7b_instruct_v0_1,Hydrox Security,18.24,,hydrox_safety_241001.csv
mixtral_8x7b_v0_1,Hydrox Integrity,8.16,,hydrox_safety_241001.csv
mixtral_8x7b_v0_1,Hydrox Overall Score,8.81,,hydrox_safety_241001.csv
mixtral_8x7b_v0_1,Hydrox Privacy,8.81,,hydrox_safety_241001.csv
mixtral_8x7b_v0_1,Hydrox Safety,10.61,,hydrox_safety_241001.csv
mixtral_8x7b_v0_1,Hydrox Security,6.73,,hydrox_safety_241001.csv
neural_chat_7b_v3_1,Hydrox Integrity,22.84,,hydrox_safety_241001.csv
neural_chat_7b_v3_1,Hydrox Overall Score,17.86,,hydrox_safety_241001.csv
neural_chat_7b_v3_1,Hydrox Privacy,22.28,,hydrox_safety_241001.csv
neural_chat_7b_v3_1,Hydrox Safety,15.86,,hydrox_safety_241001.csv
neural_chat_7b_v3_1,Hydrox Security,14.72,,hydrox_safety_241001.csv
neural_chat_7b_v3_2,Hydrox Integrity,15.33,,hydrox_safety_241001.csv
neural_chat_7b_v3_2,Hydrox Overall Score,17.82,,hydrox_safety_241001.csv
neural_chat_7b_v3_2,Hydrox Privacy,14.36,,hydrox_safety_241001.csv
neural_chat_7b_v3_2,Hydrox Safety,19.68,,hydrox_safety_241001.csv
neural_chat_7b_v3_2,Hydrox Security,18.62,,hydrox_safety_241001.csv
nexusraven_v2_13b,Hydrox Integrity,4.5,,hydrox_safety_241001.csv
nexusraven_v2_13b,Hydrox Overall Score,4.16,,hydrox_safety_241001.csv
nexusraven_v2_13b,Hydrox Privacy,3.13,,hydrox_safety_241001.csv
nexusraven_v2_13b,Hydrox Safety,3.95,,hydrox_safety_241001.csv
nexusraven_v2_13b,Hydrox Security,4.77,,hydrox_safety_241001.csv
notus_7b_v1,Hydrox Integrity,19.5,,hydrox_safety_241001.csv
notus_7b_v1,Hydrox Overall Score,21.3,,hydrox_safety_241001.csv
notus_7b_v1,Hydrox Privacy,22.05,,hydrox_safety_241001.csv
notus_7b_v1,Hydrox Safety,26.55,,hydrox_safety_241001.csv
notus_7b_v1,Hydrox Security,15.53,,hydrox_safety_241001.csv
orca_2_13b,Hydrox Integrity,0.0,,hydrox_safety_241001.csv
orca_2_13b,Hydrox Overall Score,17.48,,hydrox_safety_241001.csv
orca_2_13b,Hydrox Privacy,27.78,,hydrox_safety_241001.csv
orca_2_13b,Hydrox Safety,33.06,,hydrox_safety_241001.csv
orca_2_13b,Hydrox Security,0.0,,hydrox_safety_241001.csv
orca_2_7b,Hydrox Integrity,22.09,,hydrox_safety_241001.csv
orca_2_7b,Hydrox Overall Score,19.53,,hydrox_safety_241001.csv
orca_2_7b,Hydrox Privacy,18.31,,hydrox_safety_241001.csv
orca_2_7b,Hydrox Safety,18.3,,hydrox_safety_241001.csv
orca_2_7b,Hydrox Security,20.52,,hydrox_safety_241001.csv
pythia_70m_deduped,Hydrox Integrity,0.0,,hydrox_safety_241001.csv
pythia_70m_deduped,Hydrox Overall Score,0.0,,hydrox_safety_241001.csv
pythia_70m_deduped,Hydrox Privacy,0.0,,hydrox_safety_241001.csv
pythia_70m_deduped,Hydrox Safety,0.0,,hydrox_safety_241001.csv
pythia_70m_deduped,Hydrox Security,0.0,,hydrox_safety_241001.csv
qwen2_72b_instruct,Hydrox Integrity,70.13,,hydrox_safety_241001.csv
qwen2_72b_instruct,Hydrox Overall Score,71.86,,hydrox_safety_241001.csv
qwen2_72b_instruct,Hydrox Privacy,73.4,,hydrox_safety_241001.csv
qwen2_72b_instruct,Hydrox Safety,77.1,,hydrox_safety_241001.csv
qwen2_72b_instruct,Hydrox Security,65.19,,hydrox_safety_241001.csv
sheared_llama_1_3b,Hydrox Integrity,0.04,,hydrox_safety_241001.csv
sheared_llama_1_3b,Hydrox Overall Score,0.29,,hydrox_safety_241001.csv
sheared_llama_1_3b,Hydrox Privacy,0.05,,hydrox_safety_241001.csv
sheared_llama_1_3b,Hydrox Safety,1.14,,hydrox_safety_241001.csv
sheared_llama_1_3b,Hydrox Security,0.03,,hydrox_safety_241001.csv
solar_0_70b_16bit,Hydrox Integrity,30.25,,hydrox_safety_241001.csv
solar_0_70b_16bit,Hydrox Overall Score,24.5,,hydrox_safety_241001.csv
solar_0_70b_16bit,Hydrox Privacy,33.8,,hydrox_safety_241001.csv
solar_0_70b_16bit,Hydrox Safety,22.4,,hydrox_safety_241001.csv
solar_0_70b_16bit,Hydrox Security,17.55,,hydrox_safety_241001.csv
tinyllama_1_1b_chat_v1_0,Hydrox Integrity,5.65,,hydrox_safety_241001.csv
tinyllama_1_1b_chat_v1_0,Hydrox Overall Score,5.38,,hydrox_safety_241001.csv
tinyllama_1_1b_chat_v1_0,Hydrox Privacy,3.3,,hydrox_safety_241001.csv
tinyllama_1_1b_chat_v1_0,Hydrox Safety,6.87,,hydrox_safety_241001.csv
tinyllama_1_1b_chat_v1_0,Hydrox Security,4.57,,hydrox_safety_241001.csv
vicuna_13b_v1_5,Hydrox Integrity,36.08,,hydrox_safety_241001.csv
vicuna_13b_v1_5,Hydrox Overall Score,34.07,,hydrox_safety_241001.csv
vicuna_13b_v1_5,Hydrox Privacy,29.78,,hydrox_safety_241001.csv
vicuna_13b_v1_5,Hydrox Safety,38.46,,hydrox_safety_241001.csv
vicuna_13b_v1_5,Hydrox Security,30.71,,hydrox_safety_241001.csv
vicuna_13b_v1_5_16k,Hydrox Integrity,22.25,,hydrox_safety_241001.csv
vicuna_13b_v1_5_16k,Hydrox Overall Score,19.31,,hydrox_safety_241001.csv
vicuna_13b_v1_5_16k,Hydrox Privacy,17.01,,hydrox_safety_241001.csv
vicuna_13b_v1_5_16k,Hydrox Safety,21.14,,hydrox_safety_241001.csv
vicuna_13b_v1_5_16k,Hydrox Security,16.99,,hydrox_safety_241001.csv
vicuna_33b_v1_3,Hydrox Integrity,18.64,,hydrox_safety_241001.csv
vicuna_33b_v1_3,Hydrox Overall Score,17.64,,hydrox_safety_241001.csv
vicuna_33b_v1_3,Hydrox Privacy,21.34,,hydrox_safety_241001.csv
vicuna_33b_v1_3,Hydrox Safety,18.42,,hydrox_safety_241001.csv
vicuna_33b_v1_3,Hydrox Security,13.89,,hydrox_safety_241001.csv
vicuna_7b_v1_5,Hydrox Integrity,11.74,,hydrox_safety_241001.csv
vicuna_7b_v1_5,Hydrox Overall Score,15.37,,hydrox_safety_241001.csv
vicuna_7b_v1_5,Hydrox Privacy,10.91,,hydrox_safety_241001.csv
vicuna_7b_v1_5,Hydrox Safety,22.47,,hydrox_safety_241001.csv
vicuna_7b_v1_5,Hydrox Security,12.61,,hydrox_safety_241001.csv
viking_13b,Hydrox Integrity,7.68,,hydrox_safety_241001.csv
viking_13b,Hydrox Overall Score,7.33,,hydrox_safety_241001.csv
viking_13b,Hydrox Privacy,8.32,,hydrox_safety_241001.csv
viking_13b,Hydrox Safety,7.75,,hydrox_safety_241001.csv
viking_13b,Hydrox Security,5.76,,hydrox_safety_241001.csv
viking_33b,Hydrox Integrity,6.38,,hydrox_safety_241001.csv
viking_33b,Hydrox Overall Score,6.73,,hydrox_safety_241001.csv
viking_33b,Hydrox Privacy,6.48,,hydrox_safety_241001.csv
viking_33b,Hydrox Safety,6.87,,hydrox_safety_241001.csv
viking_33b,Hydrox Security,6.92,,hydrox_safety_241001.csv
viking_7b,Hydrox Integrity,9.05,,hydrox_safety_241001.csv
viking_7b,Hydrox Overall Score,6.15,,hydrox_safety_241001.csv
viking_7b,Hydrox Privacy,3.91,,hydrox_safety_241001.csv
viking_7b,Hydrox Safety,5.37,,hydrox_safety_241001.csv
viking_7b,Hydrox Security,7.6,,hydrox_safety_241001.csv
wizardlm_30b_v1_0,Hydrox Integrity,5.58,,hydrox_safety_241001.csv
wizardlm_30b_v1_0,Hydrox Overall Score,6.41,,hydrox_safety_241001.csv
wizardlm_30b_v1_0,Hydrox Privacy,3.88,,hydrox_safety_241001.csv
wizardlm_30b_v1_0,Hydrox Safety,8.0,,hydrox_safety_241001.csv
wizardlm_30b_v1_0,Hydrox Security,6.49,,hydrox_safety_241001.csv
yi_6b_chat,Hydrox Integrity,36.02,,hydrox_safety_241001.csv
yi_6b_chat,Hydrox Overall Score,37.0,,hydrox_safety_241001.csv
yi_6b_chat,Hydrox Privacy,45.36,,hydrox_safety_241001.csv
yi_6b_chat,Hydrox Safety,37.35,,hydrox_safety_241001.csv
yi_6b_chat,Hydrox Security,31.49,,hydrox_safety_241001.csv
zephyr_7b_beta,Hydrox Integrity,24.95,,hydrox_safety_241001.csv
zephyr_7b_beta,Hydrox Overall Score,23.8,,hydrox_safety_241001.csv
zephyr_7b_beta,Hydrox Privacy,30.6,,hydrox_safety_241001.csv
zephyr_7b_beta,Hydrox Safety,21.2,,hydrox_safety_241001.csv
zephyr_7b_beta,Hydrox Security,22.4,,hydrox_safety_241001.csv
zephyr_reproduction_dpo_full,Hydrox Integrity,26.05,,hydrox_safety_241001.csv
zephyr_reproduction_dpo_full,Hydrox Overall Score,21.38,,hydrox_safety_241001.csv
zephyr_reproduction_dpo_full,Hydrox Privacy,21.65,,hydrox_safety_241001.csv
zephyr_reproduction_dpo_full,Hydrox Safety,19.35,,hydrox_safety_241001.csv
zephyr_reproduction_dpo_full,Hydrox Security,21.22,,hydrox_safety_241001.csv
zephyr_reproduction_sft_full,Hydrox Integrity,13.61,,hydrox_safety_241001.csv
zephyr_reproduction_sft_full,Hydrox Overall Score,13.1,,hydrox_safety_241001.csv
zephyr_reproduction_sft_full,Hydrox Privacy,14.94,,hydrox_safety_241001.csv
zephyr_reproduction_sft_full,Hydrox Safety,14.92,,hydrox_safety_241001.csv
zephyr_reproduction_sft_full,Hydrox Security,9.5,,hydrox_safety_241001.csv
alpaca_7b,aggregate,0.23484848484848483,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB',
'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate chatglm2_6b,aggregate,0.029137529137529136,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm 
CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate chatgpt_4o_latest,aggregate,0.9754079254079254,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 
'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate claude_2_0,aggregate,0.8333333333333334,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU 
Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate claude_2_1,aggregate,0.6693861693861693,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 
'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate claude_3_5_sonnet_20240620,aggregate,0.9572649572649573,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 
'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate claude_3_haiku_20240307,aggregate,0.44965034965034967,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval 
Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate claude_3_opus_20240229,aggregate,0.8824397824397824,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 
'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate claude_3_sonnet_20240229,aggregate,0.5985236985236985,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval 
TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate claude_instant_1_2,aggregate,0.6486013986013985,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 
'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate command_r,aggregate,0.3296911421911422,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 
'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate command_r_plus,aggregate,0.6183108558108558,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 
'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate dbrx_instruct,aggregate,0.4724025974025974,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval 
HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate dbrx_instructruct,aggregate,0.5379867046533713,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 
'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate deepseek_coder_v2,aggregate,0.713053613053613,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval 
CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate deepseek_llm_67b_chat,aggregate,0.5734841290396846,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval 
CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate dolphin_2_2_1_mistral_7b,aggregate,0.4810606060606061,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval 
CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate falcon_40b,aggregate,0.3502690724912947,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 
'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate falcon_40b_instruct,aggregate,0.13187429854096522,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval 
TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate falcon_7b,aggregate,0.11380183602405824,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 
'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate falcon_7b_instruct,aggregate,0.011363636363636364,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval 
MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gemini_1_5_flash_api_0514,aggregate,0.7263403263403263,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU 
Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gemini_1_5_pro_api_0514,aggregate,0.8294871794871794,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 
'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gemini_1_5_pro_exp_0801,aggregate,0.9545454545454546,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval 
DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gemini_pro,aggregate,0.7298951048951049,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 
'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gemma_1_1_2b_it,aggregate,0.07454890788224121,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 
'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gemma_1_1_7b_it,aggregate,0.263927019482575,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval 
v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gemma_2_27b_it,aggregate,0.776345259678593,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM 
AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gemma_2_2b_it,aggregate,0.28113553113553114,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security 
Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gemma_2_9b_it,aggregate,0.6048877048877048,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM 
AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gemma_2_9b_it_dpo,aggregate,0.8100649350649352,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench 
Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gemma_2_9b_it_simpo,aggregate,0.7328042328042329,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational 
Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gemma_2b_it,aggregate,0.08119658119658119,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM 
AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gemma_7b,aggregate,0.4477682811016144,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & 
Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gemma_7b_it,aggregate,0.18790982679871568,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM 
AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate glm_4_9b_chat,aggregate,0.4769547325102881,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench 
Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gpt_3_5_turbo_0125,aggregate,0.3591242091242091,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench 
Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gpt_3_5_turbo_0613,aggregate,0.6851851851851851,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench 
Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gpt_4_0125_preview,aggregate,0.8492118992118992,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench 
Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gpt_4_0613,aggregate,0.7641802641802643,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM 
AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gpt_4_turbo_2024_04_09,aggregate,0.9055819180819181,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench 
Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gpt_4o_2024_05_13,aggregate,0.9767482517482518,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual 
Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gpt_4o_2024_08_06,aggregate,0.9652680652680652,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM 
AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gpt_4o_mini_2024_07_18,aggregate,0.8348776223776224,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench 
Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gpt_j_6b,aggregate,0.09876543209876543,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM 
AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate gpt_neox_20b,aggregate,0.1419753086419753,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 
'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate guanaco_33b,aggregate,0.38374125874125875,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench 
Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate hermes_3_llama3_1_70b,aggregate,0.8451178451178452,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political 
Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate infinity_instruct_3m_0625_llama3_8b,aggregate,0.6537598204264872,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench 
Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate internlm2_chat_20b,aggregate,0.37196969696969695,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political 
Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate jurassic_2_grande_17b,aggregate,0.4230769230769231,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 
'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate jurassic_2_jumbo_178b,aggregate,0.532051282051282,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM 
AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate llama3_1_405b_instruct,aggregate,0.8598484848484849,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench 
Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate llama3_1_70b_instruct,aggregate,0.9343074620852398,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic 
Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate llama3_1_8b_instruct,aggregate,0.6080822469711359,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM 
AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate llama3_70b,aggregate,0.8129154795821463,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 
'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate llama3_70b_instruct,aggregate,0.8172801478357034,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM 
AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate llama3_8b,aggregate,0.4368471035137702,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 
'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate llama3_8b_instruct,aggregate,0.4449662477440255,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench 
Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate llama3_instruct_8b_simpo,aggregate,0.7992424242424242,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench 
Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate llama_13b,aggregate,0.2222222222222222,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM 
AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate llama_2_13b,aggregate,0.4146881924659702,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental 
Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate llama_2_13b_chat,aggregate,0.38675213675213677,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM 
AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate llama_2_70b,aggregate,0.7293447293447294,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench 
Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate llama_2_70b_chat,aggregate,0.412732329398996,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench 
Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate llama_2_7b,aggregate,0.25466919911364355,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 
'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate llama_2_7b_chat,aggregate,0.1122679789346456,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench 
Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate llama_65b,aggregate,0.5759734093067427,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM 
AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate luminous_base_13b,aggregate,0.08333333333333333,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench 
Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate luminous_extended_30b,aggregate,0.2329059829059829,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal 
Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate luminous_supreme_70b,aggregate,0.32905982905982906,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal 
Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate mistral_7b_instruct_v0_2,aggregate,0.250669392336059,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal 
Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate mistral_7b_instruct_v0_3,aggregate,0.24534231200897869,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal 
Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate mistral_7b_v0_2,aggregate,0.3773849607182941,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 
'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate mistral_7b_v0_3,aggregate,0.4228395061728395,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench 
AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate mistral_large_2402,aggregate,0.5105672105672105,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 
'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate mistral_large_2407,aggregate,0.8375291375291375,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 
'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate mistral_medium,aggregate,0.657051282051282,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass 
Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate mistral_small_2402,aggregate,0.47785547785547783,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 
'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate mistral_v0_1_7b,aggregate,0.6239316239316239,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass 
Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate mixtral_8x22b_instruct_v0_1,aggregate,0.585565052231719,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass 
Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate mixtral_8x22b_v0_1,aggregate,0.7382154882154882,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 
'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate mixtral_8x7b_instruct_v0_1,aggregate,0.284326167659501,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 
'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate mixtral_8x7b_v0_1,aggregate,0.5310044893378227,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass 
Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate nous_hermes_2_mixtral_8x7b_dpo,aggregate,0.7094017094017094,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass 
Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate olmo_7b,aggregate,0.06220322886989553,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass 
Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate olmo_7b_instruct,aggregate,0.15669515669515668,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 
'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate openchat_3_5,aggregate,0.5270655270655271,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass 
Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate openhermes_2_5_mistral_7b,aggregate,0.40103708020374684,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass 
Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate phi_2,aggregate,0.19812080923192033,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass 
Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate phi_3_5_mini_instruct,aggregate,0.6103254769921437,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 
'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate phi_3_5_moe_instruct,aggregate,0.7600448933782267,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass 
Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate phi_3_medium_4k_instruct,aggregate,0.48541540763762986,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 
'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate phi_3_mini_128k_instruct,aggregate,0.3778468445135112,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 
'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate phi_3_mini_4k_instruct,aggregate,0.4048663270885493,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 
240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate phi_3_small_128k_instruct,aggregate,0.6561167227833894,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 
240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate phi_3_small_8k_instruct,aggregate,0.27051282051282055,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 
'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate pythia_12b,aggregate,0.05246913580246913,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench 
Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate pythia_6_9b,aggregate,0.018518518518518517,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 
'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate qwen1_5_0_5b_chat,aggregate,0.012345679012345678,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench 
Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate qwen1_5_110b_chat,aggregate,0.7419770353103686,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 
'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate qwen1_5_14b,aggregate,0.5797720797720798,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench 
Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate qwen1_5_14b_chat,aggregate,0.45340153673487005,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 
'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate qwen1_5_1_8b_chat,aggregate,0.05544332210998878,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench 
Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate qwen1_5_32b,aggregate,0.7678062678062678,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 
'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate qwen1_5_32b_chat,aggregate,0.571383349161127,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench 
Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate qwen1_5_4b_chat,aggregate,0.12542806987251431,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 
'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate qwen1_5_72b_chat,aggregate,0.5463669663669664,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench 
Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate qwen1_5_7b,aggregate,0.35185185185185186,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction 
Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate qwen1_5_7b_chat,aggregate,0.24214088380755047,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 
'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate qwen2_0_5b_instruct,aggregate,0.055218855218855216,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI 
Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate qwen2_1_5b_instruct,aggregate,0.1968574635241302,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 
'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate qwen2_72b_instruct,aggregate,0.7701936951936953,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo 
LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate qwen2_7b_instruct,aggregate,0.4970445192667415,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench 
Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate qwen_14b_chat,aggregate,0.2837995337995338,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information 
Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate smaug_qwen2_72b_instruct,aggregate,0.8331088664421997,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information 
Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate solar_10_7b_instruct_v1_0,aggregate,0.5030864197530864,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information 
Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate starling_lm_7b_alpha,aggregate,0.42734323289878845,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 
'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate starling_lm_7b_beta,aggregate,0.3611888111888112,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench 
Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate tulu_2_dpo_70b,aggregate,0.3585164835164835,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 
'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate vicuna_13b,aggregate,0.14714452214452214,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code 
Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate vicuna_7b,aggregate,0.1885198135198135,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench 
Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate vicuna_7b_v1_5,aggregate,0.15454545454545454,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 
'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate wizardlm_13b,aggregate,0.42773892773892774,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench 
Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate wizardlm_70b,aggregate,0.5620629370629371,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & 
Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate yi_1_5_34b_chat,aggregate,0.6669566544566544,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 
'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate yi_1_5_6b_chat,aggregate,0.33974132863021755,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench 
Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate yi_1_5_9b_chat,aggregate,0.6041446208112875,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 
'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate yi_34b,aggregate,0.7188983855650521,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 
Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate yi_34b_chat,aggregate,0.5558361391694725,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena 
Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate yi_6b,aggregate,0.295346628679962,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 
'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate yi_6b_chat,aggregate,0.19393939393939394,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF 
OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate yi_large,aggregate,0.7889194139194139,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 
'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate yi_large_preview,aggregate,0.8714202464202464,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 
'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate zephyr_7b_alpha,aggregate,0.33875830959164294,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 
HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate zephyr_7b_beta,aggregate,0.28937667271000606,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 
MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate zephyr_orpo_141b_a35b_v0_1,aggregate,0.8414055080721747,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 
'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate