diff --git "a/cache/allbenchs_cache_5e66a88dab42480065db47711c55c458.csv" "b/cache/allbenchs_cache_5e66a88dab42480065db47711c55c458.csv" --- "a/cache/allbenchs_cache_5e66a88dab42480065db47711c55c458.csv" +++ "b/cache/allbenchs_cache_5e66a88dab42480065db47711c55c458.csv" @@ -901,6 +901,135 @@ luminous_extended_30b,Helm Lite WMT2014,0.083,[],helm_lite_240829.csv falcon_7b,Helm Lite WMT2014,0.094,[],helm_lite_240829.csv olmo_7b,Helm Lite WMT2014,0.097,[],helm_lite_240829.csv luminous_base_13b,Helm Lite WMT2014,0.066,[],helm_lite_240829.csv +chatglm_6b,LMSys Arena,855.910565721209,[],chatbot_arena_241104.csv +koala_13b,LMSys Arena,901.4444159097708,[],chatbot_arena_241104.csv +oasst_pythia_12b,LMSys Arena,812.3918514404036,[],chatbot_arena_241104.csv +alpaca_13b,LMSys Arena,851.3113435573603,[],chatbot_arena_241104.csv +vicuna_13b,LMSys Arena,874.2126379649785,[],chatbot_arena_241104.csv +dolly_v2_12b,LMSys Arena,781.4370567093974,[],chatbot_arena_241104.csv +stablelm_tuned_alpha_7b,LMSys Arena,829.7609284591157,[],chatbot_arena_241104.csv +llama_13b,LMSys Arena,800.0,[],chatbot_arena_241104.csv +fastchat_t5_3b,LMSys Arena,794.3748535699036,[],chatbot_arena_241104.csv +gpt_3_5_turbo_0314,LMSys Arena,1051.024508411953,[],chatbot_arena_241104.csv +gpt_4_0314,LMSys Arena,980.6906633214736,[],chatbot_arena_241104.csv +rwkv_4_raven_14b,LMSys Arena,874.536173297737,[],chatbot_arena_241104.csv +claude_1,LMSys Arena,1039.7803750141782,[],chatbot_arena_241104.csv +mpt_7b_chat,LMSys Arena,869.0762171208861,[],chatbot_arena_241104.csv +palm_2,LMSys Arena,922.5218005276812,[],chatbot_arena_241104.csv +claude_instant_1,LMSys Arena,991.8056867962612,[],chatbot_arena_241104.csv +vicuna_7b,LMSys Arena,910.6856107758756,[],chatbot_arena_241104.csv +wizardlm_13b,LMSys Arena,971.8432912657483,[],chatbot_arena_241104.csv +gpt4all_13b_snoozy,LMSys Arena,885.7452637089059,[],chatbot_arena_241104.csv +guanaco_33b,LMSys Arena,974.3076720194276,[],chatbot_arena_241104.csv +vicuna_33b,LMSys Arena,906.4317166108784,[],chatbot_arena_241104.csv +mpt_30b_chat,LMSys Arena,971.1057122702124,[],chatbot_arena_241104.csv +gpt_3_5_turbo_0613,LMSys Arena,999.7201069046866,[],chatbot_arena_241104.csv +gpt_4_0613,LMSys Arena,960.3770824361336,[],chatbot_arena_241104.csv +llama_2_7b_chat,LMSys Arena,895.4706517283653,[],chatbot_arena_241104.csv +claude_2_0,LMSys Arena,1016.5801503367938,[],chatbot_arena_241104.csv +llama_2_13b_chat,LMSys Arena,963.7146661400922,[],chatbot_arena_241104.csv +chatglm2_6b,LMSys Arena,835.3074735731766,[],chatbot_arena_241104.csv +llama_2_70b_chat,LMSys Arena,1007.6844327159828,[],chatbot_arena_241104.csv +codellama34b_instruct,LMSys Arena,934.0457254208728,[],chatbot_arena_241104.csv +wizardlm_70b,LMSys Arena,979.5605650746356,[],chatbot_arena_241104.csv +falcon_180b_chat,LMSys Arena,923.054729229491,[],chatbot_arena_241104.csv +mistral_7b_instruct,LMSys Arena,895.9405753947756,[],chatbot_arena_241104.csv +qwen_14b_chat,LMSys Arena,921.4887868532272,[],chatbot_arena_241104.csv +zephyr_7b_alpha,LMSys Arena,946.9339607858802,[],chatbot_arena_241104.csv +zephyr_7b_beta,LMSys Arena,913.246312461937,[],chatbot_arena_241104.csv +openchat_3_5,LMSys Arena,948.9893819327424,[],chatbot_arena_241104.csv +gpt_4_1106_preview,LMSys Arena,1001.256303019811,[],chatbot_arena_241104.csv +gpt_3_5_turbo_1106,LMSys Arena,937.6322384103784,[],chatbot_arena_241104.csv +chatglm3_6b,LMSys Arena,814.5480014217649,[],chatbot_arena_241104.csv +claude_2_1,LMSys Arena,979.863770513184,[],chatbot_arena_241104.csv 
+tulu_2_dpo_70b,LMSys Arena,961.7298633389956,[],chatbot_arena_241104.csv +yi_34b_chat,LMSys Arena,932.0283635154188,[],chatbot_arena_241104.csv +starling_lm_7b_alpha,LMSys Arena,945.1430459412009,[],chatbot_arena_241104.csv +openhermes_2_5_mistral_7b,LMSys Arena,935.5573447997912,[],chatbot_arena_241104.csv +pplx_70b_online,LMSys Arena,931.0576338876376,[],chatbot_arena_241104.csv +pplx_7b_online,LMSys Arena,948.7421850358356,[],chatbot_arena_241104.csv +dolphin_2_2_1_mistral_7b,LMSys Arena,977.0069489193058,[],chatbot_arena_241104.csv +mixtral_8x7b_instruct_v0_1,LMSys Arena,867.9036424292025,[],chatbot_arena_241104.csv +gemini_pro,LMSys Arena,1006.251403062337,[],chatbot_arena_241104.csv +solar_10_7b_instruct_v1_0,LMSys Arena,958.6549095565916,[],chatbot_arena_241104.csv +mistral_medium,LMSys Arena,965.0537859905728,[],chatbot_arena_241104.csv +llama2_70b_steerlm_chat,LMSys Arena,965.6376159085758,[],chatbot_arena_241104.csv +gemini_pro_dev_api,LMSys Arena,1019.3566145491036,[],chatbot_arena_241104.csv +stripedhyena_nous_7b,LMSys Arena,919.5708420570646,[],chatbot_arena_241104.csv +bard_jan_24_gemini_pro,LMSys Arena,1041.261256012453,[],chatbot_arena_241104.csv +deepseek_llm_67b_chat,LMSys Arena,958.7276958964316,[],chatbot_arena_241104.csv +gpt_4_0125_preview,LMSys Arena,997.1712467949897,[],chatbot_arena_241104.csv +gpt_3_5_turbo_0125,LMSys Arena,898.9675086846296,[],chatbot_arena_241104.csv +nous_hermes_2_mixtral_8x7b_dpo,LMSys Arena,972.2639217501226,[],chatbot_arena_241104.csv +mistral_7b_instruct_v0_2,LMSys Arena,892.8914241485261,[],chatbot_arena_241104.csv +qwen1_5_72b_chat,LMSys Arena,947.9919390672214,[],chatbot_arena_241104.csv +openchat_3_5_0106,LMSys Arena,956.5639851579056,[],chatbot_arena_241104.csv +qwen1_5_4b_chat,LMSys Arena,857.8615305194531,[],chatbot_arena_241104.csv +qwen1_5_7b_chat,LMSys Arena,937.5784150291832,[],chatbot_arena_241104.csv +codellama_70b_instruct,LMSys Arena,873.7635218944325,[],chatbot_arena_241104.csv +mistral_next,LMSys Arena,969.0249137331156,[],chatbot_arena_241104.csv +gemma_2b_it,LMSys Arena,865.630898513726,[],chatbot_arena_241104.csv +gemma_7b_it,LMSys Arena,913.3020846629596,[],chatbot_arena_241104.csv +mistral_large_2402,LMSys Arena,939.5529442890696,[],chatbot_arena_241104.csv +olmo_7b_instruct,LMSys Arena,875.880001693062,[],chatbot_arena_241104.csv +claude_3_sonnet_20240229,LMSys Arena,970.6832692453124,[],chatbot_arena_241104.csv +claude_3_opus_20240229,LMSys Arena,1021.9572137608476,[],chatbot_arena_241104.csv +claude_3_haiku_20240307,LMSys Arena,946.756591266114,[],chatbot_arena_241104.csv +starling_lm_7b_beta,LMSys Arena,967.1740802373936,[],chatbot_arena_241104.csv +command_r,LMSys Arena,915.3923710382184,[],chatbot_arena_241104.csv +dbrx_instructruct_preview,LMSys Arena,930.1149113654316,[],chatbot_arena_241104.csv +qwen1_5_14b_chat,LMSys Arena,932.8461519507624,[],chatbot_arena_241104.csv +qwen1_5_32b_chat,LMSys Arena,917.6067239158654,[],chatbot_arena_241104.csv +command_r_plus,LMSys Arena,981.9316261444284,[],chatbot_arena_241104.csv +gemma_1_1_7b_it,LMSys Arena,888.863535227059,[],chatbot_arena_241104.csv +gpt_4_turbo_2024_04_09,LMSys Arena,1001.95083675947,[],chatbot_arena_241104.csv +zephyr_orpo_141b_a35b_v0_1,LMSys Arena,992.2709969445071,[],chatbot_arena_241104.csv +gemma_1_1_2b_it,LMSys Arena,839.3449619004468,[],chatbot_arena_241104.csv +gemini_1_5_pro_api_0409_preview,LMSys Arena,1106.8697777575628,[],chatbot_arena_241104.csv +reka_flash_21b_20240226_online,LMSys Arena,967.873277488609,[],chatbot_arena_241104.csv 
+reka_flash_21b_20240226,LMSys Arena,939.8601363871352,[],chatbot_arena_241104.csv +mixtral_8x22b_instruct_v0_1,LMSys Arena,911.463562145636,[],chatbot_arena_241104.csv +llama3_8b_instruct,LMSys Arena,925.300077951389,[],chatbot_arena_241104.csv +llama3_70b_instruct,LMSys Arena,987.92132812523,[],chatbot_arena_241104.csv +phi_3_mini_128k_instruct,LMSys Arena,875.3830177408651,[],chatbot_arena_241104.csv +snowflake_arctic_instruct,LMSys Arena,908.9578096804898,[],chatbot_arena_241104.csv +reka_core_20240501,LMSys Arena,960.871641047353,[],chatbot_arena_241104.csv +qwen1_5_110b_chat,LMSys Arena,970.825546150876,[],chatbot_arena_241104.csv +qwen_max_0428,LMSys Arena,991.8829133949346,[],chatbot_arena_241104.csv +gpt_4o_2024_05_13,LMSys Arena,1033.7736651812086,[],chatbot_arena_241104.csv +yi_large_preview,LMSys Arena,1007.9055342457846,[],chatbot_arena_241104.csv +glm_4_0116,LMSys Arena,996.2388680185244,[],chatbot_arena_241104.csv +phi_3_mini_4k_instruct,LMSys Arena,875.486575120554,[],chatbot_arena_241104.csv +gemini_advanced_0514,LMSys Arena,1034.5901919978594,[],chatbot_arena_241104.csv +gemini_1_5_pro_api_0514,LMSys Arena,1006.938590226684,[],chatbot_arena_241104.csv +gemini_1_5_flash_api_0514,LMSys Arena,988.426072144592,[],chatbot_arena_241104.csv +yi_1_5_34b_chat,LMSys Arena,935.8573439301474,[],chatbot_arena_241104.csv +phi_3_small_8k_instruct,LMSys Arena,877.7438151636035,[],chatbot_arena_241104.csv +phi_3_medium_4k_instruct,LMSys Arena,866.7539620360035,[],chatbot_arena_241104.csv +qwen2_72b_instruct,LMSys Arena,930.7722721046767,[],chatbot_arena_241104.csv +yi_large,LMSys Arena,991.78684277118,[],chatbot_arena_241104.csv +nemotron_4_340b_instruct,LMSys Arena,1011.0291063554424,[],chatbot_arena_241104.csv +reka_flash_preview_20240611,LMSys Arena,937.4782906143832,[],chatbot_arena_241104.csv +glm_4_0520,LMSys Arena,1012.3461462160476,[],chatbot_arena_241104.csv +deepseek_coder_v2,LMSys Arena,968.7272337322494,[],chatbot_arena_241104.csv +claude_3_5_sonnet_20240620,LMSys Arena,1026.059060767346,[],chatbot_arena_241104.csv +gemma_2_9b_it,LMSys Arena,950.0755523266928,[],chatbot_arena_241104.csv +gemma_2_27b_it,LMSys Arena,977.8470656596852,[],chatbot_arena_241104.csv +phi_3_mini_4k_instruct_june_2024,LMSys Arena,860.4379813139254,[],chatbot_arena_241104.csv +deepseek_v2_api_0628,LMSys Arena,989.5345921181048,[],chatbot_arena_241104.csv +athene_70b_0725,LMSys Arena,1020.8101504540734,[],chatbot_arena_241104.csv +gemini_1_5_pro_exp_0801,LMSys Arena,1074.9371768117894,[],chatbot_arena_241104.csv +gpt_4o_mini_2024_07_18,LMSys Arena,1026.236414405759,[],chatbot_arena_241104.csv +deepseek_coder_v2_0724,LMSys Arena,990.94288841608,[],chatbot_arena_241104.csv +gemma_2_2b_it,LMSys Arena,906.320768087545,[],chatbot_arena_241104.csv +llama3_1_8b_instruct,LMSys Arena,949.3125757952852,[],chatbot_arena_241104.csv +llama3_1_405b_instruct,LMSys Arena,1005.4497444176718,[],chatbot_arena_241104.csv +llama3_1_70b_instruct,LMSys Arena,1034.402372751568,[],chatbot_arena_241104.csv +mistral_large_2407,LMSys Arena,1005.1771608005986,[],chatbot_arena_241104.csv +reka_core_20240722,LMSys Arena,1006.982150804202,[],chatbot_arena_241104.csv +reka_flash_20240722,LMSys Arena,950.554264764622,[],chatbot_arena_241104.csv +chatgpt_4o_latest,LMSys Arena,1073.7429047571106,[],chatbot_arena_241104.csv +gpt_4o_2024_08_06,LMSys Arena,1032.650635133711,[],chatbot_arena_241104.csv alphamonarch_7b,HF OpenLLM v2,17.59,,hf_open_llm_v2_240829.csv alphamonarch_7b,HFv2 BBH,23.95,,hf_open_llm_v2_240829.csv alphamonarch_7b,HFv2 
GPQA,2.68,,hf_open_llm_v2_240829.csv @@ -3645,67 +3774,6 @@ llama_2_13b,Helm MMLU,0.554,[],helm_mmlu_240829.csv olmo_1_7_7b,Helm MMLU,0.538,[],helm_mmlu_240829.csv llama_2_7b,Helm MMLU,0.458,[],helm_mmlu_240829.csv olmo_7b,Helm MMLU,0.295,[],helm_mmlu_240829.csv -claude_3_5_sonnet_20240620,LMSys Arena,79.3,[],chatbot_arena_240829.csv -gpt_4o_2024_05_13,LMSys Arena,79.2,[],chatbot_arena_240829.csv -gpt_4_0125_preview,LMSys Arena,78.0,[],chatbot_arena_240829.csv -gpt_4o_2024_08_06,LMSys Arena,77.9,[],chatbot_arena_240829.csv -athene_70b,LMSys Arena,77.6,[],chatbot_arena_240829.csv -gpt_4o_mini,LMSys Arena,74.9,[],chatbot_arena_240829.csv -gemini_1_5_pro_api_preview,LMSys Arena,72.0,[],chatbot_arena_240829.csv -mistral_large_2407,LMSys Arena,70.4,[],chatbot_arena_240829.csv -llama3_1_405b_instruct,LMSys Arena,64.1,[],chatbot_arena_240829.csv -glm_4_0520,LMSys Arena,63.8,[],chatbot_arena_240829.csv -yi_large,LMSys Arena,63.7,[],chatbot_arena_240829.csv -deepseek_coder_v2,LMSys Arena,62.3,[],chatbot_arena_240829.csv -claude_3_opus_20240229,LMSys Arena,60.4,[],chatbot_arena_240829.csv -gemma_2_27b_it,LMSys Arena,57.5,[],chatbot_arena_240829.csv -llama3_1_70b_instruct,LMSys Arena,55.7,[],chatbot_arena_240829.csv -glm_4_0116,LMSys Arena,55.7,[],chatbot_arena_240829.csv -glm_4_air,LMSys Arena,50.9,[],chatbot_arena_240829.csv -gpt_4_0314,LMSys Arena,50.0,[],chatbot_arena_240829.csv -gemini_1_5_flash_api_preview,LMSys Arena,49.6,[],chatbot_arena_240829.csv -qwen2_72b_instruct,LMSys Arena,46.9,[],chatbot_arena_240829.csv -claude_3_sonnet_20240229,LMSys Arena,46.8,[],chatbot_arena_240829.csv -llama3_70b_instruct,LMSys Arena,46.6,[],chatbot_arena_240829.csv -claude_3_haiku_20240307,LMSys Arena,41.5,[],chatbot_arena_240829.csv -gpt_4_0613,LMSys Arena,37.9,[],chatbot_arena_240829.csv -mistral_large_2402,LMSys Arena,37.7,[],chatbot_arena_240829.csv -mixtral_8x22b_instruct_v0_1,LMSys Arena,36.4,[],chatbot_arena_240829.csv -qwen1_5_72b_chat,LMSys Arena,36.1,[],chatbot_arena_240829.csv -phi_3_medium_4k_instruct,LMSys Arena,33.4,[],chatbot_arena_240829.csv -command_r_plus,LMSys Arena,33.1,[],chatbot_arena_240829.csv -mistral_medium,LMSys Arena,31.9,[],chatbot_arena_240829.csv -internlm2_5_20b_chat,LMSys Arena,31.2,[],chatbot_arena_240829.csv -phi_3_small_8k_instruct,LMSys Arena,29.8,[],chatbot_arena_240829.csv -mistral_next,LMSys Arena,27.4,[],chatbot_arena_240829.csv -gpt_3_5_turbo_0613,LMSys Arena,24.8,[],chatbot_arena_240829.csv -dbrx_instructruct_preview,LMSys Arena,24.6,[],chatbot_arena_240829.csv -internlm2_20b_chat,LMSys Arena,24.4,[],chatbot_arena_240829.csv -claude_2_0,LMSys Arena,24.0,[],chatbot_arena_240829.csv -mixtral_8x7b_instruct_v0_1,LMSys Arena,23.4,[],chatbot_arena_240829.csv -gpt_3_5_turbo_0125,LMSys Arena,23.3,[],chatbot_arena_240829.csv -yi_34b_chat,LMSys Arena,23.1,[],chatbot_arena_240829.csv -starling_lm_7b_beta,LMSys Arena,23.0,[],chatbot_arena_240829.csv -claude_2_1,LMSys Arena,22.8,[],chatbot_arena_240829.csv -llama3_1_8b_instruct,LMSys Arena,21.3,[],chatbot_arena_240829.csv -snorkel_mistral_pairrm_dpo,LMSys Arena,20.7,[],chatbot_arena_240829.csv -llama3_8b_instruct,LMSys Arena,20.6,[],chatbot_arena_240829.csv -gpt_3_5_turbo_1106,LMSys Arena,18.9,[],chatbot_arena_240829.csv -gpt_3_5_turbo_0301,LMSys Arena,18.1,[],chatbot_arena_240829.csv -gemini_1_0_pro,LMSys Arena,17.8,[],chatbot_arena_240829.csv -snowflake_arctic_instruct,LMSys Arena,17.6,[],chatbot_arena_240829.csv -command_r,LMSys Arena,17.0,[],chatbot_arena_240829.csv -phi_3_mini_128k_instruct,LMSys 
Arena,15.4,[],chatbot_arena_240829.csv -tulu_2_dpo_70b,LMSys Arena,15.0,[],chatbot_arena_240829.csv -starling_lm_7b_alpha,LMSys Arena,12.8,[],chatbot_arena_240829.csv -mistral_7b_instruct,LMSys Arena,12.6,[],chatbot_arena_240829.csv -gemma_1_1_7b_it,LMSys Arena,12.1,[],chatbot_arena_240829.csv -llama_2_70b_chat,LMSys Arena,11.6,[],chatbot_arena_240829.csv -vicuna_33b_v1_3,LMSys Arena,8.6,[],chatbot_arena_240829.csv -gemma_7b_it,LMSys Arena,7.5,[],chatbot_arena_240829.csv -llama_2_7b_chat,LMSys Arena,4.6,[],chatbot_arena_240829.csv -gemma_1_1_2b_it,LMSys Arena,3.4,[],chatbot_arena_240829.csv -gemma_2b_it,LMSys Arena,3.0,[],chatbot_arena_240829.csv llama_2_70b,Helm Classic,0.944,[],helm_classic_240829.csv llama_65b,Helm Classic,0.908,[],helm_classic_240829.csv text_davinci_002,Helm Classic,0.905,[],helm_classic_240829.csv @@ -21815,124 +21883,140 @@ zephyr_reproduction_sft_full,hydrox_overall_score,13.1,,hydrox_safety_241001.csv zephyr_reproduction_sft_full,hydrox_privacy,14.94,,hydrox_safety_241001.csv zephyr_reproduction_sft_full,hydrox_safety,14.92,,hydrox_safety_241001.csv zephyr_reproduction_sft_full,hydrox_security,9.5,,hydrox_safety_241001.csv -alpaca_7b,aggregate,0.22072072072072071,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 
'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -athene_70b,aggregate,0.8493788819875776,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 
'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -claude_2_0,aggregate,0.6020066889632107,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench 
Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -claude_2_1,aggregate,0.5110980545763154,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 
'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -claude_3_5_sonnet_20240620,aggregate,0.982905982905983,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 
'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -claude_3_haiku_20240307,aggregate,0.549424005945745,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 
'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -claude_3_opus_20240229,aggregate,0.8573567665639277,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 
'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -claude_3_sonnet_20240229,aggregate,0.653911731916847,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 
'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -claude_instant_1_2,aggregate,0.6049896049896051,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench 
Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -command_r,aggregate,0.32386140074759,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench 
Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -command_r_plus,aggregate,0.5761033510394125,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 
'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -dbrx_instruct,aggregate,0.4266409266409266,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data 
Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -dbrx_instructruct,aggregate,0.5344129554655871,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 
'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -deepseek_coder_v2,aggregate,0.8444160272804775,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench 
Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -deepseek_llm_67b_chat,aggregate,0.5506756756756757,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 
'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -falcon_40b,aggregate,0.32812265707002547,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench 
Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -falcon_40b_instruct,aggregate,0.13264580369843526,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction 
Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -falcon_7b,aggregate,0.11407257459889038,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt 
AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -falcon_7b_instruct,aggregate,0.013513513513513514,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 
'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -gemma_1_1_2b_it,aggregate,0.07665903890160183,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo 
LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -gemma_1_1_7b_it,aggregate,0.26226051061156724,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench 
Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -gemma_2_27b_it,aggregate,0.8045273029120115,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information 
Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -gemma_2_9b_it,aggregate,0.6422797189051059,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 
'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -gemma_2_9b_it_dpo,aggregate,0.790057915057915,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench 
Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -gemma_2_9b_it_simpo,aggregate,0.7199248120300753,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 
'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -gemma_2b_it,aggregate,0.05921052631578947,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code 
Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -gemma_7b,aggregate,0.4471997300944669,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench 
Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -gemma_7b_it,aggregate,0.12136319058515854,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 
'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -glm_4_9b_chat,aggregate,0.46499582289055974,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench 
Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -gpt_3_5_turbo_0125,aggregate,0.4401920188365201,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & 
Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -gpt_3_5_turbo_0301,aggregate,0.4528985507246377,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 
'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -gpt_3_5_turbo_0613,aggregate,0.5724018332713985,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench 
Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -gpt_4_0125_preview,aggregate,0.9171132221004344,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 
'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -gpt_4_0613,aggregate,0.8146763722211293,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena 
(0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -gpt_4_turbo_2024_04_09,aggregate,0.9428463693169576,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 
Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -gpt_4o_2024_05_13,aggregate,0.9847612958226769,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 
'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -gpt_4o_2024_08_06,aggregate,0.9575873827791986,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 
'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -gpt_4o_mini_2024_07_18,aggregate,0.8032033326150972,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 
'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -gpt_j_6b,aggregate,0.10160818713450293,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 
'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -gpt_neox_20b,aggregate,0.14400584795321636,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF 
OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -hermes_3_llama3_1_70b,aggregate,0.8626160990712074,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM 
v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -infinity_instruct_3m_0625_llama3_8b,aggregate,0.6273115220483642,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF 
OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -internlm2_5_20b_chat,aggregate,0.6842105263157895,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM 
v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -internlm2_chat_20b,aggregate,0.32252252252252256,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 
ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -jurassic_2_grande_17b,aggregate,0.39529914529914534,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 
'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -jurassic_2_jumbo_178b,aggregate,0.532051282051282,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 
GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -llama3_1_405b_instruct,aggregate,0.8672150411280846,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 
'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -llama3_1_70b_instruct,aggregate,0.8528408270971201,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 
HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -llama3_1_8b_instruct,aggregate,0.5175232440678665,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 
'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -llama3_70b,aggregate,0.8105600539811066,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 
TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -llama3_70b_instruct,aggregate,0.8127546753337573,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 
'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -llama3_8b,aggregate,0.43302968960863697,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 
'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -llama3_8b_instruct,aggregate,0.420135922511747,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 
'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -llama3_instruct_8b_simpo,aggregate,0.7884068278805121,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 
'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -llama_2_13b,aggregate,0.41490478332583597,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 
'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -llama_2_70b,aggregate,0.7303193882141251,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 
'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -llama_2_70b_chat,aggregate,0.15527950310559005,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 
'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -llama_2_7b,aggregate,0.2391288049182786,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 
'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -llama_2_7b_chat,aggregate,0.08304448781801049,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN 
Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -llama_65b,aggregate,0.5736992052781527,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction 
Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -luminous_base_13b,aggregate,0.08333333333333333,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 
'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -luminous_extended_30b,aggregate,0.2329059829059829,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN 
Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -luminous_supreme_70b,aggregate,0.30128205128205127,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 
'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -mistral_7b_instruct_v0_2,aggregate,0.28609513981031004,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 
'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -mistral_7b_instruct_v0_3,aggregate,0.2537839697282422,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN 
Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -mistral_7b_v0_2,aggregate,0.31970128022759603,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 
'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -mistral_7b_v0_3,aggregate,0.3737553342816501,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN 
Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -mistral_large_2402,aggregate,0.6058211467418628,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 
'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -mistral_large_2407,aggregate,0.8868286445012787,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN 
Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -mistral_medium,aggregate,0.6122209165687427,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN 
Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -mistral_small_2402,aggregate,0.49924585218702866,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of 
Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -mistral_v0_1_7b,aggregate,0.6239316239316239,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN 
Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -mixtral_8x22b_instruct_v0_1,aggregate,0.7256023690940907,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool 
Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -mixtral_8x22b_v0_1,aggregate,0.7135490753911806,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 
'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -mixtral_8x7b_instruct_v0_1,aggregate,0.3713078251895724,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 
'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -mixtral_8x7b_v0_1,aggregate,0.49324324324324326,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN 
Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -olmo_7b,aggregate,0.06545209176788123,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 
'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -openhermes_2_5_mistral_7b,aggregate,0.3832617447168531,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 
'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -phi_2,aggregate,0.20087901666849037,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 
'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -phi_3_5_mini_instruct,aggregate,0.6202270381836945,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench 
Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -phi_3_5_moe_instruct,aggregate,0.7808307533539731,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning 
Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -phi_3_medium_4k_instruct,aggregate,0.6675079642841117,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning 
Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -phi_3_mini_128k_instruct,aggregate,0.4153205904787544,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning 
Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -phi_3_mini_4k_instruct,aggregate,0.5548245614035088,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 
'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -phi_3_small_128k_instruct,aggregate,0.66937564499484,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 
'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -phi_3_small_8k_instruct,aggregate,0.45481670929241264,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 
'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -pythia_12b,aggregate,0.054093567251461985,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding 
Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -pythia_6_9b,aggregate,0.019736842105263157,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 
'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -qwen1_5_0_5b_chat,aggregate,0.013157894736842105,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench 
Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -qwen1_5_110b_chat,aggregate,0.776004448721167,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics 
Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -qwen1_5_14b,aggregate,0.5770917678812416,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench 
Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -qwen1_5_14b_chat,aggregate,0.4621068436857911,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis 
Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -qwen1_5_1_8b_chat,aggregate,0.059167526659786716,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 
'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -qwen1_5_32b,aggregate,0.7658569500674763,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language 
Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -qwen1_5_32b_chat,aggregate,0.7149122807017544,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 
'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -qwen1_5_4b_chat,aggregate,0.1674406604747162,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench 
Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -qwen1_5_72b_chat,aggregate,0.5668371367348349,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction 
Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -qwen1_5_7b,aggregate,0.3508771929824561,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 
'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -qwen1_5_7b_chat,aggregate,0.1916569245052217,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 
'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -qwen2_0_5b_instruct,aggregate,0.059081527347781215,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 
'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -qwen2_1_5b_instruct,aggregate,0.19711042311661506,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 
'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -qwen2_72b_instruct,aggregate,0.8354710666091739,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 
'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -qwen2_7b_instruct,aggregate,0.5034227726178191,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 
'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -smaug_qwen2_72b_instruct,aggregate,0.8593911248710011,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 
'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -snorkel_mistral_pairrm_dpo,aggregate,0.4521151586368978,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 
'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -starling_lm_7b_alpha,aggregate,0.29823530624445954,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 
'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -starling_lm_7b_beta,aggregate,0.25234441602728047,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 
'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -tulu_2_dpo_70b,aggregate,0.17624223602484473,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 
'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -vicuna_33b_v1_3,aggregate,0.2056404230317274,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 
'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -vicuna_7b_v1_5,aggregate,0.13619501854795973,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 
'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -yi_1_5_34b_chat,aggregate,0.7553884711779449,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 
'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -yi_1_5_6b_chat,aggregate,0.3354636591478697,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 
'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -yi_1_5_9b_chat,aggregate,0.5881787802840435,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 
'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -yi_34b,aggregate,0.7128879892037787,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 
'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -yi_34b_chat,aggregate,0.5455449728905107,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 
'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -yi_6b,aggregate,0.29234143049932526,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 
'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate -yi_6b_chat,aggregate,0.1938854489164087,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 
'hydrox_safety', 'hydrox_security']",aggregate -yi_large,aggregate,0.8346273291925466,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -yi_large_preview,aggregate,0.8641553641553642,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -zephyr_7b_alpha,aggregate,0.2838442157327606,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -zephyr_7b_beta,aggregate,0.2666234345800909,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +alpaca_7b,aggregate,0.23484848484848483,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +chatglm2_6b,aggregate,0.029137529137529136,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +chatgpt_4o_latest,aggregate,0.9754079254079254,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +claude_2_0,aggregate,0.8333333333333334,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +claude_2_1,aggregate,0.6693861693861693,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +claude_3_5_sonnet_20240620,aggregate,0.9572649572649573,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +claude_3_haiku_20240307,aggregate,0.44965034965034967,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +claude_3_opus_20240229,aggregate,0.8824397824397824,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +claude_3_sonnet_20240229,aggregate,0.5985236985236985,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +claude_instant_1_2,aggregate,0.6486013986013985,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +command_r,aggregate,0.3296911421911422,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +command_r_plus,aggregate,0.6183108558108558,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +dbrx_instruct,aggregate,0.4724025974025974,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +dbrx_instructruct,aggregate,0.5379867046533713,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +deepseek_coder_v2,aggregate,0.713053613053613,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +deepseek_llm_67b_chat,aggregate,0.5734841290396846,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +dolphin_2_2_1_mistral_7b,aggregate,0.4810606060606061,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +falcon_40b,aggregate,0.3502690724912947,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +falcon_40b_instruct,aggregate,0.13187429854096522,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +falcon_7b,aggregate,0.11380183602405824,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +falcon_7b_instruct,aggregate,0.011363636363636364,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gemini_1_5_flash_api_0514,aggregate,0.7263403263403263,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gemini_1_5_pro_api_0514,aggregate,0.8294871794871794,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gemini_1_5_pro_exp_0801,aggregate,0.9545454545454546,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gemini_pro,aggregate,0.7298951048951049,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gemma_1_1_2b_it,aggregate,0.07454890788224121,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gemma_1_1_7b_it,aggregate,0.263927019482575,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gemma_2_27b_it,aggregate,0.776345259678593,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gemma_2_2b_it,aggregate,0.28113553113553114,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gemma_2_9b_it,aggregate,0.6048877048877048,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gemma_2_9b_it_dpo,aggregate,0.8100649350649352,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gemma_2_9b_it_simpo,aggregate,0.7328042328042329,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gemma_2b_it,aggregate,0.08119658119658119,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gemma_7b,aggregate,0.4477682811016144,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gemma_7b_it,aggregate,0.18790982679871568,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +glm_4_9b_chat,aggregate,0.4769547325102881,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gpt_3_5_turbo_0125,aggregate,0.3591242091242091,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gpt_3_5_turbo_0613,aggregate,0.6851851851851851,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gpt_4_0125_preview,aggregate,0.8492118992118992,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gpt_4_0613,aggregate,0.7641802641802643,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gpt_4_turbo_2024_04_09,aggregate,0.9055819180819181,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gpt_4o_2024_05_13,aggregate,0.9767482517482518,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gpt_4o_2024_08_06,aggregate,0.9652680652680652,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gpt_4o_mini_2024_07_18,aggregate,0.8348776223776224,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gpt_j_6b,aggregate,0.09876543209876543,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +gpt_neox_20b,aggregate,0.1419753086419753,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +guanaco_33b,aggregate,0.38374125874125875,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +hermes_3_llama3_1_70b,aggregate,0.8451178451178452,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +infinity_instruct_3m_0625_llama3_8b,aggregate,0.6537598204264872,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 
'hydrox_safety', 'hydrox_security']",aggregate +internlm2_chat_20b,aggregate,0.37196969696969695,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 
'hydrox_safety', 'hydrox_security']",aggregate +jurassic_2_grande_17b,aggregate,0.4230769230769231,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 
'hydrox_safety', 'hydrox_security']",aggregate +jurassic_2_jumbo_178b,aggregate,0.532051282051282,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 
'hydrox_safety', 'hydrox_security']",aggregate +llama3_1_405b_instruct,aggregate,0.8598484848484849,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 
'hydrox_safety', 'hydrox_security']",aggregate +llama3_1_70b_instruct,aggregate,0.9343074620852398,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 
'hydrox_safety', 'hydrox_security']",aggregate +llama3_1_8b_instruct,aggregate,0.6080822469711359,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 
'hydrox_safety', 'hydrox_security']",aggregate +llama3_70b,aggregate,0.8129154795821463,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +llama3_70b_instruct,aggregate,0.8172801478357034,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +llama3_8b,aggregate,0.4368471035137702,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +llama3_8b_instruct,aggregate,0.4449662477440255,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +llama3_instruct_8b_simpo,aggregate,0.7992424242424242,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +llama_13b,aggregate,0.2222222222222222,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +llama_2_13b,aggregate,0.4146881924659702,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +llama_2_13b_chat,aggregate,0.38675213675213677,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +llama_2_70b,aggregate,0.7293447293447294,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +llama_2_70b_chat,aggregate,0.412732329398996,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +llama_2_7b,aggregate,0.25466919911364355,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +llama_2_7b_chat,aggregate,0.1122679789346456,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +llama_65b,aggregate,0.5759734093067427,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +luminous_base_13b,aggregate,0.08333333333333333,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +luminous_extended_30b,aggregate,0.2329059829059829,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +luminous_supreme_70b,aggregate,0.32905982905982906,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +mistral_7b_instruct_v0_2,aggregate,0.250669392336059,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +mistral_7b_instruct_v0_3,aggregate,0.24534231200897869,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +mistral_7b_v0_2,aggregate,0.3773849607182941,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +mistral_7b_v0_3,aggregate,0.4228395061728395,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +mistral_large_2402,aggregate,0.5105672105672105,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +mistral_large_2407,aggregate,0.8375291375291375,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +mistral_medium,aggregate,0.657051282051282,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +mistral_small_2402,aggregate,0.47785547785547783,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +mistral_v0_1_7b,aggregate,0.6239316239316239,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +mixtral_8x22b_instruct_v0_1,aggregate,0.585565052231719,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +mixtral_8x22b_v0_1,aggregate,0.7382154882154882,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +mixtral_8x7b_instruct_v0_1,aggregate,0.284326167659501,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +mixtral_8x7b_v0_1,aggregate,0.5310044893378227,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +nous_hermes_2_mixtral_8x7b_dpo,aggregate,0.7094017094017094,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +olmo_7b,aggregate,0.06220322886989553,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +olmo_7b_instruct,aggregate,0.15669515669515668,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +openchat_3_5,aggregate,0.5270655270655271,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +openhermes_2_5_mistral_7b,aggregate,0.40103708020374684,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +phi_2,aggregate,0.19812080923192033,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +phi_3_5_mini_instruct,aggregate,0.6103254769921437,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +phi_3_5_moe_instruct,aggregate,0.7600448933782267,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +phi_3_medium_4k_instruct,aggregate,0.48541540763762986,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +phi_3_mini_128k_instruct,aggregate,0.3778468445135112,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +phi_3_mini_4k_instruct,aggregate,0.4048663270885493,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +phi_3_small_128k_instruct,aggregate,0.6561167227833894,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +phi_3_small_8k_instruct,aggregate,0.27051282051282055,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +pythia_12b,aggregate,0.05246913580246913,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +pythia_6_9b,aggregate,0.018518518518518517,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +qwen1_5_0_5b_chat,aggregate,0.012345679012345678,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +qwen1_5_110b_chat,aggregate,0.7419770353103686,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +qwen1_5_14b,aggregate,0.5797720797720798,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +qwen1_5_14b_chat,aggregate,0.45340153673487005,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +qwen1_5_1_8b_chat,aggregate,0.05544332210998878,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +qwen1_5_32b,aggregate,0.7678062678062678,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +qwen1_5_32b_chat,aggregate,0.571383349161127,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +qwen1_5_4b_chat,aggregate,0.12542806987251431,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +qwen1_5_72b_chat,aggregate,0.5463669663669664,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +qwen1_5_7b,aggregate,0.35185185185185186,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +qwen1_5_7b_chat,aggregate,0.24214088380755047,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +qwen2_0_5b_instruct,aggregate,0.055218855218855216,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +qwen2_1_5b_instruct,aggregate,0.1968574635241302,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +qwen2_72b_instruct,aggregate,0.7701936951936953,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +qwen2_7b_instruct,aggregate,0.4970445192667415,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +qwen_14b_chat,aggregate,0.2837995337995338,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +smaug_qwen2_72b_instruct,aggregate,0.8331088664421997,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +solar_10_7b_instruct_v1_0,aggregate,0.5030864197530864,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +starling_lm_7b_alpha,aggregate,0.42734323289878845,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +starling_lm_7b_beta,aggregate,0.3611888111888112,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +tulu_2_dpo_70b,aggregate,0.3585164835164835,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +vicuna_13b,aggregate,0.14714452214452214,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +vicuna_7b,aggregate,0.1885198135198135,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +vicuna_7b_v1_5,aggregate,0.15454545454545454,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +wizardlm_13b,aggregate,0.42773892773892774,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +wizardlm_70b,aggregate,0.5620629370629371,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +yi_1_5_34b_chat,aggregate,0.6669566544566544,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +yi_1_5_6b_chat,aggregate,0.33974132863021755,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +yi_1_5_9b_chat,aggregate,0.6041446208112875,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +yi_34b,aggregate,0.7188983855650521,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +yi_34b_chat,aggregate,0.5558361391694725,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +yi_6b,aggregate,0.295346628679962,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +yi_6b_chat,aggregate,0.19393939393939394,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +yi_large,aggregate,0.7889194139194139,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +yi_large_preview,aggregate,0.8714202464202464,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +zephyr_7b_alpha,aggregate,0.33875830959164294,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +zephyr_7b_beta,aggregate,0.28937667271000606,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +zephyr_orpo_141b_a35b_v0_1,aggregate,0.8414055080721747,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate