Spaces:
Running
Running
scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value | |
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.2778254199662385,0.2400384567875128 | |
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.40368671387966554,0.08581278065055217 | |
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.42599897728156577,0.07162425926742408 | |
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.2778254199662385,0.2400384567875128 | |
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.36698792170878686,0.11834981273562825 | |
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,-0.018181818181818184,1.0 | |
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,-0.018181818181818184,1.0 | |
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,-0.05454545454545454,0.8792698312489979 | |
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,-0.018181818181818184,1.0 | |
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,-0.1272727272727273,0.6480954385121052 | |
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.05454545454545454,0.8792698312489979 | |
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,-0.018181818181818184,1.0 | |
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,-0.018181818181818184,1.0 | |
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.05454545454545454,0.8792698312489979 | |
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,-0.05454545454545454,0.8792698312489979 | |
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.05454545454545454,0.8792698312489979 | |
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.23636363636363636,0.3587114698573032 | |
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.2,0.4453821448613115 | |
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.05454545454545454,0.8792698312489979 | |
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.1272727272727273,0.6480954385121052 | |
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.587180674734059,0.01246215829454031 | |
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604 | |
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.697277051246695,0.003004262239398284 | |
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.587180674734059,0.01246215829454031 | |
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6605782590758164,0.004936818556325077 | |
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005 | |
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604 | |
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162 | |
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005 | |
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6363636363636364,0.005707170915504249 | |
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.2727272727272727,0.2829668209876543 | |
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.34545454545454546,0.16457331248997917 | |
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.38181818181818183,0.12097096961680295 | |
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.2727272727272727,0.2829668209876543 | |
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.34545454545454546,0.16457331248997917 | |
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6363636363636364,0.005707170915504249 | |
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604 | |
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6363636363636364,0.005707170915504249 | |
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6363636363636364,0.005707170915504249 | |
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 | |
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.18349396085439343,0.43487965849578336 | |
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.2935903373670295,0.21152242941072896 | |
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.2727272727272727,0.2829668209876543 | |
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.18349396085439343,0.43487965849578336 | |
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.2568915451961508,0.27429882739587574 | |
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.4909090909090909,0.04053235730319064 | |
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6000000000000001,0.00994553671637005 | |
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.5636363636363636,0.016540504248837583 | |
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.4909090909090909,0.04053235730319064 | |
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5636363636363636,0.016540504248837583 | |
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.34545454545454546,0.16457331248997917 | |
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.41818181818181815,0.08656124739458072 | |
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.4909090909090909,0.04053235730319064 | |
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.34545454545454546,0.16457331248997917 | |
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.34545454545454546,0.16457331248997917 | |
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.9272727272727274,3.2567740901074234e-06 | |
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8545454545454545,4.624619207952541e-05 | |
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.9272727272727274,3.2567740901074234e-06 | |
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 | |
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293 | |
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228 | |
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 | |
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08 | |
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6727272727272727,0.0031063111271444604 | |
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822 | |
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.45454545454545453,0.06017015392015392 | |
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.41818181818181815,0.08656124739458072 | |
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.45454545454545453,0.06017015392015392 | |
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822 | |
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5272727272727272,0.02638447971781305 | |
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228 | |
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.5636363636363636,0.016540504248837583 | |
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7454545454545454,0.000759529822029822 | |
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822 | |
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 | |
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05 | |
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9272727272727274,3.2567740901074234e-06 | |
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08 | |
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228 | |
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293 | |
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05 | |
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.5272727272727272,0.02638447971781305 | |
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05 | |
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.4403855060505442,0.06091869077971648 | |
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.587180674734059,0.01246215829454031 | |
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822 | |
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.45454545454545453,0.06017015392015392 | |
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.38181818181818183,0.12097096961680295 | |
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6363636363636364,0.005707170915504249 | |
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.587180674734059,0.01246215829454031 | |
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 | |
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7090909090909091,0.0015912097162097162 | |
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162 | |
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7090909090909091,0.0015912097162097162 | |
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 | |
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,0,1.0,5.010421677088344e-08 | |
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,1,1.0,5.010421677088344e-08 | |
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08 | |
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,3,1.0,5.010421677088344e-08 | |
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,4,1.0,5.010421677088344e-08 | |
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8545454545454545,4.624619207952541e-05 | |
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 | |
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8181818181818182,0.00013227513227513228 | |
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228 | |
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8181818181818182,0.00013227513227513228 | |
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7818181818181819,0.0003334435626102293 | |
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822 | |
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162 | |
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.4909090909090909,0.04053235730319064 | |
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8181818181818182,0.00013227513227513228 | |
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6238794669049377,0.007931923532795268 | |
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6605782590758164,0.004936818556325077 | |
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.4403855060505442,0.06091869077971648 | |
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7090909090909091,0.0015912097162097162 | |
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.697277051246695,0.003004262239398284 | |
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8545454545454545,4.624619207952541e-05 | |
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 | |
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8181818181818182,0.00013227513227513228 | |
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7818181818181819,0.0003334435626102293 | |
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 | |
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6238794669049377,0.007931923532795268 | |
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.4403855060505442,0.06091869077971648 | |
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.5636363636363636,0.016540504248837583 | |
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6363636363636364,0.005707170915504249 | |
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5636363636363636,0.016540504248837583 | |
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005 | |
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.5636363636363636,0.016540504248837583 | |
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6000000000000001,0.00994553671637005 | |
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005 | |
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5636363636363636,0.016540504248837583 | |
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7339758434175737,0.0017872890369872653 | |
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.587180674734059,0.01246215829454031 | |
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6482593132545567,0.006117582447622459 | |
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.759389481241052,0.0013210471654040124 | |
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.759389481241052,0.0013210471654040124 | |
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6363636363636364,0.005707170915504249 | |
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7090909090909091,0.0015912097162097162 | |
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6727272727272727,0.0031063111271444604 | |
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.2727272727272727,0.2829668209876543 | |
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5272727272727272,0.02638447971781305 | |
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.38181818181818183,0.12097096961680295 | |
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.4909090909090909,0.04053235730319064 | |
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.38895558795273394,0.10000137830747906 | |
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.38181818181818183,0.12097096961680295 | |
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.697277051246695,0.003004262239398284 | |
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.4909090909090909,0.04053235730319064 | |
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.41818181818181815,0.08656124739458072 | |
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6000000000000001,0.00994553671637005 | |
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005 | |
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6000000000000001,0.00994553671637005 | |
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228 | |
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8807710121010884,0.00017812930545546289 | |
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8807710121010884,0.00017812930545546289 | |
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8807710121010884,0.00017812930545546289 | |
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05 | |
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822 | |
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 | |
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05 | |
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 | |
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05 | |
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.4770842982214229,0.042330229121360724 | |
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604 | |
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6605782590758164,0.004936818556325077 | |
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8440722199302099,0.0003281542287518694 | |
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7339758434175737,0.0017872890369872653 | |
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.9636363636363636,5.511463844797178e-07 | |
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604 | |
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7818181818181819,0.0003334435626102293 | |
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 | |
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293 | |
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 | |
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822 | |
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8181818181818182,0.00013227513227513228 | |
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05 | |
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05 | |
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.4909090909090909,0.04053235730319064 | |
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6363636363636364,0.005707170915504249 | |
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7454545454545454,0.000759529822029822 | |
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 | |
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822 | |
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.45454545454545453,0.06017015392015392 | |
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 | |
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6363636363636364,0.005707170915504249 | |
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.9272727272727274,3.2567740901074234e-06 | |
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05 | |
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005 | |
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7090909090909091,0.0015912097162097162 | |
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6727272727272727,0.0031063111271444604 | |
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.4909090909090909,0.04053235730319064 | |
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6727272727272727,0.0031063111271444604 | |
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05 | |
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 | |
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 | |
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05 | |
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8181818181818182,0.00013227513227513228 | |
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822 | |
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 | |
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05 | |
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822 | |
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822 | |
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822 | |
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 | |
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7818181818181819,0.0003334435626102293 | |
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822 | |
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822 | |
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,1.0,5.010421677088344e-08 | |
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07 | |
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 | |
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.9636363636363636,5.511463844797178e-07 | |
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,1.0,5.010421677088344e-08 | |
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,1.0,5.010421677088344e-08 | |
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,1.0,5.010421677088344e-08 | |
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.9272727272727274,3.2567740901074234e-06 | |
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.9636363636363636,5.511463844797178e-07 | |
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,1.0,5.010421677088344e-08 | |
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.9272727272727274,3.2567740901074234e-06 | |
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07 | |
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 | |
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228 | |
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.9272727272727274,3.2567740901074234e-06 | |
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.9636363636363636,5.511463844797178e-07 | |
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07 | |
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 | |
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05 | |
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.9636363636363636,5.511463844797178e-07 | |
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,0,1.0,5.010421677088344e-08 | |
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,1,1.0,5.010421677088344e-08 | |
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08 | |
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,3,1.0,5.010421677088344e-08 | |
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,4,1.0,5.010421677088344e-08 | |
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.45454545454545453,0.06017015392015392 | |
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.6000000000000001,0.00994553671637005 | |
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.3090909090909091,0.21834651074234407 | |
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.5272727272727272,0.02638447971781305 | |
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.38181818181818183,0.12097096961680295 | |
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005 | |
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.5636363636363636,0.016540504248837583 | |
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.5272727272727272,0.02638447971781305 | |
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005 | |
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.4909090909090909,0.04053235730319064 | |
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.45454545454545453,0.06017015392015392 | |
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.6000000000000001,0.00994553671637005 | |
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.34545454545454546,0.16457331248997917 | |
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.41818181818181815,0.08656124739458072 | |
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.2727272727272727,0.2829668209876543 | |
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.5741725345968929,0.015177848122929492 | |
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.3519121986239021,0.1366995137219537 | |
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.42599897728156577,0.07162425926742408 | |
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.38181818181818183,0.12097096961680295 | |
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.4403855060505442,0.06091869077971648 | |
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 | |
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 | |
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.6727272727272727,0.0031063111271444604 | |
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822 | |
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.6727272727272727,0.0031063111271444604 | |
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005 | |
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.5272727272727272,0.02638447971781305 | |
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.3090909090909091,0.21834651074234407 | |
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.45454545454545453,0.06017015392015392 | |
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.2727272727272727,0.2829668209876543 | |
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.4403855060505442,0.06091869077971648 | |
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.38181818181818183,0.12097096961680295 | |
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.4403855060505442,0.06091869077971648 | |
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.45454545454545453,0.06017015392015392 | |
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.36698792170878686,0.11834981273562825 | |
BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,0,0.2,0.4453821448613115 | |
BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,1,0.38181818181818183,0.12097096961680295 | |
BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,2,0.41818181818181815,0.08656124739458072 | |
BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,3,0.5272727272727272,0.02638447971781305 | |
BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,4,0.5272727272727272,0.02638447971781305 | |
BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228 | |
BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 | |
BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 | |
BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228 | |
BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 | |
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822 | |
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 | |
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8181818181818182,0.00013227513227513228 | |
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8073734277593311,0.0005907573118657002 | |
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822 | |
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.587180674734059,0.01246215829454031 | |
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6482593132545567,0.006117582447622459 | |
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05 | |
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7706746355884524,0.0010393630991335228 | |
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5371291452680612,0.02311942970946668 | |
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6238794669049377,0.007931923532795268 | |
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.4909090909090909,0.04053235730319064 | |
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8440722199302099,0.0003281542287518694 | |
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822 | |
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.36698792170878686,0.11834981273562825 | |
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05 | |
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9272727272727274,3.2567740901074234e-06 | |
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08 | |
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7090909090909091,0.0015912097162097162 | |
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6727272727272727,0.0031063111271444604 | |
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228 | |
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8073734277593311,0.0005907573118657002 | |
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 | |
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7818181818181819,0.0003334435626102293 | |
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6605782590758164,0.004936818556325077 | |
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,-0.0909090909090909,0.7611503928170594 | |
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.07339758434175737,0.7547764265871044 | |
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.4403855060505442,0.06091869077971648 | |
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.3302891295379082,0.15985367483762747 | |
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.1272727272727273,0.6480954385121052 | |
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 | |
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 | |
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8705196492275474,0.00023202582506637044 | |
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7479575920067658,0.001637274718449882 | |
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5983660736054126,0.01175728488671479 | |
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 | |
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6000000000000001,0.00994553671637005 | |
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162 | |
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822 | |
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5272727272727272,0.02638447971781305 | |
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7706746355884524,0.0010393630991335228 | |
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8909090909090909,1.3728555395222063e-05 | |
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.5272727272727272,0.02638447971781305 | |
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822 | |
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293 | |
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.7818181818181819,0.0003334435626102293 | |
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.8545454545454545,4.624619207952541e-05 | |
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7818181818181819,0.0003334435626102293 | |
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 | |
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293 | |
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.6731618328060892,0.004677734981047257 | |
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.759389481241052,0.0013210471654040124 | |
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7339758434175737,0.0017872890369872653 | |
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.6238794669049377,0.007931923532795268 | |
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 | |
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.6363636363636364,0.005707170915504249 | |
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822 | |
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7706746355884524,0.0010393630991335228 | |
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228 | |
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7706746355884524,0.0010393630991335228 | |
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 | |
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.8909090909090909,1.3728555395222063e-05 | |
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7818181818181819,0.0003334435626102293 | |
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 | |
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05 | |
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.5636363636363636,0.016540504248837583 | |
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822 | |
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.6363636363636364,0.005707170915504249 | |
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.7818181818181819,0.0003334435626102293 | |
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.6363636363636364,0.005707170915504249 | |
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 | |
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 | |
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.5636363636363636,0.016540504248837583 | |
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 | |
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822 | |
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.6727272727272727,0.0031063111271444604 | |
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822 | |
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162 | |
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005 | |
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,0,0.2778254199662385,0.2400384567875128 | |
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,1,0.40368671387966554,0.08581278065055217 | |
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,2,0.42599897728156577,0.07162425926742408 | |
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,3,0.2778254199662385,0.2400384567875128 | |
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,4,0.36698792170878686,0.11834981273562825 | |
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,0,-0.018181818181818184,1.0 | |
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,1,-0.018181818181818184,1.0 | |
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,2,-0.05454545454545454,0.8792698312489979 | |
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,3,-0.018181818181818184,1.0 | |
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,4,-0.1272727272727273,0.6480954385121052 | |
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,0,0.05454545454545454,0.8792698312489979 | |
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,1,-0.018181818181818184,1.0 | |
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,2,-0.018181818181818184,1.0 | |
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,3,0.05454545454545454,0.8792698312489979 | |
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,4,-0.05454545454545454,0.8792698312489979 | |
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,0,0.05454545454545454,0.8792698312489979 | |
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,1,0.23636363636363636,0.3587114698573032 | |
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,2,0.2,0.4453821448613115 | |
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,3,0.05454545454545454,0.8792698312489979 | |
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,4,0.1272727272727273,0.6480954385121052 | |
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,0,0.587180674734059,0.01246215829454031 | |
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604 | |
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,2,0.697277051246695,0.003004262239398284 | |
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,3,0.587180674734059,0.01246215829454031 | |
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,4,0.6605782590758164,0.004936818556325077 | |
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005 | |
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604 | |
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005 | |
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,4,0.6363636363636364,0.005707170915504249 | |
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,0,0.2727272727272727,0.2829668209876543 | |
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,1,0.34545454545454546,0.16457331248997917 | |
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,2,0.38181818181818183,0.12097096961680295 | |
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,3,0.2727272727272727,0.2829668209876543 | |
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,4,0.34545454545454546,0.16457331248997917 | |
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,0,0.6363636363636364,0.005707170915504249 | |
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604 | |
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,2,0.6363636363636364,0.005707170915504249 | |
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,3,0.6363636363636364,0.005707170915504249 | |
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,0,0.18349396085439343,0.43487965849578336 | |
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,1,0.2935903373670295,0.21152242941072896 | |
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,2,0.2727272727272727,0.2829668209876543 | |
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,3,0.18349396085439343,0.43487965849578336 | |
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,4,0.2568915451961508,0.27429882739587574 | |
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,0,0.4909090909090909,0.04053235730319064 | |
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,1,0.6000000000000001,0.00994553671637005 | |
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,2,0.5636363636363636,0.016540504248837583 | |
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,3,0.4909090909090909,0.04053235730319064 | |
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,4,0.5636363636363636,0.016540504248837583 | |
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,0,0.34545454545454546,0.16457331248997917 | |
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,1,0.41818181818181815,0.08656124739458072 | |
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,2,0.4909090909090909,0.04053235730319064 | |
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,3,0.34545454545454546,0.16457331248997917 | |
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,4,0.34545454545454546,0.16457331248997917 | |
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.9272727272727274,3.2567740901074234e-06 | |
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.8545454545454545,4.624619207952541e-05 | |
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.9272727272727274,3.2567740901074234e-06 | |
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 | |
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293 | |
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08 | |
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.6727272727272727,0.0031063111271444604 | |
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.45454545454545453,0.06017015392015392 | |
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.41818181818181815,0.08656124739458072 | |
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.45454545454545453,0.06017015392015392 | |
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.5272727272727272,0.02638447971781305 | |
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.5636363636363636,0.016540504248837583 | |
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05 | |
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.9272727272727274,3.2567740901074234e-06 | |
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08 | |
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293 | |
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05 | |
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.5272727272727272,0.02638447971781305 | |
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05 | |
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.4403855060505442,0.06091869077971648 | |
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.587180674734059,0.01246215829454031 | |
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.45454545454545453,0.06017015392015392 | |
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.38181818181818183,0.12097096961680295 | |
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.6363636363636364,0.005707170915504249 | |
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.587180674734059,0.01246215829454031 | |
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,1,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,3,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,0,1.0,5.010421677088344e-08 | |
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,1,1.0,5.010421677088344e-08 | |
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08 | |
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,3,1.0,5.010421677088344e-08 | |
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,4,1.0,5.010421677088344e-08 | |
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,0,0.8545454545454545,4.624619207952541e-05 | |
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 | |
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,2,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,4,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,0,0.7818181818181819,0.0003334435626102293 | |
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,3,0.4909090909090909,0.04053235730319064 | |
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,4,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,0,0.6238794669049377,0.007931923532795268 | |
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,1,0.6605782590758164,0.004936818556325077 | |
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,2,0.4403855060505442,0.06091869077971648 | |
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,3,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,4,0.697277051246695,0.003004262239398284 | |
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,0,0.8545454545454545,4.624619207952541e-05 | |
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 | |
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,2,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,3,0.7818181818181819,0.0003334435626102293 | |
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,0,0.6238794669049377,0.007931923532795268 | |
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,1,0.4403855060505442,0.06091869077971648 | |
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,2,0.5636363636363636,0.016540504248837583 | |
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,3,0.6363636363636364,0.005707170915504249 | |
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,4,0.5636363636363636,0.016540504248837583 | |
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005 | |
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,1,0.5636363636363636,0.016540504248837583 | |
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,2,0.6000000000000001,0.00994553671637005 | |
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005 | |
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,4,0.5636363636363636,0.016540504248837583 | |
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,0,0.7339758434175737,0.0017872890369872653 | |
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,1,0.587180674734059,0.01246215829454031 | |
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,2,0.6482593132545567,0.006117582447622459 | |
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,3,0.759389481241052,0.0013210471654040124 | |
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,4,0.759389481241052,0.0013210471654040124 | |
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,0,0.6363636363636364,0.005707170915504249 | |
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,1,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,2,0.6727272727272727,0.0031063111271444604 | |
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,3,0.2727272727272727,0.2829668209876543 | |
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,4,0.5272727272727272,0.02638447971781305 | |
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,0,0.38181818181818183,0.12097096961680295 | |
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,1,0.4909090909090909,0.04053235730319064 | |
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,2,0.38895558795273394,0.10000137830747906 | |
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,3,0.38181818181818183,0.12097096961680295 | |
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,4,0.697277051246695,0.003004262239398284 | |
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,0,0.4909090909090909,0.04053235730319064 | |
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,1,0.41818181818181815,0.08656124739458072 | |
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,2,0.6000000000000001,0.00994553671637005 | |
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005 | |
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,4,0.6000000000000001,0.00994553671637005 | |
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,1,0.8807710121010884,0.00017812930545546289 | |
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,2,0.8807710121010884,0.00017812930545546289 | |
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,3,0.8807710121010884,0.00017812930545546289 | |
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05 | |
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05 | |
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 | |
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05 | |
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,0,0.4770842982214229,0.042330229121360724 | |
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604 | |
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,2,0.6605782590758164,0.004936818556325077 | |
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,3,0.8440722199302099,0.0003281542287518694 | |
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,4,0.7339758434175737,0.0017872890369872653 | |
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,0,0.9636363636363636,5.511463844797178e-07 | |
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604 | |
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,2,0.7818181818181819,0.0003334435626102293 | |
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 | |
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293 | |
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,2,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05 | |
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05 | |
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,0,0.4909090909090909,0.04053235730319064 | |
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,1,0.6363636363636364,0.005707170915504249 | |
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,2,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 | |
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,0,0.45454545454545453,0.06017015392015392 | |
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,2,0.6363636363636364,0.005707170915504249 | |
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,3,0.9272727272727274,3.2567740901074234e-06 | |
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05 | |
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005 | |
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,1,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,2,0.6727272727272727,0.0031063111271444604 | |
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,3,0.4909090909090909,0.04053235730319064 | |
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,4,0.6727272727272727,0.0031063111271444604 | |
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05 | |
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 | |
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05 | |
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,4,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05 | |
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 | |
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,2,0.7818181818181819,0.0003334435626102293 | |
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,0,1.0,5.010421677088344e-08 | |
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07 | |
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 | |
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,3,0.9636363636363636,5.511463844797178e-07 | |
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,4,1.0,5.010421677088344e-08 | |
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,0,1.0,5.010421677088344e-08 | |
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,1,1.0,5.010421677088344e-08 | |
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,2,0.9272727272727274,3.2567740901074234e-06 | |
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,3,0.9636363636363636,5.511463844797178e-07 | |
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,4,1.0,5.010421677088344e-08 | |
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,0,0.9272727272727274,3.2567740901074234e-06 | |
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07 | |
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 | |
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,4,0.9272727272727274,3.2567740901074234e-06 | |
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,0,0.9636363636363636,5.511463844797178e-07 | |
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07 | |
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 | |
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05 | |
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,4,0.9636363636363636,5.511463844797178e-07 | |
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,0,1.0,5.010421677088344e-08 | |
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,1,1.0,5.010421677088344e-08 | |
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08 | |
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,3,1.0,5.010421677088344e-08 | |
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,4,1.0,5.010421677088344e-08 | |
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.45454545454545453,0.06017015392015392 | |
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.6000000000000001,0.00994553671637005 | |
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.3090909090909091,0.21834651074234407 | |
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.5272727272727272,0.02638447971781305 | |
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.38181818181818183,0.12097096961680295 | |
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005 | |
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.5636363636363636,0.016540504248837583 | |
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.5272727272727272,0.02638447971781305 | |
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005 | |
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.4909090909090909,0.04053235730319064 | |
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.45454545454545453,0.06017015392015392 | |
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.6000000000000001,0.00994553671637005 | |
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.34545454545454546,0.16457331248997917 | |
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.41818181818181815,0.08656124739458072 | |
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.2727272727272727,0.2829668209876543 | |
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.5741725345968929,0.015177848122929492 | |
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.3519121986239021,0.1366995137219537 | |
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.42599897728156577,0.07162425926742408 | |
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.38181818181818183,0.12097096961680295 | |
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.4403855060505442,0.06091869077971648 | |
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 | |
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.6727272727272727,0.0031063111271444604 | |
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.6727272727272727,0.0031063111271444604 | |
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005 | |
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.5272727272727272,0.02638447971781305 | |
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.3090909090909091,0.21834651074234407 | |
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.45454545454545453,0.06017015392015392 | |
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.2727272727272727,0.2829668209876543 | |
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.4403855060505442,0.06091869077971648 | |
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.38181818181818183,0.12097096961680295 | |
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.4403855060505442,0.06091869077971648 | |
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.45454545454545453,0.06017015392015392 | |
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.36698792170878686,0.11834981273562825 | |
aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,0,0.2,0.4453821448613115 | |
aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,1,0.38181818181818183,0.12097096961680295 | |
aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,2,0.41818181818181815,0.08656124739458072 | |
aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,3,0.5272727272727272,0.02638447971781305 | |
aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,4,0.5272727272727272,0.02638447971781305 | |
aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 | |
aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 | |
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,2,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,3,0.8073734277593311,0.0005907573118657002 | |
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,0,0.587180674734059,0.01246215829454031 | |
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,1,0.6482593132545567,0.006117582447622459 | |
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05 | |
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,3,0.7706746355884524,0.0010393630991335228 | |
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,4,0.5371291452680612,0.02311942970946668 | |
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,0,0.6238794669049377,0.007931923532795268 | |
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,1,0.4909090909090909,0.04053235730319064 | |
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,2,0.8440722199302099,0.0003281542287518694 | |
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,4,0.36698792170878686,0.11834981273562825 | |
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05 | |
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,1,0.9272727272727274,3.2567740901074234e-06 | |
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08 | |
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,3,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,4,0.6727272727272727,0.0031063111271444604 | |
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,1,0.8073734277593311,0.0005907573118657002 | |
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 | |
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,3,0.7818181818181819,0.0003334435626102293 | |
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,4,0.6605782590758164,0.004936818556325077 | |
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,0,-0.0909090909090909,0.7611503928170594 | |
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,1,0.07339758434175737,0.7547764265871044 | |
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,2,0.4403855060505442,0.06091869077971648 | |
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,3,0.3302891295379082,0.15985367483762747 | |
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,4,0.1272727272727273,0.6480954385121052 | |
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 | |
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,2,0.8705196492275474,0.00023202582506637044 | |
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,3,0.7479575920067658,0.001637274718449882 | |
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,4,0.5983660736054126,0.01175728488671479 | |
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,1,0.6000000000000001,0.00994553671637005 | |
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,4,0.5272727272727272,0.02638447971781305 | |
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,0,0.7706746355884524,0.0010393630991335228 | |
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,1,0.8909090909090909,1.3728555395222063e-05 | |
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,2,0.5272727272727272,0.02638447971781305 | |
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293 | |
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,0,0.7818181818181819,0.0003334435626102293 | |
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,1,0.8545454545454545,4.624619207952541e-05 | |
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,2,0.7818181818181819,0.0003334435626102293 | |
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 | |
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293 | |
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,0,0.6731618328060892,0.004677734981047257 | |
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,1,0.759389481241052,0.0013210471654040124 | |
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,2,0.7339758434175737,0.0017872890369872653 | |
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,3,0.6238794669049377,0.007931923532795268 | |
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,0,0.6363636363636364,0.005707170915504249 | |
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,2,0.7706746355884524,0.0010393630991335228 | |
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,4,0.7706746355884524,0.0010393630991335228 | |
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,1,0.8909090909090909,1.3728555395222063e-05 | |
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,2,0.7818181818181819,0.0003334435626102293 | |
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 | |
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05 | |
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,0,0.5636363636363636,0.016540504248837583 | |
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,2,0.6363636363636364,0.005707170915504249 | |
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,3,0.7818181818181819,0.0003334435626102293 | |
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,4,0.6363636363636364,0.005707170915504249 | |
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 | |
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,2,0.5636363636363636,0.016540504248837583 | |
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 | |
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,0,0.6727272727272727,0.0031063111271444604 | |
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822 | |
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162 | |
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005 | |
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822 | |