benchbench / cache /agreements_cache_741f08262e15cba4bd6c8b25f2b138ca.csv
Yotam-Perlitz
add cache
b9a3a5d
raw
history blame
85.2 kB
scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.2778254199662385,0.2400384567875128
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.40368671387966554,0.08581278065055217
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.42599897728156577,0.07162425926742408
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.2778254199662385,0.2400384567875128
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.36698792170878686,0.11834981273562825
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,-0.018181818181818184,1.0
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,-0.018181818181818184,1.0
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,-0.05454545454545454,0.8792698312489979
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,-0.018181818181818184,1.0
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,-0.1272727272727273,0.6480954385121052
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.05454545454545454,0.8792698312489979
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,-0.018181818181818184,1.0
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,-0.018181818181818184,1.0
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.05454545454545454,0.8792698312489979
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,-0.05454545454545454,0.8792698312489979
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.05454545454545454,0.8792698312489979
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.23636363636363636,0.3587114698573032
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.2,0.4453821448613115
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.05454545454545454,0.8792698312489979
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.1272727272727273,0.6480954385121052
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.587180674734059,0.01246215829454031
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.697277051246695,0.003004262239398284
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.587180674734059,0.01246215829454031
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6605782590758164,0.004936818556325077
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6363636363636364,0.005707170915504249
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.2727272727272727,0.2829668209876543
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.34545454545454546,0.16457331248997917
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.38181818181818183,0.12097096961680295
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.2727272727272727,0.2829668209876543
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.34545454545454546,0.16457331248997917
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6363636363636364,0.005707170915504249
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6363636363636364,0.005707170915504249
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6363636363636364,0.005707170915504249
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.18349396085439343,0.43487965849578336
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.2935903373670295,0.21152242941072896
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.2727272727272727,0.2829668209876543
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.18349396085439343,0.43487965849578336
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.2568915451961508,0.27429882739587574
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.4909090909090909,0.04053235730319064
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6000000000000001,0.00994553671637005
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.5636363636363636,0.016540504248837583
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.4909090909090909,0.04053235730319064
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5636363636363636,0.016540504248837583
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.34545454545454546,0.16457331248997917
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.41818181818181815,0.08656124739458072
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.4909090909090909,0.04053235730319064
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.34545454545454546,0.16457331248997917
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.34545454545454546,0.16457331248997917
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.9272727272727274,3.2567740901074234e-06
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8545454545454545,4.624619207952541e-05
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.9272727272727274,3.2567740901074234e-06
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6727272727272727,0.0031063111271444604
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.45454545454545453,0.06017015392015392
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.41818181818181815,0.08656124739458072
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.45454545454545453,0.06017015392015392
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5272727272727272,0.02638447971781305
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.5636363636363636,0.016540504248837583
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7454545454545454,0.000759529822029822
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9272727272727274,3.2567740901074234e-06
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.5272727272727272,0.02638447971781305
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.4403855060505442,0.06091869077971648
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.587180674734059,0.01246215829454031
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.45454545454545453,0.06017015392015392
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.38181818181818183,0.12097096961680295
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6363636363636364,0.005707170915504249
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.587180674734059,0.01246215829454031
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7090909090909091,0.0015912097162097162
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7090909090909091,0.0015912097162097162
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,0,1.0,5.010421677088344e-08
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,1,1.0,5.010421677088344e-08
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,3,1.0,5.010421677088344e-08
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,4,1.0,5.010421677088344e-08
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8545454545454545,4.624619207952541e-05
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8181818181818182,0.00013227513227513228
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7818181818181819,0.0003334435626102293
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.4909090909090909,0.04053235730319064
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8181818181818182,0.00013227513227513228
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6238794669049377,0.007931923532795268
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6605782590758164,0.004936818556325077
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.4403855060505442,0.06091869077971648
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7090909090909091,0.0015912097162097162
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.697277051246695,0.003004262239398284
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8545454545454545,4.624619207952541e-05
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7818181818181819,0.0003334435626102293
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6238794669049377,0.007931923532795268
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.4403855060505442,0.06091869077971648
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.5636363636363636,0.016540504248837583
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6363636363636364,0.005707170915504249
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5636363636363636,0.016540504248837583
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.5636363636363636,0.016540504248837583
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6000000000000001,0.00994553671637005
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5636363636363636,0.016540504248837583
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7339758434175737,0.0017872890369872653
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.587180674734059,0.01246215829454031
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6482593132545567,0.006117582447622459
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.759389481241052,0.0013210471654040124
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.759389481241052,0.0013210471654040124
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6363636363636364,0.005707170915504249
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7090909090909091,0.0015912097162097162
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6727272727272727,0.0031063111271444604
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.2727272727272727,0.2829668209876543
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5272727272727272,0.02638447971781305
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.38181818181818183,0.12097096961680295
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.4909090909090909,0.04053235730319064
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.38895558795273394,0.10000137830747906
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.38181818181818183,0.12097096961680295
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.697277051246695,0.003004262239398284
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.4909090909090909,0.04053235730319064
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.41818181818181815,0.08656124739458072
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6000000000000001,0.00994553671637005
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6000000000000001,0.00994553671637005
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8807710121010884,0.00017812930545546289
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8807710121010884,0.00017812930545546289
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8807710121010884,0.00017812930545546289
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.4770842982214229,0.042330229121360724
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6605782590758164,0.004936818556325077
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8440722199302099,0.0003281542287518694
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7339758434175737,0.0017872890369872653
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.9636363636363636,5.511463844797178e-07
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.4909090909090909,0.04053235730319064
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6363636363636364,0.005707170915504249
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7454545454545454,0.000759529822029822
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.45454545454545453,0.06017015392015392
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6363636363636364,0.005707170915504249
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.9272727272727274,3.2567740901074234e-06
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7090909090909091,0.0015912097162097162
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6727272727272727,0.0031063111271444604
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.4909090909090909,0.04053235730319064
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6727272727272727,0.0031063111271444604
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8181818181818182,0.00013227513227513228
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,1.0,5.010421677088344e-08
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.9636363636363636,5.511463844797178e-07
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,1.0,5.010421677088344e-08
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,1.0,5.010421677088344e-08
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,1.0,5.010421677088344e-08
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.9272727272727274,3.2567740901074234e-06
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.9636363636363636,5.511463844797178e-07
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,1.0,5.010421677088344e-08
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.9272727272727274,3.2567740901074234e-06
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.9272727272727274,3.2567740901074234e-06
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.9636363636363636,5.511463844797178e-07
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.9636363636363636,5.511463844797178e-07
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,0,1.0,5.010421677088344e-08
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,1,1.0,5.010421677088344e-08
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,3,1.0,5.010421677088344e-08
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,4,1.0,5.010421677088344e-08
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.45454545454545453,0.06017015392015392
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.6000000000000001,0.00994553671637005
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.3090909090909091,0.21834651074234407
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.5272727272727272,0.02638447971781305
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.38181818181818183,0.12097096961680295
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.5636363636363636,0.016540504248837583
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.5272727272727272,0.02638447971781305
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.4909090909090909,0.04053235730319064
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.45454545454545453,0.06017015392015392
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.6000000000000001,0.00994553671637005
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.34545454545454546,0.16457331248997917
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.41818181818181815,0.08656124739458072
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.2727272727272727,0.2829668209876543
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.5741725345968929,0.015177848122929492
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.3519121986239021,0.1366995137219537
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.42599897728156577,0.07162425926742408
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.38181818181818183,0.12097096961680295
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.4403855060505442,0.06091869077971648
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.6727272727272727,0.0031063111271444604
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.6727272727272727,0.0031063111271444604
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.5272727272727272,0.02638447971781305
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.3090909090909091,0.21834651074234407
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.45454545454545453,0.06017015392015392
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.2727272727272727,0.2829668209876543
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.4403855060505442,0.06091869077971648
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.38181818181818183,0.12097096961680295
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.4403855060505442,0.06091869077971648
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.45454545454545453,0.06017015392015392
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.36698792170878686,0.11834981273562825
BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,0,0.2,0.4453821448613115
BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,1,0.38181818181818183,0.12097096961680295
BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,2,0.41818181818181815,0.08656124739458072
BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,3,0.5272727272727272,0.02638447971781305
BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,4,0.5272727272727272,0.02638447971781305
BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8073734277593311,0.0005907573118657002
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.587180674734059,0.01246215829454031
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6482593132545567,0.006117582447622459
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7706746355884524,0.0010393630991335228
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5371291452680612,0.02311942970946668
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6238794669049377,0.007931923532795268
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.4909090909090909,0.04053235730319064
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8440722199302099,0.0003281542287518694
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.36698792170878686,0.11834981273562825
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9272727272727274,3.2567740901074234e-06
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7090909090909091,0.0015912097162097162
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6727272727272727,0.0031063111271444604
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8073734277593311,0.0005907573118657002
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7818181818181819,0.0003334435626102293
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6605782590758164,0.004936818556325077
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,-0.0909090909090909,0.7611503928170594
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.07339758434175737,0.7547764265871044
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.4403855060505442,0.06091869077971648
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.3302891295379082,0.15985367483762747
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.1272727272727273,0.6480954385121052
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8705196492275474,0.00023202582506637044
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7479575920067658,0.001637274718449882
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5983660736054126,0.01175728488671479
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6000000000000001,0.00994553671637005
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5272727272727272,0.02638447971781305
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7706746355884524,0.0010393630991335228
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8909090909090909,1.3728555395222063e-05
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.5272727272727272,0.02638447971781305
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.7818181818181819,0.0003334435626102293
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.8545454545454545,4.624619207952541e-05
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.6731618328060892,0.004677734981047257
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.759389481241052,0.0013210471654040124
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7339758434175737,0.0017872890369872653
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.6238794669049377,0.007931923532795268
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.6363636363636364,0.005707170915504249
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7706746355884524,0.0010393630991335228
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7706746355884524,0.0010393630991335228
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.8909090909090909,1.3728555395222063e-05
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.5636363636363636,0.016540504248837583
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.6363636363636364,0.005707170915504249
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.7818181818181819,0.0003334435626102293
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.6363636363636364,0.005707170915504249
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.5636363636363636,0.016540504248837583
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.6727272727272727,0.0031063111271444604
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,0,0.2778254199662385,0.2400384567875128
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,1,0.40368671387966554,0.08581278065055217
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,2,0.42599897728156577,0.07162425926742408
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,3,0.2778254199662385,0.2400384567875128
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,4,0.36698792170878686,0.11834981273562825
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,0,-0.018181818181818184,1.0
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,1,-0.018181818181818184,1.0
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,2,-0.05454545454545454,0.8792698312489979
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,3,-0.018181818181818184,1.0
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,4,-0.1272727272727273,0.6480954385121052
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,0,0.05454545454545454,0.8792698312489979
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,1,-0.018181818181818184,1.0
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,2,-0.018181818181818184,1.0
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,3,0.05454545454545454,0.8792698312489979
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,4,-0.05454545454545454,0.8792698312489979
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,0,0.05454545454545454,0.8792698312489979
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,1,0.23636363636363636,0.3587114698573032
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,2,0.2,0.4453821448613115
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,3,0.05454545454545454,0.8792698312489979
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,4,0.1272727272727273,0.6480954385121052
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,0,0.587180674734059,0.01246215829454031
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,2,0.697277051246695,0.003004262239398284
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,3,0.587180674734059,0.01246215829454031
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,4,0.6605782590758164,0.004936818556325077
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,4,0.6363636363636364,0.005707170915504249
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,0,0.2727272727272727,0.2829668209876543
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,1,0.34545454545454546,0.16457331248997917
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,2,0.38181818181818183,0.12097096961680295
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,3,0.2727272727272727,0.2829668209876543
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,4,0.34545454545454546,0.16457331248997917
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,0,0.6363636363636364,0.005707170915504249
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,2,0.6363636363636364,0.005707170915504249
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,3,0.6363636363636364,0.005707170915504249
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,0,0.18349396085439343,0.43487965849578336
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,1,0.2935903373670295,0.21152242941072896
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,2,0.2727272727272727,0.2829668209876543
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,3,0.18349396085439343,0.43487965849578336
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,4,0.2568915451961508,0.27429882739587574
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,0,0.4909090909090909,0.04053235730319064
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,1,0.6000000000000001,0.00994553671637005
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,2,0.5636363636363636,0.016540504248837583
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,3,0.4909090909090909,0.04053235730319064
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,4,0.5636363636363636,0.016540504248837583
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,0,0.34545454545454546,0.16457331248997917
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,1,0.41818181818181815,0.08656124739458072
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,2,0.4909090909090909,0.04053235730319064
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,3,0.34545454545454546,0.16457331248997917
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,4,0.34545454545454546,0.16457331248997917
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.9272727272727274,3.2567740901074234e-06
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.9272727272727274,3.2567740901074234e-06
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.45454545454545453,0.06017015392015392
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.41818181818181815,0.08656124739458072
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.45454545454545453,0.06017015392015392
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.5272727272727272,0.02638447971781305
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.5636363636363636,0.016540504248837583
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.7454545454545454,0.000759529822029822
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.9272727272727274,3.2567740901074234e-06
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.5272727272727272,0.02638447971781305
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.4403855060505442,0.06091869077971648
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.587180674734059,0.01246215829454031
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.45454545454545453,0.06017015392015392
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.38181818181818183,0.12097096961680295
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.6363636363636364,0.005707170915504249
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.587180674734059,0.01246215829454031
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,1,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,3,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,0,1.0,5.010421677088344e-08
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,1,1.0,5.010421677088344e-08
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,3,1.0,5.010421677088344e-08
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,4,1.0,5.010421677088344e-08
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,0,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,4,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,0,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,3,0.4909090909090909,0.04053235730319064
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,4,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,0,0.6238794669049377,0.007931923532795268
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,1,0.6605782590758164,0.004936818556325077
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,2,0.4403855060505442,0.06091869077971648
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,3,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,4,0.697277051246695,0.003004262239398284
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,0,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,3,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,0,0.6238794669049377,0.007931923532795268
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,1,0.4403855060505442,0.06091869077971648
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,2,0.5636363636363636,0.016540504248837583
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,3,0.6363636363636364,0.005707170915504249
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,4,0.5636363636363636,0.016540504248837583
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,1,0.5636363636363636,0.016540504248837583
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,2,0.6000000000000001,0.00994553671637005
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,4,0.5636363636363636,0.016540504248837583
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,0,0.7339758434175737,0.0017872890369872653
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,1,0.587180674734059,0.01246215829454031
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,2,0.6482593132545567,0.006117582447622459
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,3,0.759389481241052,0.0013210471654040124
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,4,0.759389481241052,0.0013210471654040124
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,0,0.6363636363636364,0.005707170915504249
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,1,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,2,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,3,0.2727272727272727,0.2829668209876543
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,4,0.5272727272727272,0.02638447971781305
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,0,0.38181818181818183,0.12097096961680295
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,1,0.4909090909090909,0.04053235730319064
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,2,0.38895558795273394,0.10000137830747906
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,3,0.38181818181818183,0.12097096961680295
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,4,0.697277051246695,0.003004262239398284
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,0,0.4909090909090909,0.04053235730319064
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,1,0.41818181818181815,0.08656124739458072
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,2,0.6000000000000001,0.00994553671637005
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,4,0.6000000000000001,0.00994553671637005
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,1,0.8807710121010884,0.00017812930545546289
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,2,0.8807710121010884,0.00017812930545546289
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,3,0.8807710121010884,0.00017812930545546289
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,0,0.4770842982214229,0.042330229121360724
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,2,0.6605782590758164,0.004936818556325077
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,3,0.8440722199302099,0.0003281542287518694
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,4,0.7339758434175737,0.0017872890369872653
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,0,0.9636363636363636,5.511463844797178e-07
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,0,0.4909090909090909,0.04053235730319064
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,1,0.6363636363636364,0.005707170915504249
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,2,0.7454545454545454,0.000759529822029822
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,0,0.45454545454545453,0.06017015392015392
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,2,0.6363636363636364,0.005707170915504249
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,3,0.9272727272727274,3.2567740901074234e-06
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,1,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,2,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,3,0.4909090909090909,0.04053235730319064
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,4,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,4,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,0,1.0,5.010421677088344e-08
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,3,0.9636363636363636,5.511463844797178e-07
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,4,1.0,5.010421677088344e-08
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,0,1.0,5.010421677088344e-08
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,1,1.0,5.010421677088344e-08
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,2,0.9272727272727274,3.2567740901074234e-06
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,3,0.9636363636363636,5.511463844797178e-07
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,4,1.0,5.010421677088344e-08
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,0,0.9272727272727274,3.2567740901074234e-06
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,4,0.9272727272727274,3.2567740901074234e-06
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,0,0.9636363636363636,5.511463844797178e-07
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,4,0.9636363636363636,5.511463844797178e-07
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,0,1.0,5.010421677088344e-08
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,1,1.0,5.010421677088344e-08
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,3,1.0,5.010421677088344e-08
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,4,1.0,5.010421677088344e-08
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.45454545454545453,0.06017015392015392
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.6000000000000001,0.00994553671637005
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.3090909090909091,0.21834651074234407
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.5272727272727272,0.02638447971781305
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.38181818181818183,0.12097096961680295
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.5636363636363636,0.016540504248837583
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.5272727272727272,0.02638447971781305
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.4909090909090909,0.04053235730319064
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.45454545454545453,0.06017015392015392
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.6000000000000001,0.00994553671637005
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.34545454545454546,0.16457331248997917
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.41818181818181815,0.08656124739458072
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.2727272727272727,0.2829668209876543
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.5741725345968929,0.015177848122929492
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.3519121986239021,0.1366995137219537
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.42599897728156577,0.07162425926742408
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.38181818181818183,0.12097096961680295
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.4403855060505442,0.06091869077971648
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.5272727272727272,0.02638447971781305
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.3090909090909091,0.21834651074234407
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.45454545454545453,0.06017015392015392
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.2727272727272727,0.2829668209876543
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.4403855060505442,0.06091869077971648
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.38181818181818183,0.12097096961680295
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.4403855060505442,0.06091869077971648
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.45454545454545453,0.06017015392015392
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.36698792170878686,0.11834981273562825
aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,0,0.2,0.4453821448613115
aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,1,0.38181818181818183,0.12097096961680295
aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,2,0.41818181818181815,0.08656124739458072
aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,3,0.5272727272727272,0.02638447971781305
aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,4,0.5272727272727272,0.02638447971781305
aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,3,0.8073734277593311,0.0005907573118657002
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,0,0.587180674734059,0.01246215829454031
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,1,0.6482593132545567,0.006117582447622459
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,3,0.7706746355884524,0.0010393630991335228
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,4,0.5371291452680612,0.02311942970946668
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,0,0.6238794669049377,0.007931923532795268
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,1,0.4909090909090909,0.04053235730319064
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,2,0.8440722199302099,0.0003281542287518694
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,4,0.36698792170878686,0.11834981273562825
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,1,0.9272727272727274,3.2567740901074234e-06
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,3,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,4,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,1,0.8073734277593311,0.0005907573118657002
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,3,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,4,0.6605782590758164,0.004936818556325077
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,0,-0.0909090909090909,0.7611503928170594
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,1,0.07339758434175737,0.7547764265871044
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,2,0.4403855060505442,0.06091869077971648
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,3,0.3302891295379082,0.15985367483762747
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,4,0.1272727272727273,0.6480954385121052
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,2,0.8705196492275474,0.00023202582506637044
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,3,0.7479575920067658,0.001637274718449882
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,4,0.5983660736054126,0.01175728488671479
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,1,0.6000000000000001,0.00994553671637005
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,4,0.5272727272727272,0.02638447971781305
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,0,0.7706746355884524,0.0010393630991335228
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,1,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,2,0.5272727272727272,0.02638447971781305
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,0,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,1,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,0,0.6731618328060892,0.004677734981047257
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,1,0.759389481241052,0.0013210471654040124
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,2,0.7339758434175737,0.0017872890369872653
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,3,0.6238794669049377,0.007931923532795268
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,0,0.6363636363636364,0.005707170915504249
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,2,0.7706746355884524,0.0010393630991335228
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,4,0.7706746355884524,0.0010393630991335228
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,1,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,0,0.5636363636363636,0.016540504248837583
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,2,0.6363636363636364,0.005707170915504249
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,3,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,4,0.6363636363636364,0.005707170915504249
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,2,0.5636363636363636,0.016540504248837583
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,0,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822