diff --git "a/cache/agreements_cache_151f5bfbf87ac7384c2759731c72ec0c.csv" "b/cache/agreements_cache_151f5bfbf87ac7384c2759731c72ec0c.csv" new file mode 100644--- /dev/null +++ "b/cache/agreements_cache_151f5bfbf87ac7384c2759731c72ec0c.csv" @@ -0,0 +1,1001 @@ +scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value +Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,0,-0.017485869096098686,0.9672206778351959 +Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,1,-0.06826285140114943,0.8724042132624071 +Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,2,-0.27291992568490936,0.5131179718629255 +Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,3,-0.0623085741331382,0.8834734515868299 +Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,4,0.11553071904436202,0.7852997192967395 +Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8743737489954189,0.004501296794893102 +Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8019858294586086,0.01664169341252048 +Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.865218326418788,0.005519059390504801 +Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9324959770534272,0.0007305971150650418 +Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9578331579912773,0.00018155839890573593 +Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,-0.30992157835736617,0.4550353006304514 +Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,-0.48460771469003827,0.2235972811859595 +Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,-0.1162588388208577,0.78397092283469 +Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,-0.03180360013624742,0.9404084479868535 +Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.6310234888301745,0.09339585968843296 +Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5719061307929368,0.1385541569597628 +Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,-0.2953447949582872,0.47758892197811004 +Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,-0.08547114468780825,0.8405203853999355 +Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,-0.02680948636066538,0.9497562944796989 +Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.4016145018471783,0.32402730112296474 +Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7247956777996108,0.04194484960329344 +Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.2767660595168839,0.5069548295866992 +Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.3337223270100439,0.4191769676693079 +Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.6126891094585267,0.10632638977302632 +Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8079257463851817,0.015261307993340337 +Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6647150497002838,0.07212235537894374 +Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9659235574949907,9.641323857066814e-05 +Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8593434484023453,0.0062437049978399314 +Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7704800482268904,0.025262942539415363 +Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9028773381740962,0.002126756432137772 +Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.748982925973149,0.032470780295939985 +Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8599957450436625,0.006160409391629476 +Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8718735582848011,0.004766072993988772 +Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9069576656171551,0.001875739334441522 +Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9502933219669614,0.00029570003340264575 +Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8905328662549648,0.003016032865892646 +Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5058552901713423,0.20090402274559316 +Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6767432630833718,0.0652968761285632 +Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7135518769682414,0.04685902831102101 +Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.873661116609048,0.004575776138454243 +Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8775217778627072,0.004181622363896538 +Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7683490298001087,0.025928082489068475 +Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.741463148953373,0.035258455741147623 +Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7891209052525207,0.019892902878583873 +Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8157900850650412,0.013547661219765379 +Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8625206786227912,0.005844699973375535 +Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.49625129009057833,0.211004712621783 +Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7482300147416783,0.0327435760119495 +Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9237060456412569,0.0010476652712265917 +Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8540419074377281,0.00694751386877189 +Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7827735900001105,0.021632253958226707 +Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7416615606437577,0.03518309274676423 +Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8208959354305796,0.01250307893717913 +Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9182336628416601,0.0012842298120423852 +Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9199026021249039,0.0012087423991030853 +Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7807842071724994,0.022196180227557687 +Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6016089012086534,0.11460809097860054 +Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.85978308688271,0.006187486327563118 +Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9471155608874564,0.00035525230596496123 +Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9238574615349179,0.0010415614421426264 +HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.780599537830846,0.022248986205867058 +HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.753379355065838,0.030905705190702806 +HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8379676352721162,0.009384640911630616 +HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8462209992405952,0.008075105621350536 +HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9020771423654268,0.0021784040615750178 +HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9392379026634557,0.000535591367028614 +HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7412355057774336,0.035345043191044964 +HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8195179387247324,0.01277979740900836 +HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9675915145186947,8.304238414993675e-05 +HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9356246311290696,0.0006351718939850358 +HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7466011946729814,0.03333852605723143 +HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9551682330569339,0.00021776057653192886 +HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.46353588273705637,0.24734250900688215 +HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8866352243352398,0.003339629955133934 +HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.937902652612242,0.0005710971446370687 +HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.2831911510498836,0.4967225093410736 +HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.2031844122583542,0.6293846722461313 +HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8894964926830444,0.0031000020401251533 +HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.549284007260608,0.15849945140105312 +HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7889373199563972,0.01994193933246426 +HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9415411104598773,0.00047780769988844555 +HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8981158348442198,0.0024460728519243077 +HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7526431927239958,0.0311644661156264 +HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8962925022649735,0.0025761063553240114 +HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.937590300147702,0.0005796196796032962 +HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5831241321997315,0.12921116102954364 +HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5561145441014004,0.1523217142123119 +HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.5664450708720614,0.14323389729888122 +HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.47517181530974595,0.23407895750101468 +HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.718855715365913,0.04449992445427745 +HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7168604276016974,0.04537877960385103 +HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.18264726732113173,0.6650765454064547 +HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.04614314940391431,0.9136043258512831 +HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.6369093478690498,0.08944819108801377 +HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8650362997962656,0.005540656777637369 +OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9481614738377944,0.00033485605767966255 +OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8579024362848122,0.006430262194723998 +OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9674751054383679,8.39330376548511e-05 +OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9711920829273566,5.848502027941985e-05 +OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9121630061872308,0.0015845787994022296 +OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7303458809128464,0.03963972108447683 +OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7466964409211542,0.03330355520543848 +OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8886798251454765,0.0031672235640011434 +OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9036719475219376,0.002076262347775526 +OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7623592248502944,0.02785522986224059 +OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8902509919824877,0.0030387234498153886 +OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8349964637145074,0.009887030967730168 +OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9513669166922365,0.00027717775621958416 +OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.979588964641596,2.0934517813580252e-05 +OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8898917220751776,0.0030678038612609354 +OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8000397965603336,0.01711033114623395 +OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7666453684194998,0.026467542617941944 +OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8751438663188438,0.004421691058140597 +OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8954496186826447,0.0026376993343606783 +OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8195357136433342,0.012776203631959988 +OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8973997559676354,0.0024966210305528294 +OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9238541898435834,0.0010416930833947954 +OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9232578806881373,0.0010658683179569461 +OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9466806411756816,0.00036396834317210526 +OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9402048459613361,0.0005108048313780666 +OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7428545649568395,0.03473202812850355 +OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8962239297969814,0.0025810820467571426 +OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9528032040825007,0.0002536158007562822 +OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8136140570811612,0.01400900062666989 +OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.5749045753814719,0.13602130778385005 +OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.780595487125304,0.022250145374352125 +OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8389921086523722,0.009215256295109017 +OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8803463320171083,0.003907570379771439 +OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7142670311425445,0.04653663665491792 +OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7977979460712193,0.017660348313797546 +OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7240026280446691,0.04228069432019545 +OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8051290094703403,0.01590190576987268 +OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9226246952938778,0.0010919364406592675 +OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.770582228125362,0.025231318204288148 +OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.5188109005585113,0.18769119165787862 +Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9642212364414142,0.00011145218096014672 +Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7836454491081474,0.021387948565361206 +Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,2,0.865235745718993,0.005516995432107779 +Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,3,0.819500116935474,0.012783401302719894 +Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7432637726714306,0.034578129186903464 +LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9522400671025366,0.0002626898916961467 +LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9732730607216835,4.677795327851405e-05 +LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9110081304703664,0.001646433879397326 +LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9433518650586681,0.0004353717167521428 +LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,4,0.880586328075459,0.003884834219553849 +Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9603201312455674,0.00015157780411521223 +Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9604114108423772,0.00015054459028416203 +Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9519258192529104,0.00026784516618954716 +Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9352773832366816,0.0006453340323628832 +Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9756845534259928,3.5288470321501036e-05 +Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6730282904268812,0.06736225845470355 +Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9361725603565639,0.0006193510978979659 +Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8618105831276622,0.005932414266978994 +Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9371490197710903,0.0005918014940797798 +Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8046621876144952,0.01601044603512172 +Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.41770329390345684,0.30313696659492734 +Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6529975286213465,0.07915856325659755 +Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6950517775314824,0.05566978580633573 +Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.5130382972054114,0.19351964488420637 +Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.6825577913683614,0.062140382561143265 +Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9059635004669196,0.0019350193188838174 +Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8702987510549938,0.00493787146977232 +Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8349295032906534,0.009898545248446817 +Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8245663895988613,0.011784555837564846 +Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9186996315597573,0.0012628532368153516 +Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.899783088468177,0.002330962388754791 +Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8724919719311256,0.004699674798249593 +Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9486250828884353,0.00032606741963897914 +Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9264530754805538,0.0009405124032405977 +Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.926933634016331,0.000922537739358256 +Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6984411569502376,0.05398723363884652 +Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.754828418128203,0.03040022622820331 +Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.5655988276473191,0.14396676855997925 +Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9407474980820671,0.000497230334167822 +Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.770589245932409,0.025229147116181697 +Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7775815292717585,0.023123063813025962 +Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5611200837416681,0.14787988852194642 +Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.34646366697352105,0.40049416986179387 +Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7868643731535557,0.020500867535993103 +Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8114670933196435,0.014473750045325934 +Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.4013581254554363,0.32436552572418753 +Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.28341806840646894,0.4963625961904983 +Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.3139211847524032,0.44892434309679713 +Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.2606167560977108,0.5330194398770082 +Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.32260154615753545,0.43577896021471924 +Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7827817854375669,0.021629949458519884 +Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9421767369217469,0.0004626159242720608 +Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.5386185630062554,0.16841388744478442 +Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7045551126623175,0.05103000019308416 +Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8414540075802577,0.00881618884168942 +Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8748256107732684,0.0044544778532186755 +Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8614522174161048,0.005976999431835443 +Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7878166990611953,0.02024289628983945 +Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8381151096374623,0.009360136935052572 +Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.876154278920616,0.0043186280005204514 +MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9802952193136,1.884578972104051e-05 +MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9841937367574427,9.755845662836177e-06 +MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8661864185981796,0.005405102460401999 +MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8297856426405835,0.010808669505560614 +MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9329487606730291,0.000716243089312378 +MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.822202489777381,0.01224422861798353 +MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6968865871905413,0.05475511707469452 +MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9318897100616549,0.0007501099193828288 +MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7939152572032528,0.018638835543465734 +MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7761614135775217,0.02354161442763604 +MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9079242687040253,0.0018192466167481706 +MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5935991848770941,0.12081484777974201 +MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.96841302674998,7.693398893847449e-05 +MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9131963004520903,0.001530535130781307 +MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7594573765014532,0.02881968270449265 +MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6622792441367216,0.07355344210000651 +MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5835165093102912,0.1288909419896904 +MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7271748558955601,0.04094703171178795 +MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7369082697183147,0.0370157216672518 +MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7219159720057066,0.04317213020613491 +MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8973595810319037,0.002499476856786579 +MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6540145328427245,0.07853263145320354 +MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9470816844896075,0.0003559262259996983 +MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.798793471524343,0.017414760604056785 +MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.766501585020503,0.026513385703318352 +MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6776894663079587,0.06477689572321889 +MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6576248245381009,0.07633405000799688 +MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.796342090311639,0.018023378799051942 +MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.689140856921657,0.058678219175095074 +MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.6705942614169457,0.06873614015066103 +MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6842754194067544,0.0612256583562849 +MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7338112096805872,0.03824046140795786 +MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8786344078919507,0.0040722405599500165 +MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8914863638509409,0.0029400900210167272 +MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8522000994286094,0.007203358614415384 +MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7479170810940026,0.03285737031031745 +MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5899049701184135,0.1237398240474465 +MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.864013241961245,0.005663050469813282 +MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.726560560314063,0.04120326937800088 +MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7600546147835674,0.02861953111724766 +MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8675817638279608,0.00524352512595729 +MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.4358953069712842,0.280322780055143 +MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8724977849323057,0.004699053502733089 +MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.871502377377448,0.004806214049293794 +MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.636462032322589,0.08974474991245225 +MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7407371067623334,0.035535069908202585 +MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.13754152986907456,0.7453436298315592 +MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8584434869588686,0.006359804257501524 +MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9096718109287911,0.0017199423212977748 +MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.429513562091493,0.2882272134157949 +MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7258395762861067,0.04150524782255408 +MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.4140057077993773,0.3078793667149351 +MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8860840192325219,0.003387122941063616 +MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8531999374729967,0.007063738601380546 +MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.570698753672453,0.13958138247636556 +AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9462124246513754,0.00037350751375720304 +AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,1,0.820982530302196,0.012485817170678851 +AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9284819872198913,0.0008661544234609058 +AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9226572389021586,0.0010905865909148318 +AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8996834645928126,0.0023377397968761906 +OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9806889787900566,1.77437080791335e-05 +OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9467481050448351,0.00036260722071780783 +OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9051882617143683,0.001982079878231783 +OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8448816290057799,0.008279149903754354 +OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9486969514405281,0.0003247187445212263 +OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7138885174194392,0.046707103452906885 +OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.40763933138747765,0.3161269846214854 +OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.5033557119680766,0.20350786972733814 +OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.4943676910774294,0.21301612937354739 +OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.3662549994154035,0.3722134961617391 +OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6943274080319848,0.05603338677616118 +OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.888202282224346,0.0032069637473251308 +OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.862959786938574,0.0057908774192851585 +OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.4422315456206938,0.2725814015162671 +OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9314197867245828,0.0007654668867563735 +OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8576726697477571,0.006460333718352682 +OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6477798867796105,0.08241558395766836 +OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7105249096891054,0.04823848031855015 +OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7433756448219943,0.034536127920169364 +OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.465629371128827,0.24492880327618063 +OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9815968610969954,1.5367458655827867e-05 +OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9215279351913577,0.0011380681078154023 +OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9028698976709195,0.0021272329705264844 +OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8115257987039834,0.014460915122317916 +OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8840656907304268,0.003564741739845647 +OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9288767434076772,0.0008521494712455959 +OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8762491857760322,0.004309027650395265 +OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.822174167720692,0.012249803466994006 +OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8388480886223416,0.009238949980481774 +OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9318866818637482,0.0007502082286076188 +OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6752208316271633,0.06613869004956173 +OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7677373687773497,0.026120973578910495 +OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7919204265038193,0.01915443839404165 +OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8238198607264919,0.01192852239680578 +OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8788769140000767,0.0040486473187813605 +OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5937971020205063,0.1206592532108973 +OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6743688104667733,0.0666125934693148 +OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6092910701405022,0.10882867605607495 +OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.02436876480189197,0.954326651607438 +OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7114255278499215,0.04782552820112736 +OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5887872724291499,0.12463254240428198 +OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,1,0.4029552549015283,0.32226121873409685 +OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,2,0.19589220319331574,0.6419903458052949 +OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,3,0.5147894627560958,0.1917415408232741 +OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,4,0.43696792691727815,0.2790047957490856 +LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9683600812057522,7.731839943750683e-05 +LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9490060035318915,0.00031896092810029624 +LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9033732116949054,0.0020951534061901173 +LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9728319200142996,4.911626350007423e-05 +LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9174158952141087,0.0013223130420052574 +LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8698029729880158,0.00499276771087744 +LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9736499620869766,4.483954353741208e-05 +LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8938963574061565,0.002753683842916408 +LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9427230009399408,0.00044981624708065733 +LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9288091831587435,0.0008545357544848401 +LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9876650170257133,4.648675321533348e-06 +LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9200698352872445,0.0012013420941124318 +LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8197843971795349,0.012725991028944833 +LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9667731014329254,8.943826166773405e-05 +LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9135236868955329,0.0015136659995374103 +LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.983826044072315,1.0449743172360012e-05 +LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9482689395026054,0.000332805134027447 +LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9334433471484072,0.0007007762613840839 +LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8998371432675459,0.0023272903802322954 +LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9131450099069247,0.0015331889972515346 +LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9445409047411082,0.00040889964932544416 +LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8996453255999854,0.00234033776853281 +LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8662449830102448,0.005398257529969565 +LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9506955154682739,0.00028866872380162265 +LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9121357775980045,0.0015860194531010332 +LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9469225816315634,0.000359102582060145 +LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.790872393374341,0.019428850798750914 +LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7384692720332464,0.03640761031575469 +LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9396936265489109,0.0005238133760109684 +LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7853349194194776,0.020919442242219075 +LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8636070293544758,0.005712124057773506 +LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.837126038633602,0.009525258316342535 +LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7663953319208139,0.026547294337781743 +LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8834569465544357,0.00361946726545403 +LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8480938359553485,0.00779520658099071 +WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9882164477730901,4.05436289119973e-06 +WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9749878899040407,3.838912250625781e-05 +WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9189017807616305,0.0012536521795481071 +WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.976785228034165,3.073554131266073e-05 +WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9683736529744773,7.721974100004276e-05 +WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8793267175321069,0.004005119722136405 +WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8760721346635911,0.004326948446281908 +WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9315137258308156,0.0007623806815109492 +WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9671655908223616,8.633181797191984e-05 +WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7675767218262903,0.026171781192995118 +WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8483878251754778,0.007751839541749867 +WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9222607240796445,0.0011071076795417618 +WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9440994017259922,0.00041860181264251746 +WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9640433681068886,0.00011310737614553013 +WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.692434840005101,0.056990052908859494 +WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9942767822652612,4.6665376445687894e-07 +WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9682235346488557,7.831565067564543e-05 +WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9370054660599566,0.0005958002530390111 +WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.969420946106877,6.985512173523951e-05 +WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9834828472581691,1.1126279772397877e-05 +WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9928216304628095,9.197638948465057e-07 +WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9611899818187688,0.00014192004448559492 +WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9411758308443503,0.0004866843681750784 +WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9688368521395198,7.390226580769654e-05 +WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9723616916410369,5.16925798887181e-05 +WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9766036636486001,3.14580315476573e-05 +WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9597878054141521,0.00015769662952759886 +WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9404428288332258,0.0005048221249291256 +WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9831715348590928,1.176456701375346e-05 +WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.97187689823272,5.4440740892278444e-05 +WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9852421877364517,7.946695487913594e-06 +WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9692179758222269,7.124441373542135e-05 +WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9412248237761267,0.000485487558057933 +WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.98025276424875,1.8967257174977277e-05 +WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9846373995357367,8.960181355366343e-06 +Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9522400671025366,0.0002626898916961467 +Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9732730607216835,4.677795327851405e-05 +Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9110081304703664,0.001646433879397326 +Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9433518650586681,0.0004353717167521428 +Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,4,0.880586328075459,0.003884834219553849 +HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.9355663499255871,0.0006368701046576545 +HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.9499604642147754,0.0003016036750416735 +HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.7164442699126142,0.04556339297891151 +HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.5643812833359342,0.14502482192576685 +HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.4448334653124403,0.269433453257965 +HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.9020957808919513,0.002177191904645508 +HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.9140262325400854,0.0014880077902407654 +HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.6613543728531551,0.07410115498793113 +HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.4797794956768499,0.2289297958345603 +HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.49503702005526434,0.21230024172428238 +HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.8658004484348707,0.005450353400185282 +HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.9239450258900821,0.0010380421984977164 +HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.6878185417270377,0.05936418242167244 +HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.6427492187377651,0.08562857067256696 +HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.19987101474191585,0.6351028985023905 +HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.7695981699173929,0.025536900476404875 +HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.851160886507116,0.00735033097799936 +HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.7535063061583401,0.030861215825263487 +HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.26946310602236634,0.5186811891252074 +HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.5071239778851739,0.19958915881626008 +HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.845558834843199,0.00817557674320208 +HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.8223598748455347,0.01221327849153134 +HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.7520379034546343,0.03137821860478068 +HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.5986152394502113,0.1169062576526029 +HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.766509325140422,0.026510916638992615 +HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.6388656044215879,0.08815791552969902 +HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.8220592376168137,0.012272442496278822 +HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.1610992186087647,0.7031245257171708 +HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.22938177579714764,0.584757473087143 +HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.16217150942988084,0.7012176634258844 +HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.8536693780854105,0.0069987855857581984 +HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.9079591032101378,0.0018172316533511903 +HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.7448797028215589,0.033974472983626124 +HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.436470242791583,0.2796159471960331 +HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.5113717481429286,0.195219904727713 +BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,0,0.8848684214582546,0.0034933971141531536 +BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,1,0.9247518427204778,0.0010059807632682822 +BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,2,0.7024798803756629,0.05202256738347333 +BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,3,0.6111548412929141,0.10745210550108082 +BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,4,0.8864983521119945,0.0033513827582610342 +BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8443252756395498,0.008364861793357709 +BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8199557285303699,0.012691469447090417 +BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6898121736766818,0.05833178396126367 +BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.1445400076243653,0.732738456710739 +BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.13444519427677581,0.7509364951619687 +BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9461712339012929,0.00037435448514068834 +BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8543556725359636,0.006904516600543572 +BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7671160990392422,0.026317800283773948 +BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.4230508906614041,0.29634091151848907 +BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,0.29492042180464345,0.478252042515081 +BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8192056092552416,0.01284304904344425 +BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8053230426409881,0.015856927546595193 +BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6785867773117831,0.06428605698561919 +BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.021028776761034942,0.960582665935811 +BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.25337930013147175,0.5448562000018814 +BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8101772449555595,0.014757563523095152 +BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7844308170919763,0.021169355122089707 +BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6407686957715764,0.08691312009391092 +BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.042093006210129874,0.9211687904012325 +BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.2813292229519864,0.4996795026573654 +BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8350456630970934,0.00987857623206292 +BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.879311548672376,0.004006582681021272 +BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6951300585252861,0.0556305769370549 +BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.30955291195703166,0.4556002793087552 +BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,0.09897629382276267,0.8156278898050575 +BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8313126956210078,0.010533178480029779 +BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8169388413464165,0.01330802664448977 +BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8065284450649773,0.015579295379409611 +BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.23722382427262312,0.5716108619128892 +BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,0.026088426326565897,0.9511063910298649 +BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5558829816104426,0.15252894598370506 +BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6390946692796851,0.08800754271923365 +BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.24121345447897227,0.5649619826999719 +BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,-0.13262144042688304,0.7542351704927408 +BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.46784288126219703,0.24238975539995447 +BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7467577882406231,0.03328104267130768 +BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7611545287510072,0.028253164658278467 +BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6541774611460981,0.07843262445172178 +BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.0830822493170678,0.8449361587214159 +BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.1985934514676979,0.6373119372341151 +BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9103256104990007,0.001683717098370581 +BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8079204807250888,0.015262498588799642 +BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7253154362419392,0.0417256201301186 +BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.2776474358858506,0.5055464711128136 +BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.04029159995291984,0.9245349726533298 +BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.919432996814919,0.0012296819224052442 +BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.87005129824662,0.004965222567299112 +BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9073703100625691,0.001851485138509531 +BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8673887162219034,0.005265692212272121 +BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8916723527123611,0.0029254223429427636 +LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9804801069360884,1.832282630082123e-05 +LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.980051779203359,1.9549343460335766e-05 +LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9720131442366731,5.3658869462094946e-05 +LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.9165887813382055,0.001361572704071016 +LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.9225103255266087,0.0010966889416837342 +LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9292369266176062,0.000839501038985727 +LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9505492134066896,0.00029121355501060477 +LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9415690777822339,0.00047713248045663163 +LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.9576750897378552,0.00018358576102437457 +LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8850761460392197,0.0034750864462593195 +LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9798647388383346,2.0101576768271062e-05 +LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9781250835045174,2.5741076148769547e-05 +LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9598475365356987,0.00015700207944980397 +LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.9317002702003969,0.000756276259880365 +LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8240635545541923,0.011881405061211926 +LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9849433970479835,8.437305784682183e-06 +LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9899107226768695,2.548168158279175e-06 +LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9645217100316719,0.00010869253777108847 +LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.9447465624679983,0.00040443116308794275 +LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8760879368136391,0.0043253470355424355 +LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9469408250476264,0.0003587374254477132 +LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9498225876442147,0.000304071618749767 +LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9413785598975157,0.0004817446027243596 +LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.8197292667265523,0.012737111858293043 +LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.9057861973602506,0.0019457176947306907 +LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9413025091864188,0.000483593804288479 +LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9083254977326705,0.001796125778484392 +LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.8626635526406192,0.005827152548807454 +LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.8043418970652331,0.016085184583393794 +LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8946872852632068,0.0026942203148939193 +LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9025950086780581,0.002144887259438991 +LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.7564264003460613,0.02984872863501939 +LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9033527343998258,0.002096452391428316 +LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.8494277893147777,0.0075996673267298715 +LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8534145445088147,0.007033997470343221 +aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,0,-0.017485869096098686,0.9672206778351959 +aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,1,-0.06826285140114943,0.8724042132624071 +aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,2,-0.27291992568490936,0.5131179718629255 +aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,3,-0.0623085741331382,0.8834734515868299 +aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,4,0.11553071904436202,0.7852997192967395 +aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,0,0.8743737489954189,0.004501296794893102 +aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,1,0.8019858294586086,0.01664169341252048 +aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,2,0.865218326418788,0.005519059390504801 +aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,3,0.9324959770534272,0.0007305971150650418 +aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,4,0.9578331579912773,0.00018155839890573593 +aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,0,-0.30992157835736617,0.4550353006304514 +aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,1,-0.48460771469003827,0.2235972811859595 +aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,2,-0.1162588388208577,0.78397092283469 +aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,3,-0.03180360013624742,0.9404084479868535 +aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,4,-0.6310234888301745,0.09339585968843296 +aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,0,0.5719061307929368,0.1385541569597628 +aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,1,-0.2953447949582872,0.47758892197811004 +aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,2,-0.08547114468780825,0.8405203853999355 +aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,3,-0.02680948636066538,0.9497562944796989 +aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,4,-0.4016145018471783,0.32402730112296474 +aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,0,0.7247956777996108,0.04194484960329344 +aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,1,0.2767660595168839,0.5069548295866992 +aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,2,0.3337223270100439,0.4191769676693079 +aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,3,0.6126891094585267,0.10632638977302632 +aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,4,0.8079257463851817,0.015261307993340337 +aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,0,0.6647150497002838,0.07212235537894374 +aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,1,0.9659235574949907,9.641323857066814e-05 +aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,2,0.8593434484023453,0.0062437049978399314 +aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,3,0.7704800482268904,0.025262942539415363 +aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,4,0.9028773381740962,0.002126756432137772 +aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,0,0.748982925973149,0.032470780295939985 +aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,1,0.8599957450436625,0.006160409391629476 +aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,2,0.8718735582848011,0.004766072993988772 +aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,3,0.9069576656171551,0.001875739334441522 +aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,4,0.9502933219669614,0.00029570003340264575 +aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,0,0.8905328662549648,0.003016032865892646 +aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,1,0.5058552901713423,0.20090402274559316 +aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,2,0.6767432630833718,0.0652968761285632 +aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,3,0.7135518769682414,0.04685902831102101 +aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,4,0.873661116609048,0.004575776138454243 +aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,0,0.8775217778627072,0.004181622363896538 +aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,1,0.7683490298001087,0.025928082489068475 +aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,2,0.741463148953373,0.035258455741147623 +aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,3,0.7891209052525207,0.019892902878583873 +aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,4,0.8157900850650412,0.013547661219765379 +aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,0,0.8625206786227912,0.005844699973375535 +aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,1,0.49625129009057833,0.211004712621783 +aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,2,0.7482300147416783,0.0327435760119495 +aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,3,0.9237060456412569,0.0010476652712265917 +aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,4,0.8540419074377281,0.00694751386877189 +aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,0,0.7827735900001105,0.021632253958226707 +aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,1,0.7416615606437577,0.03518309274676423 +aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,2,0.8208959354305796,0.01250307893717913 +aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,3,0.9182336628416601,0.0012842298120423852 +aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,4,0.9199026021249039,0.0012087423991030853 +aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,0,0.7807842071724994,0.022196180227557687 +aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,1,0.6016089012086534,0.11460809097860054 +aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,2,0.85978308688271,0.006187486327563118 +aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,3,0.9471155608874564,0.00035525230596496123 +aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,4,0.9238574615349179,0.0010415614421426264 +aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.780599537830846,0.022248986205867058 +aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.753379355065838,0.030905705190702806 +aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.8379676352721162,0.009384640911630616 +aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.8462209992405952,0.008075105621350536 +aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.9020771423654268,0.0021784040615750178 +aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.9392379026634557,0.000535591367028614 +aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.7412355057774336,0.035345043191044964 +aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.8195179387247324,0.01277979740900836 +aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.9675915145186947,8.304238414993675e-05 +aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.9356246311290696,0.0006351718939850358 +aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.7466011946729814,0.03333852605723143 +aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.9551682330569339,0.00021776057653192886 +aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.46353588273705637,0.24734250900688215 +aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.8866352243352398,0.003339629955133934 +aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.937902652612242,0.0005710971446370687 +aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.2831911510498836,0.4967225093410736 +aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.2031844122583542,0.6293846722461313 +aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.8894964926830444,0.0031000020401251533 +aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.549284007260608,0.15849945140105312 +aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.7889373199563972,0.01994193933246426 +aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.9415411104598773,0.00047780769988844555 +aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.8981158348442198,0.0024460728519243077 +aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.7526431927239958,0.0311644661156264 +aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.8962925022649735,0.0025761063553240114 +aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.937590300147702,0.0005796196796032962 +aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.5831241321997315,0.12921116102954364 +aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.5561145441014004,0.1523217142123119 +aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.5664450708720614,0.14323389729888122 +aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.47517181530974595,0.23407895750101468 +aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.718855715365913,0.04449992445427745 +aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.7168604276016974,0.04537877960385103 +aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.18264726732113173,0.6650765454064547 +aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.04614314940391431,0.9136043258512831 +aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.6369093478690498,0.08944819108801377 +aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.8650362997962656,0.005540656777637369 +aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,0,0.9481614738377944,0.00033485605767966255 +aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,1,0.8579024362848122,0.006430262194723998 +aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,2,0.9674751054383679,8.39330376548511e-05 +aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,3,0.9711920829273566,5.848502027941985e-05 +aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,4,0.9121630061872308,0.0015845787994022296 +aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,0,0.7303458809128464,0.03963972108447683 +aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,1,0.7466964409211542,0.03330355520543848 +aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,2,0.8886798251454765,0.0031672235640011434 +aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,3,0.9036719475219376,0.002076262347775526 +aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,4,0.7623592248502944,0.02785522986224059 +aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,0,0.8902509919824877,0.0030387234498153886 +aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,1,0.8349964637145074,0.009887030967730168 +aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,2,0.9513669166922365,0.00027717775621958416 +aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,3,0.979588964641596,2.0934517813580252e-05 +aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,4,0.8898917220751776,0.0030678038612609354 +aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,0,0.8000397965603336,0.01711033114623395 +aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,1,0.7666453684194998,0.026467542617941944 +aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,2,0.8751438663188438,0.004421691058140597 +aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,3,0.8954496186826447,0.0026376993343606783 +aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,4,0.8195357136433342,0.012776203631959988 +aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,0,0.8973997559676354,0.0024966210305528294 +aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,1,0.9238541898435834,0.0010416930833947954 +aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,2,0.9232578806881373,0.0010658683179569461 +aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,3,0.9466806411756816,0.00036396834317210526 +aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,4,0.9402048459613361,0.0005108048313780666 +aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,0,0.7428545649568395,0.03473202812850355 +aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,1,0.8962239297969814,0.0025810820467571426 +aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,2,0.9528032040825007,0.0002536158007562822 +aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,3,0.8136140570811612,0.01400900062666989 +aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,4,0.5749045753814719,0.13602130778385005 +aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,0,0.780595487125304,0.022250145374352125 +aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,1,0.8389921086523722,0.009215256295109017 +aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,2,0.8803463320171083,0.003907570379771439 +aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,3,0.7142670311425445,0.04653663665491792 +aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,4,0.7977979460712193,0.017660348313797546 +aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,0,0.7240026280446691,0.04228069432019545 +aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,1,0.8051290094703403,0.01590190576987268 +aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,2,0.9226246952938778,0.0010919364406592675 +aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,3,0.770582228125362,0.025231318204288148 +aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,4,0.5188109005585113,0.18769119165787862 +aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,0,0.9642212364414142,0.00011145218096014672 +aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,1,0.7836454491081474,0.021387948565361206 +aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,2,0.865235745718993,0.005516995432107779 +aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,3,0.819500116935474,0.012783401302719894 +aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,4,0.7432637726714306,0.034578129186903464 +aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,0,0.9522400671025366,0.0002626898916961467 +aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,1,0.9732730607216835,4.677795327851405e-05 +aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,2,0.9110081304703664,0.001646433879397326 +aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,3,0.9433518650586681,0.0004353717167521428 +aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,4,0.880586328075459,0.003884834219553849 +aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,0,0.9603201312455674,0.00015157780411521223 +aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,1,0.9604114108423772,0.00015054459028416203 +aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,2,0.9519258192529104,0.00026784516618954716 +aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,3,0.9352773832366816,0.0006453340323628832 +aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,4,0.9756845534259928,3.5288470321501036e-05 +aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,0,0.6730282904268812,0.06736225845470355 +aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,1,0.9361725603565639,0.0006193510978979659 +aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,2,0.8618105831276622,0.005932414266978994 +aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,3,0.9371490197710903,0.0005918014940797798 +aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,4,0.8046621876144952,0.01601044603512172 +aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,0,0.41770329390345684,0.30313696659492734 +aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,1,0.6529975286213465,0.07915856325659755 +aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,2,0.6950517775314824,0.05566978580633573 +aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,3,0.5130382972054114,0.19351964488420637 +aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,4,0.6825577913683614,0.062140382561143265 +aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,0,0.9059635004669196,0.0019350193188838174 +aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,1,0.8702987510549938,0.00493787146977232 +aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,2,0.8349295032906534,0.009898545248446817 +aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,3,0.8245663895988613,0.011784555837564846 +aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,4,0.9186996315597573,0.0012628532368153516 +aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,0,0.899783088468177,0.002330962388754791 +aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,1,0.8724919719311256,0.004699674798249593 +aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,2,0.9486250828884353,0.00032606741963897914 +aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,3,0.9264530754805538,0.0009405124032405977 +aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,4,0.926933634016331,0.000922537739358256 +aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,0,0.6984411569502376,0.05398723363884652 +aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,1,0.754828418128203,0.03040022622820331 +aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,2,0.5655988276473191,0.14396676855997925 +aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,3,0.9407474980820671,0.000497230334167822 +aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,4,0.770589245932409,0.025229147116181697 +aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,0,0.7775815292717585,0.023123063813025962 +aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,1,0.5611200837416681,0.14787988852194642 +aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,2,0.34646366697352105,0.40049416986179387 +aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,3,0.7868643731535557,0.020500867535993103 +aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,4,0.8114670933196435,0.014473750045325934 +aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,0,0.4013581254554363,0.32436552572418753 +aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,1,0.28341806840646894,0.4963625961904983 +aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,2,0.3139211847524032,0.44892434309679713 +aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,3,0.2606167560977108,0.5330194398770082 +aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,4,0.32260154615753545,0.43577896021471924 +aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,0,0.7827817854375669,0.021629949458519884 +aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,1,0.9421767369217469,0.0004626159242720608 +aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,2,0.5386185630062554,0.16841388744478442 +aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,3,0.7045551126623175,0.05103000019308416 +aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,4,0.8414540075802577,0.00881618884168942 +aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,0,0.8748256107732684,0.0044544778532186755 +aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,1,0.8614522174161048,0.005976999431835443 +aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,2,0.7878166990611953,0.02024289628983945 +aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,3,0.8381151096374623,0.009360136935052572 +aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,4,0.876154278920616,0.0043186280005204514 +aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,0,0.9802952193136,1.884578972104051e-05 +aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,1,0.9841937367574427,9.755845662836177e-06 +aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,2,0.8661864185981796,0.005405102460401999 +aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,3,0.8297856426405835,0.010808669505560614 +aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,4,0.9329487606730291,0.000716243089312378 +aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,0,0.822202489777381,0.01224422861798353 +aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,1,0.6968865871905413,0.05475511707469452 +aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,2,0.9318897100616549,0.0007501099193828288 +aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,3,0.7939152572032528,0.018638835543465734 +aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,4,0.7761614135775217,0.02354161442763604 +aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,0,0.9079242687040253,0.0018192466167481706 +aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,1,0.5935991848770941,0.12081484777974201 +aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,2,0.96841302674998,7.693398893847449e-05 +aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,3,0.9131963004520903,0.001530535130781307 +aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,4,0.7594573765014532,0.02881968270449265 +aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,0,0.6622792441367216,0.07355344210000651 +aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,1,0.5835165093102912,0.1288909419896904 +aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,2,0.7271748558955601,0.04094703171178795 +aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,3,0.7369082697183147,0.0370157216672518 +aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,4,0.7219159720057066,0.04317213020613491 +aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,0,0.8973595810319037,0.002499476856786579 +aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,1,0.6540145328427245,0.07853263145320354 +aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,2,0.9470816844896075,0.0003559262259996983 +aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,3,0.798793471524343,0.017414760604056785 +aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,4,0.766501585020503,0.026513385703318352 +aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,0,0.6776894663079587,0.06477689572321889 +aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,1,0.6576248245381009,0.07633405000799688 +aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,2,0.796342090311639,0.018023378799051942 +aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,3,0.689140856921657,0.058678219175095074 +aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,4,0.6705942614169457,0.06873614015066103 +aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,0,0.6842754194067544,0.0612256583562849 +aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,1,0.7338112096805872,0.03824046140795786 +aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,2,0.8786344078919507,0.0040722405599500165 +aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,3,0.8914863638509409,0.0029400900210167272 +aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,4,0.8522000994286094,0.007203358614415384 +aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,0,0.7479170810940026,0.03285737031031745 +aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,1,0.5899049701184135,0.1237398240474465 +aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,2,0.864013241961245,0.005663050469813282 +aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,3,0.726560560314063,0.04120326937800088 +aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,4,0.7600546147835674,0.02861953111724766 +aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,0,0.8675817638279608,0.00524352512595729 +aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,1,0.4358953069712842,0.280322780055143 +aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,2,0.8724977849323057,0.004699053502733089 +aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,3,0.871502377377448,0.004806214049293794 +aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,4,0.636462032322589,0.08974474991245225 +aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,0,0.7407371067623334,0.035535069908202585 +aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,1,0.13754152986907456,0.7453436298315592 +aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,2,0.8584434869588686,0.006359804257501524 +aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,3,0.9096718109287911,0.0017199423212977748 +aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,4,0.429513562091493,0.2882272134157949 +aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,0,0.7258395762861067,0.04150524782255408 +aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,1,0.4140057077993773,0.3078793667149351 +aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,2,0.8860840192325219,0.003387122941063616 +aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,3,0.8531999374729967,0.007063738601380546 +aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,4,0.570698753672453,0.13958138247636556 +aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,0,0.9462124246513754,0.00037350751375720304 +aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,1,0.820982530302196,0.012485817170678851 +aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,2,0.9284819872198913,0.0008661544234609058 +aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,3,0.9226572389021586,0.0010905865909148318 +aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,4,0.8996834645928126,0.0023377397968761906 +aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,0,0.9806889787900566,1.77437080791335e-05 +aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,1,0.9467481050448351,0.00036260722071780783 +aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,2,0.9051882617143683,0.001982079878231783 +aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,3,0.8448816290057799,0.008279149903754354 +aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,4,0.9486969514405281,0.0003247187445212263 +aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,0,0.7138885174194392,0.046707103452906885 +aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,1,0.40763933138747765,0.3161269846214854 +aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,2,0.5033557119680766,0.20350786972733814 +aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,3,0.4943676910774294,0.21301612937354739 +aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,4,0.3662549994154035,0.3722134961617391 +aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,0,0.6943274080319848,0.05603338677616118 +aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,1,0.888202282224346,0.0032069637473251308 +aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,2,0.862959786938574,0.0057908774192851585 +aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,3,0.4422315456206938,0.2725814015162671 +aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,4,0.9314197867245828,0.0007654668867563735 +aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,0,0.8576726697477571,0.006460333718352682 +aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,1,0.6477798867796105,0.08241558395766836 +aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,2,0.7105249096891054,0.04823848031855015 +aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,3,0.7433756448219943,0.034536127920169364 +aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,4,0.465629371128827,0.24492880327618063 +aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,0,0.9815968610969954,1.5367458655827867e-05 +aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,1,0.9215279351913577,0.0011380681078154023 +aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,2,0.9028698976709195,0.0021272329705264844 +aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,3,0.8115257987039834,0.014460915122317916 +aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,4,0.8840656907304268,0.003564741739845647 +aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,0,0.9288767434076772,0.0008521494712455959 +aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,1,0.8762491857760322,0.004309027650395265 +aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,2,0.822174167720692,0.012249803466994006 +aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,3,0.8388480886223416,0.009238949980481774 +aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,4,0.9318866818637482,0.0007502082286076188 +aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,0,0.6752208316271633,0.06613869004956173 +aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,1,0.7677373687773497,0.026120973578910495 +aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,2,0.7919204265038193,0.01915443839404165 +aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,3,0.8238198607264919,0.01192852239680578 +aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,4,0.8788769140000767,0.0040486473187813605 +aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,0,0.5937971020205063,0.1206592532108973 +aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,1,0.6743688104667733,0.0666125934693148 +aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,2,0.6092910701405022,0.10882867605607495 +aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,3,0.02436876480189197,0.954326651607438 +aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,4,0.7114255278499215,0.04782552820112736 +aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,0,0.5887872724291499,0.12463254240428198 +aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,1,0.4029552549015283,0.32226121873409685 +aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,2,0.19589220319331574,0.6419903458052949 +aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,3,0.5147894627560958,0.1917415408232741 +aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,4,0.43696792691727815,0.2790047957490856 +aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,0,0.9683600812057522,7.731839943750683e-05 +aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,1,0.9490060035318915,0.00031896092810029624 +aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,2,0.9033732116949054,0.0020951534061901173 +aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,3,0.9728319200142996,4.911626350007423e-05 +aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,4,0.9174158952141087,0.0013223130420052574 +aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,0,0.8698029729880158,0.00499276771087744 +aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,1,0.9736499620869766,4.483954353741208e-05 +aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,2,0.8938963574061565,0.002753683842916408 +aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,3,0.9427230009399408,0.00044981624708065733 +aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,4,0.9288091831587435,0.0008545357544848401 +aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,0,0.9876650170257133,4.648675321533348e-06 +aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,1,0.9200698352872445,0.0012013420941124318 +aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,2,0.8197843971795349,0.012725991028944833 +aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,3,0.9667731014329254,8.943826166773405e-05 +aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,4,0.9135236868955329,0.0015136659995374103 +aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,0,0.983826044072315,1.0449743172360012e-05 +aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,1,0.9482689395026054,0.000332805134027447 +aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,2,0.9334433471484072,0.0007007762613840839 +aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,3,0.8998371432675459,0.0023272903802322954 +aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,4,0.9131450099069247,0.0015331889972515346 +aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,0,0.9445409047411082,0.00040889964932544416 +aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,1,0.8996453255999854,0.00234033776853281 +aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,2,0.8662449830102448,0.005398257529969565 +aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,3,0.9506955154682739,0.00028866872380162265 +aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,4,0.9121357775980045,0.0015860194531010332 +aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,0,0.9469225816315634,0.000359102582060145 +aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,1,0.790872393374341,0.019428850798750914 +aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,2,0.7384692720332464,0.03640761031575469 +aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,3,0.9396936265489109,0.0005238133760109684 +aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,4,0.7853349194194776,0.020919442242219075 +aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,0,0.8636070293544758,0.005712124057773506 +aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,1,0.837126038633602,0.009525258316342535 +aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,2,0.7663953319208139,0.026547294337781743 +aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,3,0.8834569465544357,0.00361946726545403 +aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,4,0.8480938359553485,0.00779520658099071 +aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,0,0.9882164477730901,4.05436289119973e-06 +aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,1,0.9749878899040407,3.838912250625781e-05 +aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,2,0.9189017807616305,0.0012536521795481071 +aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,3,0.976785228034165,3.073554131266073e-05 +aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,4,0.9683736529744773,7.721974100004276e-05 +aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,0,0.8793267175321069,0.004005119722136405 +aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,1,0.8760721346635911,0.004326948446281908 +aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,2,0.9315137258308156,0.0007623806815109492 +aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,3,0.9671655908223616,8.633181797191984e-05 +aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,4,0.7675767218262903,0.026171781192995118 +aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,0,0.8483878251754778,0.007751839541749867 +aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,1,0.9222607240796445,0.0011071076795417618 +aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,2,0.9440994017259922,0.00041860181264251746 +aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,3,0.9640433681068886,0.00011310737614553013 +aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,4,0.692434840005101,0.056990052908859494 +aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,0,0.9942767822652612,4.6665376445687894e-07 +aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,1,0.9682235346488557,7.831565067564543e-05 +aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,2,0.9370054660599566,0.0005958002530390111 +aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,3,0.969420946106877,6.985512173523951e-05 +aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,4,0.9834828472581691,1.1126279772397877e-05 +aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,0,0.9928216304628095,9.197638948465057e-07 +aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,1,0.9611899818187688,0.00014192004448559492 +aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,2,0.9411758308443503,0.0004866843681750784 +aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,3,0.9688368521395198,7.390226580769654e-05 +aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,4,0.9723616916410369,5.16925798887181e-05 +aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,0,0.9766036636486001,3.14580315476573e-05 +aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,1,0.9597878054141521,0.00015769662952759886 +aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,2,0.9404428288332258,0.0005048221249291256 +aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,3,0.9831715348590928,1.176456701375346e-05 +aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,4,0.97187689823272,5.4440740892278444e-05 +aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,0,0.9852421877364517,7.946695487913594e-06 +aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,1,0.9692179758222269,7.124441373542135e-05 +aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,2,0.9412248237761267,0.000485487558057933 +aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,3,0.98025276424875,1.8967257174977277e-05 +aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,4,0.9846373995357367,8.960181355366343e-06 +aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,0,0.9522400671025366,0.0002626898916961467 +aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,1,0.9732730607216835,4.677795327851405e-05 +aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,2,0.9110081304703664,0.001646433879397326 +aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,3,0.9433518650586681,0.0004353717167521428 +aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,4,0.880586328075459,0.003884834219553849 +aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.9355663499255871,0.0006368701046576545 +aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.9499604642147754,0.0003016036750416735 +aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.7164442699126142,0.04556339297891151 +aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.5643812833359342,0.14502482192576685 +aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.4448334653124403,0.269433453257965 +aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.9020957808919513,0.002177191904645508 +aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.9140262325400854,0.0014880077902407654 +aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.6613543728531551,0.07410115498793113 +aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.4797794956768499,0.2289297958345603 +aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.49503702005526434,0.21230024172428238 +aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.8658004484348707,0.005450353400185282 +aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.9239450258900821,0.0010380421984977164 +aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.6878185417270377,0.05936418242167244 +aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.6427492187377651,0.08562857067256696 +aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.19987101474191585,0.6351028985023905 +aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.7695981699173929,0.025536900476404875 +aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.851160886507116,0.00735033097799936 +aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.7535063061583401,0.030861215825263487 +aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.26946310602236634,0.5186811891252074 +aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.5071239778851739,0.19958915881626008 +aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.845558834843199,0.00817557674320208 +aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.8223598748455347,0.01221327849153134 +aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.7520379034546343,0.03137821860478068 +aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.5986152394502113,0.1169062576526029 +aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.766509325140422,0.026510916638992615 +aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.6388656044215879,0.08815791552969902 +aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.8220592376168137,0.012272442496278822 +aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.1610992186087647,0.7031245257171708 +aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.22938177579714764,0.584757473087143 +aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.16217150942988084,0.7012176634258844 +aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.8536693780854105,0.0069987855857581984 +aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.9079591032101378,0.0018172316533511903 +aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.7448797028215589,0.033974472983626124 +aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.436470242791583,0.2796159471960331 +aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.5113717481429286,0.195219904727713 +aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,0,0.8848684214582546,0.0034933971141531536 +aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,1,0.9247518427204778,0.0010059807632682822 +aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,2,0.7024798803756629,0.05202256738347333 +aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,3,0.6111548412929141,0.10745210550108082 +aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,4,0.8864983521119945,0.0033513827582610342 +aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,0,0.8443252756395498,0.008364861793357709 +aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,1,0.8199557285303699,0.012691469447090417 +aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,2,0.6898121736766818,0.05833178396126367 +aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,3,0.1445400076243653,0.732738456710739 +aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,4,-0.13444519427677581,0.7509364951619687 +aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,0,0.9461712339012929,0.00037435448514068834 +aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,1,0.8543556725359636,0.006904516600543572 +aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,2,0.7671160990392422,0.026317800283773948 +aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,3,0.4230508906614041,0.29634091151848907 +aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,4,0.29492042180464345,0.478252042515081 +aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,0,0.8192056092552416,0.01284304904344425 +aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,1,0.8053230426409881,0.015856927546595193 +aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,2,0.6785867773117831,0.06428605698561919 +aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,3,0.021028776761034942,0.960582665935811 +aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,4,-0.25337930013147175,0.5448562000018814 +aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,0,0.8101772449555595,0.014757563523095152 +aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,1,0.7844308170919763,0.021169355122089707 +aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,2,0.6407686957715764,0.08691312009391092 +aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,3,0.042093006210129874,0.9211687904012325 +aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,4,-0.2813292229519864,0.4996795026573654 +aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,0,0.8350456630970934,0.00987857623206292 +aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,1,0.879311548672376,0.004006582681021272 +aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,2,0.6951300585252861,0.0556305769370549 +aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,3,0.30955291195703166,0.4556002793087552 +aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,4,0.09897629382276267,0.8156278898050575 +aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,0,0.8313126956210078,0.010533178480029779 +aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,1,0.8169388413464165,0.01330802664448977 +aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,2,0.8065284450649773,0.015579295379409611 +aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,3,0.23722382427262312,0.5716108619128892 +aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,4,0.026088426326565897,0.9511063910298649 +aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,0,0.5558829816104426,0.15252894598370506 +aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,1,0.6390946692796851,0.08800754271923365 +aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,2,0.24121345447897227,0.5649619826999719 +aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,3,-0.13262144042688304,0.7542351704927408 +aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,4,-0.46784288126219703,0.24238975539995447 +aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,0,0.7467577882406231,0.03328104267130768 +aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,1,0.7611545287510072,0.028253164658278467 +aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,2,0.6541774611460981,0.07843262445172178 +aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,3,0.0830822493170678,0.8449361587214159 +aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,4,-0.1985934514676979,0.6373119372341151 +aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,0,0.9103256104990007,0.001683717098370581 +aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,1,0.8079204807250888,0.015262498588799642 +aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,2,0.7253154362419392,0.0417256201301186 +aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,3,0.2776474358858506,0.5055464711128136 +aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,4,-0.04029159995291984,0.9245349726533298 +aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,0,0.919432996814919,0.0012296819224052442 +aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,1,0.87005129824662,0.004965222567299112 +aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,2,0.9073703100625691,0.001851485138509531 +aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,3,0.8673887162219034,0.005265692212272121 +aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,4,0.8916723527123611,0.0029254223429427636 +aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,0,0.9804801069360884,1.832282630082123e-05 +aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,1,0.980051779203359,1.9549343460335766e-05 +aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,2,0.9720131442366731,5.3658869462094946e-05 +aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,3,0.9165887813382055,0.001361572704071016 +aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,4,0.9225103255266087,0.0010966889416837342 +aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,0,0.9292369266176062,0.000839501038985727 +aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,1,0.9505492134066896,0.00029121355501060477 +aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,2,0.9415690777822339,0.00047713248045663163 +aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,3,0.9576750897378552,0.00018358576102437457 +aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,4,0.8850761460392197,0.0034750864462593195 +aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,0,0.9798647388383346,2.0101576768271062e-05 +aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,1,0.9781250835045174,2.5741076148769547e-05 +aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,2,0.9598475365356987,0.00015700207944980397 +aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,3,0.9317002702003969,0.000756276259880365 +aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,4,0.8240635545541923,0.011881405061211926 +aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,0,0.9849433970479835,8.437305784682183e-06 +aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,1,0.9899107226768695,2.548168158279175e-06 +aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,2,0.9645217100316719,0.00010869253777108847 +aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,3,0.9447465624679983,0.00040443116308794275 +aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,4,0.8760879368136391,0.0043253470355424355 +aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,0,0.9469408250476264,0.0003587374254477132 +aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,1,0.9498225876442147,0.000304071618749767 +aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,2,0.9413785598975157,0.0004817446027243596 +aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,3,0.8197292667265523,0.012737111858293043 +aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,4,0.9057861973602506,0.0019457176947306907 +aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,0,0.9413025091864188,0.000483593804288479 +aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,1,0.9083254977326705,0.001796125778484392 +aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,2,0.8626635526406192,0.005827152548807454 +aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,3,0.8043418970652331,0.016085184583393794 +aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,4,0.8946872852632068,0.0026942203148939193 +aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,0,0.9025950086780581,0.002144887259438991 +aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,1,0.7564264003460613,0.02984872863501939 +aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,2,0.9033527343998258,0.002096452391428316 +aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,3,0.8494277893147777,0.0075996673267298715 +aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,4,0.8534145445088147,0.007033997470343221