Spaces:
Running
Running
File size: 85,237 Bytes
b9a3a5d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 |
scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.2778254199662385,0.2400384567875128
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.40368671387966554,0.08581278065055217
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.42599897728156577,0.07162425926742408
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.2778254199662385,0.2400384567875128
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.36698792170878686,0.11834981273562825
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,-0.018181818181818184,1.0
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,-0.018181818181818184,1.0
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,-0.05454545454545454,0.8792698312489979
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,-0.018181818181818184,1.0
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,-0.1272727272727273,0.6480954385121052
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.05454545454545454,0.8792698312489979
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,-0.018181818181818184,1.0
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,-0.018181818181818184,1.0
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.05454545454545454,0.8792698312489979
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,-0.05454545454545454,0.8792698312489979
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.05454545454545454,0.8792698312489979
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.23636363636363636,0.3587114698573032
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.2,0.4453821448613115
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.05454545454545454,0.8792698312489979
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.1272727272727273,0.6480954385121052
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.587180674734059,0.01246215829454031
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.697277051246695,0.003004262239398284
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.587180674734059,0.01246215829454031
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6605782590758164,0.004936818556325077
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6363636363636364,0.005707170915504249
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.2727272727272727,0.2829668209876543
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.34545454545454546,0.16457331248997917
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.38181818181818183,0.12097096961680295
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.2727272727272727,0.2829668209876543
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.34545454545454546,0.16457331248997917
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6363636363636364,0.005707170915504249
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6363636363636364,0.005707170915504249
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6363636363636364,0.005707170915504249
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.18349396085439343,0.43487965849578336
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.2935903373670295,0.21152242941072896
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.2727272727272727,0.2829668209876543
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.18349396085439343,0.43487965849578336
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.2568915451961508,0.27429882739587574
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.4909090909090909,0.04053235730319064
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6000000000000001,0.00994553671637005
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.5636363636363636,0.016540504248837583
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.4909090909090909,0.04053235730319064
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5636363636363636,0.016540504248837583
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.34545454545454546,0.16457331248997917
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.41818181818181815,0.08656124739458072
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.4909090909090909,0.04053235730319064
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.34545454545454546,0.16457331248997917
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.34545454545454546,0.16457331248997917
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.9272727272727274,3.2567740901074234e-06
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8545454545454545,4.624619207952541e-05
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.9272727272727274,3.2567740901074234e-06
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6727272727272727,0.0031063111271444604
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.45454545454545453,0.06017015392015392
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.41818181818181815,0.08656124739458072
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.45454545454545453,0.06017015392015392
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5272727272727272,0.02638447971781305
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.5636363636363636,0.016540504248837583
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7454545454545454,0.000759529822029822
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9272727272727274,3.2567740901074234e-06
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.5272727272727272,0.02638447971781305
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.4403855060505442,0.06091869077971648
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.587180674734059,0.01246215829454031
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.45454545454545453,0.06017015392015392
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.38181818181818183,0.12097096961680295
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6363636363636364,0.005707170915504249
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.587180674734059,0.01246215829454031
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7090909090909091,0.0015912097162097162
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7090909090909091,0.0015912097162097162
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,0,1.0,5.010421677088344e-08
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,1,1.0,5.010421677088344e-08
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,3,1.0,5.010421677088344e-08
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,4,1.0,5.010421677088344e-08
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8545454545454545,4.624619207952541e-05
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8181818181818182,0.00013227513227513228
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7818181818181819,0.0003334435626102293
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.4909090909090909,0.04053235730319064
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8181818181818182,0.00013227513227513228
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6238794669049377,0.007931923532795268
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6605782590758164,0.004936818556325077
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.4403855060505442,0.06091869077971648
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7090909090909091,0.0015912097162097162
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.697277051246695,0.003004262239398284
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8545454545454545,4.624619207952541e-05
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7818181818181819,0.0003334435626102293
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6238794669049377,0.007931923532795268
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.4403855060505442,0.06091869077971648
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.5636363636363636,0.016540504248837583
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6363636363636364,0.005707170915504249
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5636363636363636,0.016540504248837583
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.5636363636363636,0.016540504248837583
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6000000000000001,0.00994553671637005
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5636363636363636,0.016540504248837583
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7339758434175737,0.0017872890369872653
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.587180674734059,0.01246215829454031
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6482593132545567,0.006117582447622459
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.759389481241052,0.0013210471654040124
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.759389481241052,0.0013210471654040124
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6363636363636364,0.005707170915504249
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7090909090909091,0.0015912097162097162
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6727272727272727,0.0031063111271444604
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.2727272727272727,0.2829668209876543
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5272727272727272,0.02638447971781305
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.38181818181818183,0.12097096961680295
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.4909090909090909,0.04053235730319064
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.38895558795273394,0.10000137830747906
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.38181818181818183,0.12097096961680295
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.697277051246695,0.003004262239398284
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.4909090909090909,0.04053235730319064
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.41818181818181815,0.08656124739458072
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6000000000000001,0.00994553671637005
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6000000000000001,0.00994553671637005
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8807710121010884,0.00017812930545546289
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8807710121010884,0.00017812930545546289
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8807710121010884,0.00017812930545546289
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.4770842982214229,0.042330229121360724
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6605782590758164,0.004936818556325077
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8440722199302099,0.0003281542287518694
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7339758434175737,0.0017872890369872653
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.9636363636363636,5.511463844797178e-07
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.4909090909090909,0.04053235730319064
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6363636363636364,0.005707170915504249
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7454545454545454,0.000759529822029822
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.45454545454545453,0.06017015392015392
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6363636363636364,0.005707170915504249
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.9272727272727274,3.2567740901074234e-06
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7090909090909091,0.0015912097162097162
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6727272727272727,0.0031063111271444604
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.4909090909090909,0.04053235730319064
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6727272727272727,0.0031063111271444604
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8181818181818182,0.00013227513227513228
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,1.0,5.010421677088344e-08
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.9636363636363636,5.511463844797178e-07
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,1.0,5.010421677088344e-08
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,1.0,5.010421677088344e-08
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,1.0,5.010421677088344e-08
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.9272727272727274,3.2567740901074234e-06
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.9636363636363636,5.511463844797178e-07
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,1.0,5.010421677088344e-08
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.9272727272727274,3.2567740901074234e-06
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.9272727272727274,3.2567740901074234e-06
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.9636363636363636,5.511463844797178e-07
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.9636363636363636,5.511463844797178e-07
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,0,1.0,5.010421677088344e-08
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,1,1.0,5.010421677088344e-08
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,3,1.0,5.010421677088344e-08
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,4,1.0,5.010421677088344e-08
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.45454545454545453,0.06017015392015392
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.6000000000000001,0.00994553671637005
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.3090909090909091,0.21834651074234407
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.5272727272727272,0.02638447971781305
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.38181818181818183,0.12097096961680295
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.5636363636363636,0.016540504248837583
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.5272727272727272,0.02638447971781305
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.4909090909090909,0.04053235730319064
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.45454545454545453,0.06017015392015392
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.6000000000000001,0.00994553671637005
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.34545454545454546,0.16457331248997917
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.41818181818181815,0.08656124739458072
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.2727272727272727,0.2829668209876543
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.5741725345968929,0.015177848122929492
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.3519121986239021,0.1366995137219537
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.42599897728156577,0.07162425926742408
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.38181818181818183,0.12097096961680295
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.4403855060505442,0.06091869077971648
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.6727272727272727,0.0031063111271444604
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.6727272727272727,0.0031063111271444604
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.5272727272727272,0.02638447971781305
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.3090909090909091,0.21834651074234407
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.45454545454545453,0.06017015392015392
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.2727272727272727,0.2829668209876543
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.4403855060505442,0.06091869077971648
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.38181818181818183,0.12097096961680295
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.4403855060505442,0.06091869077971648
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.45454545454545453,0.06017015392015392
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.36698792170878686,0.11834981273562825
BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,0,0.2,0.4453821448613115
BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,1,0.38181818181818183,0.12097096961680295
BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,2,0.41818181818181815,0.08656124739458072
BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,3,0.5272727272727272,0.02638447971781305
BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,4,0.5272727272727272,0.02638447971781305
BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8073734277593311,0.0005907573118657002
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.587180674734059,0.01246215829454031
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6482593132545567,0.006117582447622459
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7706746355884524,0.0010393630991335228
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5371291452680612,0.02311942970946668
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6238794669049377,0.007931923532795268
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.4909090909090909,0.04053235730319064
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8440722199302099,0.0003281542287518694
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.36698792170878686,0.11834981273562825
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9272727272727274,3.2567740901074234e-06
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7090909090909091,0.0015912097162097162
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6727272727272727,0.0031063111271444604
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8073734277593311,0.0005907573118657002
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7818181818181819,0.0003334435626102293
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6605782590758164,0.004936818556325077
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,-0.0909090909090909,0.7611503928170594
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.07339758434175737,0.7547764265871044
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.4403855060505442,0.06091869077971648
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.3302891295379082,0.15985367483762747
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.1272727272727273,0.6480954385121052
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8705196492275474,0.00023202582506637044
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7479575920067658,0.001637274718449882
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5983660736054126,0.01175728488671479
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6000000000000001,0.00994553671637005
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5272727272727272,0.02638447971781305
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7706746355884524,0.0010393630991335228
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8909090909090909,1.3728555395222063e-05
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.5272727272727272,0.02638447971781305
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.7818181818181819,0.0003334435626102293
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.8545454545454545,4.624619207952541e-05
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.6731618328060892,0.004677734981047257
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.759389481241052,0.0013210471654040124
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7339758434175737,0.0017872890369872653
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.6238794669049377,0.007931923532795268
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.6363636363636364,0.005707170915504249
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7706746355884524,0.0010393630991335228
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7706746355884524,0.0010393630991335228
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.8909090909090909,1.3728555395222063e-05
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.5636363636363636,0.016540504248837583
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.6363636363636364,0.005707170915504249
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.7818181818181819,0.0003334435626102293
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.6363636363636364,0.005707170915504249
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.5636363636363636,0.016540504248837583
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.6727272727272727,0.0031063111271444604
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,0,0.2778254199662385,0.2400384567875128
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,1,0.40368671387966554,0.08581278065055217
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,2,0.42599897728156577,0.07162425926742408
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,3,0.2778254199662385,0.2400384567875128
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,4,0.36698792170878686,0.11834981273562825
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,0,-0.018181818181818184,1.0
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,1,-0.018181818181818184,1.0
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,2,-0.05454545454545454,0.8792698312489979
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,3,-0.018181818181818184,1.0
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,4,-0.1272727272727273,0.6480954385121052
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,0,0.05454545454545454,0.8792698312489979
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,1,-0.018181818181818184,1.0
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,2,-0.018181818181818184,1.0
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,3,0.05454545454545454,0.8792698312489979
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,4,-0.05454545454545454,0.8792698312489979
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,0,0.05454545454545454,0.8792698312489979
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,1,0.23636363636363636,0.3587114698573032
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,2,0.2,0.4453821448613115
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,3,0.05454545454545454,0.8792698312489979
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,4,0.1272727272727273,0.6480954385121052
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,0,0.587180674734059,0.01246215829454031
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,2,0.697277051246695,0.003004262239398284
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,3,0.587180674734059,0.01246215829454031
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,4,0.6605782590758164,0.004936818556325077
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,4,0.6363636363636364,0.005707170915504249
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,0,0.2727272727272727,0.2829668209876543
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,1,0.34545454545454546,0.16457331248997917
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,2,0.38181818181818183,0.12097096961680295
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,3,0.2727272727272727,0.2829668209876543
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,4,0.34545454545454546,0.16457331248997917
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,0,0.6363636363636364,0.005707170915504249
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,2,0.6363636363636364,0.005707170915504249
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,3,0.6363636363636364,0.005707170915504249
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,0,0.18349396085439343,0.43487965849578336
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,1,0.2935903373670295,0.21152242941072896
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,2,0.2727272727272727,0.2829668209876543
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,3,0.18349396085439343,0.43487965849578336
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,4,0.2568915451961508,0.27429882739587574
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,0,0.4909090909090909,0.04053235730319064
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,1,0.6000000000000001,0.00994553671637005
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,2,0.5636363636363636,0.016540504248837583
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,3,0.4909090909090909,0.04053235730319064
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,4,0.5636363636363636,0.016540504248837583
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,0,0.34545454545454546,0.16457331248997917
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,1,0.41818181818181815,0.08656124739458072
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,2,0.4909090909090909,0.04053235730319064
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,3,0.34545454545454546,0.16457331248997917
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,4,0.34545454545454546,0.16457331248997917
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.9272727272727274,3.2567740901074234e-06
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.9272727272727274,3.2567740901074234e-06
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.45454545454545453,0.06017015392015392
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.41818181818181815,0.08656124739458072
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.45454545454545453,0.06017015392015392
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.5272727272727272,0.02638447971781305
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.5636363636363636,0.016540504248837583
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.7454545454545454,0.000759529822029822
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.9272727272727274,3.2567740901074234e-06
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.5272727272727272,0.02638447971781305
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.4403855060505442,0.06091869077971648
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.587180674734059,0.01246215829454031
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.45454545454545453,0.06017015392015392
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.38181818181818183,0.12097096961680295
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.6363636363636364,0.005707170915504249
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.587180674734059,0.01246215829454031
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,1,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,3,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,0,1.0,5.010421677088344e-08
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,1,1.0,5.010421677088344e-08
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,3,1.0,5.010421677088344e-08
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,4,1.0,5.010421677088344e-08
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,0,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,4,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,0,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,3,0.4909090909090909,0.04053235730319064
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,4,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,0,0.6238794669049377,0.007931923532795268
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,1,0.6605782590758164,0.004936818556325077
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,2,0.4403855060505442,0.06091869077971648
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,3,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,4,0.697277051246695,0.003004262239398284
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,0,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,3,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,0,0.6238794669049377,0.007931923532795268
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,1,0.4403855060505442,0.06091869077971648
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,2,0.5636363636363636,0.016540504248837583
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,3,0.6363636363636364,0.005707170915504249
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,4,0.5636363636363636,0.016540504248837583
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,1,0.5636363636363636,0.016540504248837583
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,2,0.6000000000000001,0.00994553671637005
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,4,0.5636363636363636,0.016540504248837583
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,0,0.7339758434175737,0.0017872890369872653
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,1,0.587180674734059,0.01246215829454031
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,2,0.6482593132545567,0.006117582447622459
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,3,0.759389481241052,0.0013210471654040124
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,4,0.759389481241052,0.0013210471654040124
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,0,0.6363636363636364,0.005707170915504249
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,1,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,2,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,3,0.2727272727272727,0.2829668209876543
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,4,0.5272727272727272,0.02638447971781305
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,0,0.38181818181818183,0.12097096961680295
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,1,0.4909090909090909,0.04053235730319064
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,2,0.38895558795273394,0.10000137830747906
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,3,0.38181818181818183,0.12097096961680295
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,4,0.697277051246695,0.003004262239398284
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,0,0.4909090909090909,0.04053235730319064
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,1,0.41818181818181815,0.08656124739458072
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,2,0.6000000000000001,0.00994553671637005
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,4,0.6000000000000001,0.00994553671637005
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,1,0.8807710121010884,0.00017812930545546289
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,2,0.8807710121010884,0.00017812930545546289
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,3,0.8807710121010884,0.00017812930545546289
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,0,0.4770842982214229,0.042330229121360724
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,2,0.6605782590758164,0.004936818556325077
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,3,0.8440722199302099,0.0003281542287518694
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,4,0.7339758434175737,0.0017872890369872653
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,0,0.9636363636363636,5.511463844797178e-07
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,0,0.4909090909090909,0.04053235730319064
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,1,0.6363636363636364,0.005707170915504249
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,2,0.7454545454545454,0.000759529822029822
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,0,0.45454545454545453,0.06017015392015392
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,2,0.6363636363636364,0.005707170915504249
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,3,0.9272727272727274,3.2567740901074234e-06
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,1,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,2,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,3,0.4909090909090909,0.04053235730319064
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,4,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,4,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,0,1.0,5.010421677088344e-08
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,3,0.9636363636363636,5.511463844797178e-07
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,4,1.0,5.010421677088344e-08
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,0,1.0,5.010421677088344e-08
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,1,1.0,5.010421677088344e-08
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,2,0.9272727272727274,3.2567740901074234e-06
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,3,0.9636363636363636,5.511463844797178e-07
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,4,1.0,5.010421677088344e-08
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,0,0.9272727272727274,3.2567740901074234e-06
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,4,0.9272727272727274,3.2567740901074234e-06
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,0,0.9636363636363636,5.511463844797178e-07
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,4,0.9636363636363636,5.511463844797178e-07
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,0,1.0,5.010421677088344e-08
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,1,1.0,5.010421677088344e-08
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,3,1.0,5.010421677088344e-08
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,4,1.0,5.010421677088344e-08
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.45454545454545453,0.06017015392015392
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.6000000000000001,0.00994553671637005
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.3090909090909091,0.21834651074234407
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.5272727272727272,0.02638447971781305
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.38181818181818183,0.12097096961680295
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.5636363636363636,0.016540504248837583
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.5272727272727272,0.02638447971781305
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.4909090909090909,0.04053235730319064
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.45454545454545453,0.06017015392015392
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.6000000000000001,0.00994553671637005
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.34545454545454546,0.16457331248997917
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.41818181818181815,0.08656124739458072
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.2727272727272727,0.2829668209876543
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.5741725345968929,0.015177848122929492
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.3519121986239021,0.1366995137219537
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.42599897728156577,0.07162425926742408
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.38181818181818183,0.12097096961680295
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.4403855060505442,0.06091869077971648
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.5272727272727272,0.02638447971781305
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.3090909090909091,0.21834651074234407
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.45454545454545453,0.06017015392015392
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.2727272727272727,0.2829668209876543
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.4403855060505442,0.06091869077971648
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.38181818181818183,0.12097096961680295
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.4403855060505442,0.06091869077971648
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.45454545454545453,0.06017015392015392
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.36698792170878686,0.11834981273562825
aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,0,0.2,0.4453821448613115
aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,1,0.38181818181818183,0.12097096961680295
aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,2,0.41818181818181815,0.08656124739458072
aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,3,0.5272727272727272,0.02638447971781305
aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,4,0.5272727272727272,0.02638447971781305
aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,3,0.8073734277593311,0.0005907573118657002
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,0,0.587180674734059,0.01246215829454031
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,1,0.6482593132545567,0.006117582447622459
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,3,0.7706746355884524,0.0010393630991335228
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,4,0.5371291452680612,0.02311942970946668
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,0,0.6238794669049377,0.007931923532795268
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,1,0.4909090909090909,0.04053235730319064
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,2,0.8440722199302099,0.0003281542287518694
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,4,0.36698792170878686,0.11834981273562825
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,1,0.9272727272727274,3.2567740901074234e-06
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,3,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,4,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,1,0.8073734277593311,0.0005907573118657002
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,3,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,4,0.6605782590758164,0.004936818556325077
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,0,-0.0909090909090909,0.7611503928170594
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,1,0.07339758434175737,0.7547764265871044
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,2,0.4403855060505442,0.06091869077971648
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,3,0.3302891295379082,0.15985367483762747
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,4,0.1272727272727273,0.6480954385121052
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,2,0.8705196492275474,0.00023202582506637044
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,3,0.7479575920067658,0.001637274718449882
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,4,0.5983660736054126,0.01175728488671479
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,1,0.6000000000000001,0.00994553671637005
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,4,0.5272727272727272,0.02638447971781305
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,0,0.7706746355884524,0.0010393630991335228
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,1,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,2,0.5272727272727272,0.02638447971781305
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,0,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,1,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,0,0.6731618328060892,0.004677734981047257
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,1,0.759389481241052,0.0013210471654040124
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,2,0.7339758434175737,0.0017872890369872653
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,3,0.6238794669049377,0.007931923532795268
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,0,0.6363636363636364,0.005707170915504249
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,2,0.7706746355884524,0.0010393630991335228
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,4,0.7706746355884524,0.0010393630991335228
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,1,0.8909090909090909,1.3728555395222063e-05
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,0,0.5636363636363636,0.016540504248837583
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,2,0.6363636363636364,0.005707170915504249
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,3,0.7818181818181819,0.0003334435626102293
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,4,0.6363636363636364,0.005707170915504249
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,2,0.5636363636363636,0.016540504248837583
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,0,0.6727272727272727,0.0031063111271444604
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822
|