Yotam-Perlitz commited on
Commit
697d2f9
1 Parent(s): 0b85b1f

update cache

Browse files

Signed-off-by: Yotam-Perlitz <y.perlitz@ibm.com>

cache/aggregate_scoress_cache_05c0405c5253dda90dc632e052accfd2.csv ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model,score
2
+ gemini_1_5_pro_api_0409_preview,1.0
3
+ gemini_1_5_pro_exp_0801,0.9921875
4
+ chatgpt_4o_latest,0.984375
5
+ gpt_3_5_turbo_0314,0.9765625
6
+ bard_jan_24_gemini_pro,0.96875
7
+ claude_1,0.9609375
8
+ gemini_advanced_0514,0.953125
9
+ llama3_1_70b_instruct,0.9453125
10
+ gpt_4o_2024_05_13,0.9375
11
+ gpt_4o_2024_08_06,0.9296875
12
+ gpt_4o_mini_2024_07_18,0.921875
13
+ claude_3_5_sonnet_20240620,0.9140625
14
+ claude_3_opus_20240229,0.90625
15
+ athene_70b_0725,0.8984375
16
+ gemini_pro_dev_api,0.890625
17
+ claude_2_0,0.8828125
18
+ glm_4_0520,0.875
19
+ nemotron_4_340b_instruct,0.8671875
20
+ yi_large_preview,0.859375
21
+ llama_2_70b_chat,0.8515625
22
+ reka_core_20240722,0.84375
23
+ gemini_1_5_pro_api_0514,0.8359375
24
+ gemini_pro,0.828125
25
+ llama3_1_405b_instruct,0.8203125
26
+ mistral_large_2407,0.8125
27
+ gpt_4_turbo_2024_04_09,0.8046875
28
+ gpt_4_1106_preview,0.796875
29
+ gpt_3_5_turbo_0613,0.7890625
30
+ gpt_4_0125_preview,0.78125
31
+ glm_4_0116,0.7734375
32
+ zephyr_orpo_141b_a35b_v0_1,0.765625
33
+ qwen_max_0428,0.7578125
34
+ claude_instant_1,0.75
35
+ yi_large,0.7421875
36
+ deepseek_coder_v2_0724,0.734375
37
+ deepseek_v2_api_0628,0.7265625
38
+ gemini_1_5_flash_api_0514,0.71875
39
+ llama3_70b_instruct,0.7109375
40
+ command_r_plus,0.703125
41
+ gpt_4_0314,0.6953125
42
+ claude_2_1,0.6875
43
+ wizardlm_70b,0.6796875
44
+ gemma_2_27b_it,0.671875
45
+ dolphin_2_2_1_mistral_7b,0.6640625
46
+ guanaco_33b,0.65625
47
+ nous_hermes_2_mixtral_8x7b_dpo,0.6484375
48
+ wizardlm_13b,0.640625
49
+ mpt_30b_chat,0.6328125
50
+ qwen1_5_110b_chat,0.625
51
+ claude_3_sonnet_20240229,0.6171875
52
+ mistral_next,0.609375
53
+ deepseek_coder_v2,0.6015625
54
+ reka_flash_21b_20240226_online,0.59375
55
+ starling_lm_7b_beta,0.5859375
56
+ llama2_70b_steerlm_chat,0.578125
57
+ mistral_medium,0.5703125
58
+ llama_2_13b_chat,0.5625
59
+ tulu_2_dpo_70b,0.5546875
60
+ reka_core_20240501,0.546875
61
+ gpt_4_0613,0.5390625
62
+ deepseek_llm_67b_chat,0.53125
63
+ solar_10_7b_instruct_v1_0,0.5234375
64
+ openchat_3_5_0106,0.515625
65
+ reka_flash_20240722,0.5078125
66
+ gemma_2_9b_it,0.5
67
+ llama3_1_8b_instruct,0.4921875
68
+ openchat_3_5,0.484375
69
+ pplx_7b_online,0.4765625
70
+ qwen1_5_72b_chat,0.46875
71
+ zephyr_7b_alpha,0.4609375
72
+ claude_3_haiku_20240307,0.453125
73
+ starling_lm_7b_alpha,0.4453125
74
+ reka_flash_21b_20240226,0.4375
75
+ mistral_large_2402,0.4296875
76
+ gpt_3_5_turbo_1106,0.421875
77
+ qwen1_5_7b_chat,0.4140625
78
+ reka_flash_preview_20240611,0.40625
79
+ yi_1_5_34b_chat,0.3984375
80
+ openhermes_2_5_mistral_7b,0.390625
81
+ codellama34b_instruct,0.3828125
82
+ qwen1_5_14b_chat,0.375
83
+ yi_34b_chat,0.3671875
84
+ pplx_70b_online,0.359375
85
+ qwen2_72b_instruct,0.3515625
86
+ dbrx_instructruct_preview,0.34375
87
+ llama3_8b_instruct,0.3359375
88
+ falcon_180b_chat,0.328125
89
+ palm_2,0.3203125
90
+ qwen_14b_chat,0.3125
91
+ stripedhyena_nous_7b,0.3046875
92
+ qwen1_5_32b_chat,0.296875
93
+ command_r,0.2890625
94
+ gemma_7b_it,0.28125
95
+ zephyr_7b_beta,0.2734375
96
+ mixtral_8x22b_instruct_v0_1,0.265625
97
+ vicuna_7b,0.2578125
98
+ snowflake_arctic_instruct,0.25
99
+ vicuna_33b,0.2421875
100
+ gemma_2_2b_it,0.234375
101
+ koala_13b,0.2265625
102
+ gpt_3_5_turbo_0125,0.21875
103
+ mistral_7b_instruct,0.2109375
104
+ llama_2_7b_chat,0.203125
105
+ mistral_7b_instruct_v0_2,0.1953125
106
+ gemma_1_1_7b_it,0.1875
107
+ gpt4all_13b_snoozy,0.1796875
108
+ phi_3_small_8k_instruct,0.171875
109
+ olmo_7b_instruct,0.1640625
110
+ phi_3_mini_4k_instruct,0.15625
111
+ phi_3_mini_128k_instruct,0.1484375
112
+ rwkv_4_raven_14b,0.140625
113
+ vicuna_13b,0.1328125
114
+ codellama_70b_instruct,0.125
115
+ mpt_7b_chat,0.1171875
116
+ mixtral_8x7b_instruct_v0_1,0.109375
117
+ phi_3_medium_4k_instruct,0.1015625
118
+ gemma_2b_it,0.09375
119
+ phi_3_mini_4k_instruct_june_2024,0.0859375
120
+ qwen1_5_4b_chat,0.078125
121
+ chatglm_6b,0.0703125
122
+ alpaca_13b,0.0625
123
+ gemma_1_1_2b_it,0.0546875
124
+ chatglm2_6b,0.046875
125
+ stablelm_tuned_alpha_7b,0.0390625
126
+ chatglm3_6b,0.03125
127
+ oasst_pythia_12b,0.0234375
128
+ llama_13b,0.015625
129
+ fastchat_t5_3b,0.0078125
130
+ dolly_v2_12b,0.0
cache/aggregate_scoress_cache_5e66a88dab42480065db47711c55c458.csv CHANGED
@@ -1,122 +1,138 @@
1
  model,score
2
- gpt_4o_2024_05_13,0.9847612958226769
3
- claude_3_5_sonnet_20240620,0.982905982905983
4
- gpt_4o_2024_08_06,0.9575873827791986
5
- gpt_4_turbo_2024_04_09,0.9428463693169576
6
- gpt_4_0125_preview,0.9171132221004344
7
- mistral_large_2407,0.8868286445012787
8
- llama3_1_405b_instruct,0.8672150411280846
9
- yi_large_preview,0.8641553641553642
10
- hermes_3_llama3_1_70b,0.8626160990712074
11
- smaug_qwen2_72b_instruct,0.8593911248710011
12
- claude_3_opus_20240229,0.8573567665639277
13
- llama3_1_70b_instruct,0.8528408270971201
14
- athene_70b,0.8493788819875776
15
- deepseek_coder_v2,0.8444160272804775
16
- qwen2_72b_instruct,0.8354710666091739
17
- yi_large,0.8346273291925466
18
- gpt_4_0613,0.8146763722211293
19
- llama3_70b_instruct,0.8127546753337573
20
- llama3_70b,0.8105600539811066
21
- gemma_2_27b_it,0.8045273029120115
22
- gpt_4o_mini_2024_07_18,0.8032033326150972
23
- gemma_2_9b_it_dpo,0.790057915057915
24
- llama3_instruct_8b_simpo,0.7884068278805121
25
- phi_3_5_moe_instruct,0.7808307533539731
26
- qwen1_5_110b_chat,0.776004448721167
27
- qwen1_5_32b,0.7658569500674763
28
- yi_1_5_34b_chat,0.7553884711779449
29
- llama_2_70b,0.7303193882141251
30
- mixtral_8x22b_instruct_v0_1,0.7256023690940907
31
- gemma_2_9b_it_simpo,0.7199248120300753
32
- qwen1_5_32b_chat,0.7149122807017544
33
- mixtral_8x22b_v0_1,0.7135490753911806
34
- yi_34b,0.7128879892037787
35
- internlm2_5_20b_chat,0.6842105263157895
36
- phi_3_small_128k_instruct,0.66937564499484
37
- phi_3_medium_4k_instruct,0.6675079642841117
38
- claude_3_sonnet_20240229,0.653911731916847
39
- gemma_2_9b_it,0.6422797189051059
40
- infinity_instruct_3m_0625_llama3_8b,0.6273115220483642
 
 
 
 
 
41
  mistral_v0_1_7b,0.6239316239316239
42
- phi_3_5_mini_instruct,0.6202270381836945
43
- mistral_medium,0.6122209165687427
44
- mistral_large_2402,0.6058211467418628
45
- claude_instant_1_2,0.6049896049896051
46
- claude_2_0,0.6020066889632107
47
- yi_1_5_9b_chat,0.5881787802840435
48
- qwen1_5_14b,0.5770917678812416
49
- command_r_plus,0.5761033510394125
50
- llama_65b,0.5736992052781527
51
- gpt_3_5_turbo_0613,0.5724018332713985
52
- qwen1_5_72b_chat,0.5668371367348349
53
- phi_3_mini_4k_instruct,0.5548245614035088
54
- deepseek_llm_67b_chat,0.5506756756756757
55
- claude_3_haiku_20240307,0.549424005945745
56
- yi_34b_chat,0.5455449728905107
57
- dbrx_instructruct,0.5344129554655871
58
  jurassic_2_jumbo_178b,0.532051282051282
59
- llama3_1_8b_instruct,0.5175232440678665
60
- claude_2_1,0.5110980545763154
61
- qwen2_7b_instruct,0.5034227726178191
62
- mistral_small_2402,0.49924585218702866
63
- mixtral_8x7b_v0_1,0.49324324324324326
64
- glm_4_9b_chat,0.46499582289055974
65
- qwen1_5_14b_chat,0.4621068436857911
66
- phi_3_small_8k_instruct,0.45481670929241264
67
- gpt_3_5_turbo_0301,0.4528985507246377
68
- snorkel_mistral_pairrm_dpo,0.4521151586368978
69
- gemma_7b,0.4471997300944669
70
- gpt_3_5_turbo_0125,0.4401920188365201
71
- llama3_8b,0.43302968960863697
72
- dbrx_instruct,0.4266409266409266
73
- llama3_8b_instruct,0.420135922511747
74
- phi_3_mini_128k_instruct,0.4153205904787544
75
- llama_2_13b,0.41490478332583597
76
- jurassic_2_grande_17b,0.39529914529914534
77
- openhermes_2_5_mistral_7b,0.3832617447168531
78
- mistral_7b_v0_3,0.3737553342816501
79
- mixtral_8x7b_instruct_v0_1,0.3713078251895724
80
- qwen1_5_7b,0.3508771929824561
81
- yi_1_5_6b_chat,0.3354636591478697
82
- falcon_40b,0.32812265707002547
83
- command_r,0.32386140074759
84
- internlm2_chat_20b,0.32252252252252256
85
- mistral_7b_v0_2,0.31970128022759603
86
- luminous_supreme_70b,0.30128205128205127
87
- starling_lm_7b_alpha,0.29823530624445954
88
- yi_6b,0.29234143049932526
89
- mistral_7b_instruct_v0_2,0.28609513981031004
90
- zephyr_7b_alpha,0.2838442157327606
91
- zephyr_7b_beta,0.2666234345800909
92
- gemma_1_1_7b_it,0.26226051061156724
93
- mistral_7b_instruct_v0_3,0.2537839697282422
94
- starling_lm_7b_beta,0.25234441602728047
95
- llama_2_7b,0.2391288049182786
 
 
 
 
 
 
 
 
 
 
 
 
96
  luminous_extended_30b,0.2329059829059829
97
- alpaca_7b,0.22072072072072071
98
- vicuna_33b_v1_3,0.2056404230317274
99
- phi_2,0.20087901666849037
100
- qwen2_1_5b_instruct,0.19711042311661506
101
- yi_6b_chat,0.1938854489164087
102
- qwen1_5_7b_chat,0.1916569245052217
103
- tulu_2_dpo_70b,0.17624223602484473
104
- qwen1_5_4b_chat,0.1674406604747162
105
- llama_2_70b_chat,0.15527950310559005
106
- gpt_neox_20b,0.14400584795321636
107
- vicuna_7b_v1_5,0.13619501854795973
108
- falcon_40b_instruct,0.13264580369843526
109
- gemma_7b_it,0.12136319058515854
110
- falcon_7b,0.11407257459889038
111
- gpt_j_6b,0.10160818713450293
112
  luminous_base_13b,0.08333333333333333
113
- llama_2_7b_chat,0.08304448781801049
114
- gemma_1_1_2b_it,0.07665903890160183
115
- olmo_7b,0.06545209176788123
116
- gemma_2b_it,0.05921052631578947
117
- qwen1_5_1_8b_chat,0.059167526659786716
118
- qwen2_0_5b_instruct,0.059081527347781215
119
- pythia_12b,0.054093567251461985
120
- pythia_6_9b,0.019736842105263157
121
- falcon_7b_instruct,0.013513513513513514
122
- qwen1_5_0_5b_chat,0.013157894736842105
 
1
  model,score
2
+ gpt_4o_2024_05_13,0.9767482517482518
3
+ chatgpt_4o_latest,0.9754079254079254
4
+ gpt_4o_2024_08_06,0.9652680652680652
5
+ claude_3_5_sonnet_20240620,0.9572649572649573
6
+ gemini_1_5_pro_exp_0801,0.9545454545454546
7
+ llama3_1_70b_instruct,0.9343074620852398
8
+ gpt_4_turbo_2024_04_09,0.9055819180819181
9
+ claude_3_opus_20240229,0.8824397824397824
10
+ yi_large_preview,0.8714202464202464
11
+ llama3_1_405b_instruct,0.8598484848484849
12
+ gpt_4_0125_preview,0.8492118992118992
13
+ hermes_3_llama3_1_70b,0.8451178451178452
14
+ zephyr_orpo_141b_a35b_v0_1,0.8414055080721747
15
+ mistral_large_2407,0.8375291375291375
16
+ gpt_4o_mini_2024_07_18,0.8348776223776224
17
+ claude_2_0,0.8333333333333334
18
+ smaug_qwen2_72b_instruct,0.8331088664421997
19
+ gemini_1_5_pro_api_0514,0.8294871794871794
20
+ llama3_70b_instruct,0.8172801478357034
21
+ llama3_70b,0.8129154795821463
22
+ gemma_2_9b_it_dpo,0.8100649350649352
23
+ llama3_instruct_8b_simpo,0.7992424242424242
24
+ yi_large,0.7889194139194139
25
+ gemma_2_27b_it,0.776345259678593
26
+ qwen2_72b_instruct,0.7701936951936953
27
+ qwen1_5_32b,0.7678062678062678
28
+ gpt_4_0613,0.7641802641802643
29
+ phi_3_5_moe_instruct,0.7600448933782267
30
+ qwen1_5_110b_chat,0.7419770353103686
31
+ mixtral_8x22b_v0_1,0.7382154882154882
32
+ gemma_2_9b_it_simpo,0.7328042328042329
33
+ gemini_pro,0.7298951048951049
34
+ llama_2_70b,0.7293447293447294
35
+ gemini_1_5_flash_api_0514,0.7263403263403263
36
+ yi_34b,0.7188983855650521
37
+ deepseek_coder_v2,0.713053613053613
38
+ nous_hermes_2_mixtral_8x7b_dpo,0.7094017094017094
39
+ gpt_3_5_turbo_0613,0.6851851851851851
40
+ claude_2_1,0.6693861693861693
41
+ yi_1_5_34b_chat,0.6669566544566544
42
+ mistral_medium,0.657051282051282
43
+ phi_3_small_128k_instruct,0.6561167227833894
44
+ infinity_instruct_3m_0625_llama3_8b,0.6537598204264872
45
+ claude_instant_1_2,0.6486013986013985
46
  mistral_v0_1_7b,0.6239316239316239
47
+ command_r_plus,0.6183108558108558
48
+ phi_3_5_mini_instruct,0.6103254769921437
49
+ llama3_1_8b_instruct,0.6080822469711359
50
+ gemma_2_9b_it,0.6048877048877048
51
+ yi_1_5_9b_chat,0.6041446208112875
52
+ claude_3_sonnet_20240229,0.5985236985236985
53
+ mixtral_8x22b_instruct_v0_1,0.585565052231719
54
+ qwen1_5_14b,0.5797720797720798
55
+ llama_65b,0.5759734093067427
56
+ deepseek_llm_67b_chat,0.5734841290396846
57
+ qwen1_5_32b_chat,0.571383349161127
58
+ wizardlm_70b,0.5620629370629371
59
+ yi_34b_chat,0.5558361391694725
60
+ qwen1_5_72b_chat,0.5463669663669664
61
+ dbrx_instructruct,0.5379867046533713
 
62
  jurassic_2_jumbo_178b,0.532051282051282
63
+ mixtral_8x7b_v0_1,0.5310044893378227
64
+ openchat_3_5,0.5270655270655271
65
+ mistral_large_2402,0.5105672105672105
66
+ solar_10_7b_instruct_v1_0,0.5030864197530864
67
+ qwen2_7b_instruct,0.4970445192667415
68
+ phi_3_medium_4k_instruct,0.48541540763762986
69
+ dolphin_2_2_1_mistral_7b,0.4810606060606061
70
+ mistral_small_2402,0.47785547785547783
71
+ glm_4_9b_chat,0.4769547325102881
72
+ dbrx_instruct,0.4724025974025974
73
+ qwen1_5_14b_chat,0.45340153673487005
74
+ claude_3_haiku_20240307,0.44965034965034967
75
+ gemma_7b,0.4477682811016144
76
+ llama3_8b_instruct,0.4449662477440255
77
+ llama3_8b,0.4368471035137702
78
+ wizardlm_13b,0.42773892773892774
79
+ starling_lm_7b_alpha,0.42734323289878845
80
+ jurassic_2_grande_17b,0.4230769230769231
81
+ mistral_7b_v0_3,0.4228395061728395
82
+ llama_2_13b,0.4146881924659702
83
+ llama_2_70b_chat,0.412732329398996
84
+ phi_3_mini_4k_instruct,0.4048663270885493
85
+ openhermes_2_5_mistral_7b,0.40103708020374684
86
+ llama_2_13b_chat,0.38675213675213677
87
+ guanaco_33b,0.38374125874125875
88
+ phi_3_mini_128k_instruct,0.3778468445135112
89
+ mistral_7b_v0_2,0.3773849607182941
90
+ internlm2_chat_20b,0.37196969696969695
91
+ starling_lm_7b_beta,0.3611888111888112
92
+ gpt_3_5_turbo_0125,0.3591242091242091
93
+ tulu_2_dpo_70b,0.3585164835164835
94
+ qwen1_5_7b,0.35185185185185186
95
+ falcon_40b,0.3502690724912947
96
+ yi_1_5_6b_chat,0.33974132863021755
97
+ zephyr_7b_alpha,0.33875830959164294
98
+ command_r,0.3296911421911422
99
+ luminous_supreme_70b,0.32905982905982906
100
+ yi_6b,0.295346628679962
101
+ zephyr_7b_beta,0.28937667271000606
102
+ mixtral_8x7b_instruct_v0_1,0.284326167659501
103
+ qwen_14b_chat,0.2837995337995338
104
+ gemma_2_2b_it,0.28113553113553114
105
+ phi_3_small_8k_instruct,0.27051282051282055
106
+ gemma_1_1_7b_it,0.263927019482575
107
+ llama_2_7b,0.25466919911364355
108
+ mistral_7b_instruct_v0_2,0.250669392336059
109
+ mistral_7b_instruct_v0_3,0.24534231200897869
110
+ qwen1_5_7b_chat,0.24214088380755047
111
+ alpaca_7b,0.23484848484848483
112
  luminous_extended_30b,0.2329059829059829
113
+ llama_13b,0.2222222222222222
114
+ phi_2,0.19812080923192033
115
+ qwen2_1_5b_instruct,0.1968574635241302
116
+ yi_6b_chat,0.19393939393939394
117
+ vicuna_7b,0.1885198135198135
118
+ gemma_7b_it,0.18790982679871568
119
+ olmo_7b_instruct,0.15669515669515668
120
+ vicuna_7b_v1_5,0.15454545454545454
121
+ vicuna_13b,0.14714452214452214
122
+ gpt_neox_20b,0.1419753086419753
123
+ falcon_40b_instruct,0.13187429854096522
124
+ qwen1_5_4b_chat,0.12542806987251431
125
+ falcon_7b,0.11380183602405824
126
+ llama_2_7b_chat,0.1122679789346456
127
+ gpt_j_6b,0.09876543209876543
128
  luminous_base_13b,0.08333333333333333
129
+ gemma_2b_it,0.08119658119658119
130
+ gemma_1_1_2b_it,0.07454890788224121
131
+ olmo_7b,0.06220322886989553
132
+ qwen1_5_1_8b_chat,0.05544332210998878
133
+ qwen2_0_5b_instruct,0.055218855218855216
134
+ pythia_12b,0.05246913580246913
135
+ chatglm2_6b,0.029137529137529136
136
+ pythia_6_9b,0.018518518518518517
137
+ qwen1_5_0_5b_chat,0.012345679012345678
138
+ falcon_7b_instruct,0.011363636363636364
cache/agreements_cache_05c0405c5253dda90dc632e052accfd2.csv ADDED
@@ -0,0 +1,763 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value
2
+ Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5455447255899809,0.0614649096074132
3
+ Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111
4
+ Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132
5
+ Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066
6
+ Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
7
+ Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.47280542884465016,0.10506382347888965
8
+ Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
9
+ Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
10
+ Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
11
+ Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
12
+ Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
13
+ Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
14
+ Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066
15
+ Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
16
+ Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132
17
+ Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
18
+ Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
19
+ Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
20
+ Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
21
+ Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
22
+ Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.47280542884465016,0.10506382347888965
23
+ Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.07142857142857142,0.9048611111111111
24
+ Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
25
+ Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066
26
+ Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
27
+ Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
28
+ Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
29
+ Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066
30
+ Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
31
+ Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
32
+ Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.2545875386086578,0.38281014365989596
33
+ Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
34
+ Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132
35
+ LMSys Arena,chatbot_arena_241104.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
36
+ LMSys Arena,chatbot_arena_241104.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
37
+ LMSys Arena,chatbot_arena_241104.csv,aggregate,aggregate,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05
38
+ HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
39
+ HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
40
+ HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
41
+ HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985
42
+ HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
43
+ HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
44
+ HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
45
+ HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.21428571428571427,0.5484126984126985
46
+ HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111
47
+ HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
48
+ HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
49
+ HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
50
+ HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
51
+ HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
52
+ HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
53
+ HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
54
+ HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
55
+ HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
56
+ HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
57
+ HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
58
+ HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111
59
+ tablebench_overall_dp,tablebench_241002.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111
60
+ tablebench_overall_dp,tablebench_241002.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
61
+ tablebench_overall_dp,tablebench_241002.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066
62
+ trustworthy_average,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
63
+ trustworthy_average,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
64
+ trustworthy_average,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
65
+ trustworthy_non_toxicity,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.14285714285714285,0.7195436507936508
66
+ trustworthy_non_toxicity,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.14285714285714285,0.7195436507936508
67
+ trustworthy_non_toxicity,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508
68
+ trustworthy_non_stereotype,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
69
+ trustworthy_non_stereotype,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
70
+ trustworthy_non_stereotype,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
71
+ trustworthy_advglue_pp,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
72
+ trustworthy_advglue_pp,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
73
+ trustworthy_advglue_pp,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
74
+ trustworthy_ood,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
75
+ trustworthy_ood,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
76
+ trustworthy_ood,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
77
+ trustworthy_adv_demo,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
78
+ trustworthy_adv_demo,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
79
+ trustworthy_adv_demo,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
80
+ trustworthy_privacy,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.07142857142857142,0.9048611111111111
81
+ trustworthy_privacy,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.07142857142857142,0.9048611111111111
82
+ trustworthy_privacy,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111
83
+ trustworthy_ethics,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
84
+ trustworthy_ethics,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
85
+ trustworthy_ethics,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
86
+ trustworthy_fairness,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,-0.6910233190806424,0.017844011512848347
87
+ trustworthy_fairness,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,-0.6910233190806424,0.017844011512848347
88
+ trustworthy_fairness,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,-0.6910233190806424,0.017844011512848347
89
+ OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
90
+ OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
91
+ OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
92
+ OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
93
+ OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.21428571428571427,0.5484126984126985
94
+ OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
95
+ OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
96
+ OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
97
+ OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
98
+ OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
99
+ OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.21428571428571427,0.5484126984126985
100
+ OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066
101
+ OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
102
+ OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
103
+ OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
104
+ OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.036369648372665396,0.9007802600472398
105
+ OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.07142857142857142,0.9048611111111111
106
+ OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,-0.036369648372665396,0.9007802600472398
107
+ OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4447495899966607,0.1315867602811863
108
+ OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2545875386086578,0.38281014365989596
109
+ OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.40006613209931935,0.17023995462900499
110
+ OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066
111
+ OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.21428571428571427,0.5484126984126985
112
+ OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.21428571428571427,0.5484126984126985
113
+ Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
114
+ Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
115
+ Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
116
+ MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
117
+ MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
118
+ MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
119
+ MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
120
+ MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
121
+ MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
122
+ MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
123
+ MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
124
+ MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
125
+ MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
126
+ MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.14285714285714285,0.7195436507936508
127
+ MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
128
+ MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111
129
+ MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
130
+ MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
131
+ MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
132
+ MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
133
+ MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
134
+ MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111
135
+ MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
136
+ MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6910233190806425,0.017844011512848347
137
+ MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
138
+ MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.47280542884465016,0.10506382347888965
139
+ MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6182840223353117,0.0340492747686748
140
+ MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
141
+ MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
142
+ MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
143
+ MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
144
+ MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
145
+ MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
146
+ MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
147
+ MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
148
+ MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
149
+ AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985
150
+ AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
151
+ AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2545875386086578,0.38281014365989596
152
+ HELM AirBench Security Risks,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.42857142857142855,0.17886904761904762
153
+ HELM AirBench Security Risks,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.2857142857142857,0.39875992063492066
154
+ HELM AirBench Security Risks,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.21428571428571427,0.5484126984126985
155
+ HELM AirBench Operational Misuses,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
156
+ HELM AirBench Operational Misuses,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.18184824186332696,0.5330356744917513
157
+ HELM AirBench Operational Misuses,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.42857142857142855,0.17886904761904762
158
+ HELM AirBench Violence & Extremism,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
159
+ HELM AirBench Violence & Extremism,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.7142857142857142,0.014136904761904762
160
+ HELM AirBench Violence & Extremism,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.3571428571428571,0.27509920634920637
161
+ HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
162
+ HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.7142857142857142,0.014136904761904762
163
+ HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.2857142857142857,0.39875992063492066
164
+ HELM AirBench Sexual Content,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.42857142857142855,0.17886904761904762
165
+ HELM AirBench Sexual Content,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
166
+ HELM AirBench Sexual Content,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.42857142857142855,0.17886904761904762
167
+ HELM AirBench Child Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
168
+ HELM AirBench Child Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
169
+ HELM AirBench Child Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.42857142857142855,0.17886904761904762
170
+ HELM AirBench Self Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.40006613209931935,0.17023995462900499
171
+ HELM AirBench Self Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
172
+ HELM AirBench Self Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.40006613209931935,0.17023995462900499
173
+ HELM AirBench Political Usage,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
174
+ HELM AirBench Political Usage,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
175
+ HELM AirBench Political Usage,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.3571428571428571,0.27509920634920637
176
+ HELM AirBench Economic Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
177
+ HELM AirBench Economic Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.6182840223353117,0.0340492747686748
178
+ HELM AirBench Economic Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.3571428571428571,0.27509920634920637
179
+ HELM AirBench Deception,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.4999999999999999,0.10868055555555556
180
+ HELM AirBench Deception,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.6428571428571428,0.03115079365079365
181
+ HELM AirBench Deception,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.40006613209931935,0.17023995462900499
182
+ HELM AirBench Manipulation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
183
+ HELM AirBench Manipulation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.6182840223353117,0.0340492747686748
184
+ HELM AirBench Manipulation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.3571428571428571,0.27509920634920637
185
+ HELM AirBench Defamation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.40006613209931935,0.17023995462900499
186
+ HELM AirBench Defamation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.4999999999999999,0.10868055555555556
187
+ HELM AirBench Defamation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.40006613209931935,0.17023995462900499
188
+ HELM AirBench Fundamental Rights,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.6182840223353117,0.0340492747686748
189
+ HELM AirBench Fundamental Rights,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
190
+ HELM AirBench Fundamental Rights,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.41576092031014994,0.1612822677790775
191
+ HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
192
+ HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
193
+ HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.4999999999999999,0.10868055555555556
194
+ HELM AirBench Privacy,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
195
+ HELM AirBench Privacy,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.3571428571428571,0.27509920634920637
196
+ HELM AirBench Privacy,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.21428571428571427,0.5484126984126985
197
+ HELM AirBench Criminal Activities,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
198
+ HELM AirBench Criminal Activities,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
199
+ HELM AirBench Criminal Activities,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.42857142857142855,0.17886904761904762
200
+ HELM AirBench AIR Score,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.6428571428571428,0.03115079365079365
201
+ HELM AirBench AIR Score,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
202
+ HELM AirBench AIR Score,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.47280542884465016,0.10506382347888965
203
+ OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
204
+ OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
205
+ OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
206
+ OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985
207
+ OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
208
+ OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
209
+ OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
210
+ OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
211
+ OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,-0.21428571428571427,0.5484126984126985
212
+ OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
213
+ OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
214
+ OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
215
+ OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
216
+ OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
217
+ OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508
218
+ OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
219
+ OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6910233190806425,0.017844011512848347
220
+ OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.40006613209931935,0.17023995462900499
221
+ OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
222
+ OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
223
+ OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
224
+ OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
225
+ OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.21428571428571427,0.5484126984126985
226
+ OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,-0.14285714285714285,0.7195436507936508
227
+ OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,0,0.32732683535398854,0.2618277009271762
228
+ OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
229
+ OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,2,0.21428571428571427,0.5484126984126985
230
+ LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
231
+ LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
232
+ LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
233
+ LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
234
+ LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2545875386086578,0.38281014365989596
235
+ LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
236
+ LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
237
+ LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
238
+ LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066
239
+ LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
240
+ LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
241
+ LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
242
+ LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
243
+ LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
244
+ LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,-0.07142857142857142,0.9048611111111111
245
+ LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
246
+ LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
247
+ LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
248
+ LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
249
+ LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
250
+ LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
251
+ Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
252
+ Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
253
+ Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066
254
+ WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
255
+ WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.21428571428571427,0.5484126984126985
256
+ WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
257
+ WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
258
+ WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
259
+ WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
260
+ WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
261
+ WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
262
+ WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
263
+ WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
264
+ WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.21428571428571427,0.5484126984126985
265
+ WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
266
+ WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
267
+ WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
268
+ WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
269
+ WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
270
+ WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
271
+ WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
272
+ WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
273
+ WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
274
+ WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
275
+ Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
276
+ Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,aggregate,aggregate,kendall,random,8,1,0.40006613209931935,0.17023995462900499
277
+ Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
278
+ Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
279
+ Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
280
+ Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,2,0.21428571428571427,0.5484126984126985
281
+ AgentBench,agenbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.32732683535398854,0.2618277009271762
282
+ AgentBench,agenbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.32732683535398854,0.2618277009271762
283
+ AgentBench,agenbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.32732683535398854,0.2618277009271762
284
+ MT-Bench,mtbench_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
285
+ MT-Bench,mtbench_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.6182840223353117,0.0340492747686748
286
+ MT-Bench,mtbench_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
287
+ HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
288
+ HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.21428571428571427,0.5484126984126985
289
+ HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111
290
+ HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
291
+ HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
292
+ HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111
293
+ HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
294
+ HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.0,1.0
295
+ HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508
296
+ HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
297
+ HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
298
+ HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.21428571428571427,0.5484126984126985
299
+ HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
300
+ HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
301
+ HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111
302
+ HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
303
+ HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
304
+ HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.0,1.0
305
+ HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
306
+ HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
307
+ HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
308
+ BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
309
+ BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
310
+ BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
311
+ eq_bench,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,0,0.14285714285714285,0.7195436507936508
312
+ eq_bench,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,1,0.07142857142857142,0.9048611111111111
313
+ eq_bench,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
314
+ magi_hard,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
315
+ magi_hard,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,1,-0.07142857142857142,0.9048611111111111
316
+ magi_hard,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,2,-0.07142857142857142,0.9048611111111111
317
+ BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
318
+ BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,-0.07142857142857142,0.9048611111111111
319
+ BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
320
+ BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
321
+ BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.07142857142857142,0.9048611111111111
322
+ BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
323
+ BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.47280542884465016,0.10506382347888965
324
+ BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.41576092031014994,0.1612822677790775
325
+ BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
326
+ BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
327
+ BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
328
+ BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
329
+ BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
330
+ BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,-0.21428571428571427,0.5484126984126985
331
+ BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
332
+ BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6425396041156862,0.030400749685896046
333
+ BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,-0.036369648372665396,0.9007802600472398
334
+ BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
335
+ BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
336
+ BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.21428571428571427,0.5484126984126985
337
+ BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.47280542884465016,0.10506382347888965
338
+ BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
339
+ BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.14285714285714285,0.7195436507936508
340
+ BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
341
+ BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
342
+ BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.14285714285714285,0.7195436507936508
343
+ BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
344
+ BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111
345
+ BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.47280542884465016,0.10506382347888965
346
+ BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
347
+ LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
348
+ LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
349
+ LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
350
+ LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
351
+ LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
352
+ LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.7637626158259734,0.008839740160738534
353
+ LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
354
+ LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
355
+ LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
356
+ LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
357
+ LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
358
+ LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
359
+ LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
360
+ LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
361
+ LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
362
+ LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
363
+ LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
364
+ LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
365
+ LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
366
+ LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
367
+ LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
368
+ hydrox_integrity,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
369
+ hydrox_integrity,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.0,1.0
370
+ hydrox_integrity,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508
371
+ hydrox_overall_score,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
372
+ hydrox_overall_score,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.0,1.0
373
+ hydrox_overall_score,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508
374
+ hydrox_privacy,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
375
+ hydrox_privacy,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,-0.07142857142857142,0.9048611111111111
376
+ hydrox_privacy,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508
377
+ hydrox_safety,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
378
+ hydrox_safety,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,-0.07142857142857142,0.9048611111111111
379
+ hydrox_safety,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111
380
+ hydrox_security,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
381
+ hydrox_security,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.0,1.0
382
+ hydrox_security,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111
383
+ aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,0,0.5455447255899809,0.0614649096074132
384
+ aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111
385
+ aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132
386
+ aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066
387
+ aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
388
+ aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,2,0.47280542884465016,0.10506382347888965
389
+ aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
390
+ aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
391
+ aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
392
+ aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
393
+ aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
394
+ aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
395
+ aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066
396
+ aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
397
+ aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132
398
+ aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
399
+ aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
400
+ aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
401
+ aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
402
+ aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
403
+ aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,2,0.47280542884465016,0.10506382347888965
404
+ aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,0,0.07142857142857142,0.9048611111111111
405
+ aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
406
+ aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066
407
+ aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
408
+ aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
409
+ aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
410
+ aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066
411
+ aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
412
+ aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
413
+ aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,0,0.2545875386086578,0.38281014365989596
414
+ aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
415
+ aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132
416
+ aggregate,aggregate,LMSys Arena,chatbot_arena_241104.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
417
+ aggregate,aggregate,LMSys Arena,chatbot_arena_241104.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
418
+ aggregate,aggregate,LMSys Arena,chatbot_arena_241104.csv,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05
419
+ aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
420
+ aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
421
+ aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
422
+ aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985
423
+ aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
424
+ aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
425
+ aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
426
+ aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.21428571428571427,0.5484126984126985
427
+ aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111
428
+ aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
429
+ aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
430
+ aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
431
+ aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
432
+ aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
433
+ aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
434
+ aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
435
+ aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
436
+ aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
437
+ aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
438
+ aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
439
+ aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111
440
+ aggregate,aggregate,tablebench_overall_dp,tablebench_241002.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111
441
+ aggregate,aggregate,tablebench_overall_dp,tablebench_241002.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
442
+ aggregate,aggregate,tablebench_overall_dp,tablebench_241002.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066
443
+ aggregate,aggregate,trustworthy_average,llm_trustworthy_241001.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
444
+ aggregate,aggregate,trustworthy_average,llm_trustworthy_241001.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
445
+ aggregate,aggregate,trustworthy_average,llm_trustworthy_241001.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
446
+ aggregate,aggregate,trustworthy_non_toxicity,llm_trustworthy_241001.csv,kendall,random,8,0,0.14285714285714285,0.7195436507936508
447
+ aggregate,aggregate,trustworthy_non_toxicity,llm_trustworthy_241001.csv,kendall,random,8,1,0.14285714285714285,0.7195436507936508
448
+ aggregate,aggregate,trustworthy_non_toxicity,llm_trustworthy_241001.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508
449
+ aggregate,aggregate,trustworthy_non_stereotype,llm_trustworthy_241001.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
450
+ aggregate,aggregate,trustworthy_non_stereotype,llm_trustworthy_241001.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
451
+ aggregate,aggregate,trustworthy_non_stereotype,llm_trustworthy_241001.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
452
+ aggregate,aggregate,trustworthy_advglue_pp,llm_trustworthy_241001.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
453
+ aggregate,aggregate,trustworthy_advglue_pp,llm_trustworthy_241001.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
454
+ aggregate,aggregate,trustworthy_advglue_pp,llm_trustworthy_241001.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
455
+ aggregate,aggregate,trustworthy_ood,llm_trustworthy_241001.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
456
+ aggregate,aggregate,trustworthy_ood,llm_trustworthy_241001.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
457
+ aggregate,aggregate,trustworthy_ood,llm_trustworthy_241001.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
458
+ aggregate,aggregate,trustworthy_adv_demo,llm_trustworthy_241001.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
459
+ aggregate,aggregate,trustworthy_adv_demo,llm_trustworthy_241001.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
460
+ aggregate,aggregate,trustworthy_adv_demo,llm_trustworthy_241001.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
461
+ aggregate,aggregate,trustworthy_privacy,llm_trustworthy_241001.csv,kendall,random,8,0,0.07142857142857142,0.9048611111111111
462
+ aggregate,aggregate,trustworthy_privacy,llm_trustworthy_241001.csv,kendall,random,8,1,0.07142857142857142,0.9048611111111111
463
+ aggregate,aggregate,trustworthy_privacy,llm_trustworthy_241001.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111
464
+ aggregate,aggregate,trustworthy_ethics,llm_trustworthy_241001.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
465
+ aggregate,aggregate,trustworthy_ethics,llm_trustworthy_241001.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
466
+ aggregate,aggregate,trustworthy_ethics,llm_trustworthy_241001.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
467
+ aggregate,aggregate,trustworthy_fairness,llm_trustworthy_241001.csv,kendall,random,8,0,-0.6910233190806424,0.017844011512848347
468
+ aggregate,aggregate,trustworthy_fairness,llm_trustworthy_241001.csv,kendall,random,8,1,-0.6910233190806424,0.017844011512848347
469
+ aggregate,aggregate,trustworthy_fairness,llm_trustworthy_241001.csv,kendall,random,8,2,-0.6910233190806424,0.017844011512848347
470
+ aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
471
+ aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
472
+ aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
473
+ aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
474
+ aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,1,0.21428571428571427,0.5484126984126985
475
+ aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
476
+ aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
477
+ aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
478
+ aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
479
+ aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
480
+ aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,8,1,0.21428571428571427,0.5484126984126985
481
+ aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066
482
+ aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
483
+ aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
484
+ aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
485
+ aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,0,0.036369648372665396,0.9007802600472398
486
+ aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,1,0.07142857142857142,0.9048611111111111
487
+ aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,2,-0.036369648372665396,0.9007802600472398
488
+ aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,0,0.4447495899966607,0.1315867602811863
489
+ aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,1,0.2545875386086578,0.38281014365989596
490
+ aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,2,0.40006613209931935,0.17023995462900499
491
+ aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066
492
+ aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,8,1,0.21428571428571427,0.5484126984126985
493
+ aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,8,2,0.21428571428571427,0.5484126984126985
494
+ aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
495
+ aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
496
+ aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
497
+ aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
498
+ aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
499
+ aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
500
+ aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
501
+ aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
502
+ aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
503
+ aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
504
+ aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
505
+ aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
506
+ aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
507
+ aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,1,0.14285714285714285,0.7195436507936508
508
+ aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
509
+ aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111
510
+ aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
511
+ aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
512
+ aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
513
+ aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
514
+ aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
515
+ aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111
516
+ aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
517
+ aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,2,0.6910233190806425,0.017844011512848347
518
+ aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
519
+ aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,1,0.47280542884465016,0.10506382347888965
520
+ aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,2,0.6182840223353117,0.0340492747686748
521
+ aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
522
+ aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
523
+ aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
524
+ aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
525
+ aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
526
+ aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
527
+ aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
528
+ aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
529
+ aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
530
+ aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985
531
+ aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
532
+ aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,2,0.2545875386086578,0.38281014365989596
533
+ aggregate,aggregate,HELM AirBench Security Risks,helm_airbench_240916.csv,kendall,random,8,0,-0.42857142857142855,0.17886904761904762
534
+ aggregate,aggregate,HELM AirBench Security Risks,helm_airbench_240916.csv,kendall,random,8,1,-0.2857142857142857,0.39875992063492066
535
+ aggregate,aggregate,HELM AirBench Security Risks,helm_airbench_240916.csv,kendall,random,8,2,-0.21428571428571427,0.5484126984126985
536
+ aggregate,aggregate,HELM AirBench Operational Misuses,helm_airbench_240916.csv,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
537
+ aggregate,aggregate,HELM AirBench Operational Misuses,helm_airbench_240916.csv,kendall,random,8,1,-0.18184824186332696,0.5330356744917513
538
+ aggregate,aggregate,HELM AirBench Operational Misuses,helm_airbench_240916.csv,kendall,random,8,2,-0.42857142857142855,0.17886904761904762
539
+ aggregate,aggregate,HELM AirBench Violence & Extremism,helm_airbench_240916.csv,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
540
+ aggregate,aggregate,HELM AirBench Violence & Extremism,helm_airbench_240916.csv,kendall,random,8,1,-0.7142857142857142,0.014136904761904762
541
+ aggregate,aggregate,HELM AirBench Violence & Extremism,helm_airbench_240916.csv,kendall,random,8,2,-0.3571428571428571,0.27509920634920637
542
+ aggregate,aggregate,HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
543
+ aggregate,aggregate,HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,kendall,random,8,1,-0.7142857142857142,0.014136904761904762
544
+ aggregate,aggregate,HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,kendall,random,8,2,-0.2857142857142857,0.39875992063492066
545
+ aggregate,aggregate,HELM AirBench Sexual Content,helm_airbench_240916.csv,kendall,random,8,0,-0.42857142857142855,0.17886904761904762
546
+ aggregate,aggregate,HELM AirBench Sexual Content,helm_airbench_240916.csv,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
547
+ aggregate,aggregate,HELM AirBench Sexual Content,helm_airbench_240916.csv,kendall,random,8,2,-0.42857142857142855,0.17886904761904762
548
+ aggregate,aggregate,HELM AirBench Child Harm,helm_airbench_240916.csv,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
549
+ aggregate,aggregate,HELM AirBench Child Harm,helm_airbench_240916.csv,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
550
+ aggregate,aggregate,HELM AirBench Child Harm,helm_airbench_240916.csv,kendall,random,8,2,-0.42857142857142855,0.17886904761904762
551
+ aggregate,aggregate,HELM AirBench Self Harm,helm_airbench_240916.csv,kendall,random,8,0,-0.40006613209931935,0.17023995462900499
552
+ aggregate,aggregate,HELM AirBench Self Harm,helm_airbench_240916.csv,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
553
+ aggregate,aggregate,HELM AirBench Self Harm,helm_airbench_240916.csv,kendall,random,8,2,-0.40006613209931935,0.17023995462900499
554
+ aggregate,aggregate,HELM AirBench Political Usage,helm_airbench_240916.csv,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
555
+ aggregate,aggregate,HELM AirBench Political Usage,helm_airbench_240916.csv,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
556
+ aggregate,aggregate,HELM AirBench Political Usage,helm_airbench_240916.csv,kendall,random,8,2,-0.3571428571428571,0.27509920634920637
557
+ aggregate,aggregate,HELM AirBench Economic Harm,helm_airbench_240916.csv,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
558
+ aggregate,aggregate,HELM AirBench Economic Harm,helm_airbench_240916.csv,kendall,random,8,1,-0.6182840223353117,0.0340492747686748
559
+ aggregate,aggregate,HELM AirBench Economic Harm,helm_airbench_240916.csv,kendall,random,8,2,-0.3571428571428571,0.27509920634920637
560
+ aggregate,aggregate,HELM AirBench Deception,helm_airbench_240916.csv,kendall,random,8,0,-0.4999999999999999,0.10868055555555556
561
+ aggregate,aggregate,HELM AirBench Deception,helm_airbench_240916.csv,kendall,random,8,1,-0.6428571428571428,0.03115079365079365
562
+ aggregate,aggregate,HELM AirBench Deception,helm_airbench_240916.csv,kendall,random,8,2,-0.40006613209931935,0.17023995462900499
563
+ aggregate,aggregate,HELM AirBench Manipulation,helm_airbench_240916.csv,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
564
+ aggregate,aggregate,HELM AirBench Manipulation,helm_airbench_240916.csv,kendall,random,8,1,-0.6182840223353117,0.0340492747686748
565
+ aggregate,aggregate,HELM AirBench Manipulation,helm_airbench_240916.csv,kendall,random,8,2,-0.3571428571428571,0.27509920634920637
566
+ aggregate,aggregate,HELM AirBench Defamation,helm_airbench_240916.csv,kendall,random,8,0,-0.40006613209931935,0.17023995462900499
567
+ aggregate,aggregate,HELM AirBench Defamation,helm_airbench_240916.csv,kendall,random,8,1,-0.4999999999999999,0.10868055555555556
568
+ aggregate,aggregate,HELM AirBench Defamation,helm_airbench_240916.csv,kendall,random,8,2,-0.40006613209931935,0.17023995462900499
569
+ aggregate,aggregate,HELM AirBench Fundamental Rights,helm_airbench_240916.csv,kendall,random,8,0,-0.6182840223353117,0.0340492747686748
570
+ aggregate,aggregate,HELM AirBench Fundamental Rights,helm_airbench_240916.csv,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
571
+ aggregate,aggregate,HELM AirBench Fundamental Rights,helm_airbench_240916.csv,kendall,random,8,2,-0.41576092031014994,0.1612822677790775
572
+ aggregate,aggregate,HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
573
+ aggregate,aggregate,HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
574
+ aggregate,aggregate,HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,kendall,random,8,2,-0.4999999999999999,0.10868055555555556
575
+ aggregate,aggregate,HELM AirBench Privacy,helm_airbench_240916.csv,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
576
+ aggregate,aggregate,HELM AirBench Privacy,helm_airbench_240916.csv,kendall,random,8,1,-0.3571428571428571,0.27509920634920637
577
+ aggregate,aggregate,HELM AirBench Privacy,helm_airbench_240916.csv,kendall,random,8,2,-0.21428571428571427,0.5484126984126985
578
+ aggregate,aggregate,HELM AirBench Criminal Activities,helm_airbench_240916.csv,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
579
+ aggregate,aggregate,HELM AirBench Criminal Activities,helm_airbench_240916.csv,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
580
+ aggregate,aggregate,HELM AirBench Criminal Activities,helm_airbench_240916.csv,kendall,random,8,2,-0.42857142857142855,0.17886904761904762
581
+ aggregate,aggregate,HELM AirBench AIR Score,helm_airbench_240916.csv,kendall,random,8,0,-0.6428571428571428,0.03115079365079365
582
+ aggregate,aggregate,HELM AirBench AIR Score,helm_airbench_240916.csv,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
583
+ aggregate,aggregate,HELM AirBench AIR Score,helm_airbench_240916.csv,kendall,random,8,2,-0.47280542884465016,0.10506382347888965
584
+ aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
585
+ aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
586
+ aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
587
+ aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985
588
+ aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
589
+ aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
590
+ aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
591
+ aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
592
+ aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,2,-0.21428571428571427,0.5484126984126985
593
+ aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
594
+ aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
595
+ aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
596
+ aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
597
+ aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
598
+ aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508
599
+ aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
600
+ aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,1,0.6910233190806425,0.017844011512848347
601
+ aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,2,0.40006613209931935,0.17023995462900499
602
+ aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
603
+ aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
604
+ aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
605
+ aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
606
+ aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,1,0.21428571428571427,0.5484126984126985
607
+ aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,2,-0.14285714285714285,0.7195436507936508
608
+ aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,0,0.32732683535398854,0.2618277009271762
609
+ aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
610
+ aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,2,0.21428571428571427,0.5484126984126985
611
+ aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
612
+ aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
613
+ aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
614
+ aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
615
+ aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,1,0.2545875386086578,0.38281014365989596
616
+ aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
617
+ aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
618
+ aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
619
+ aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066
620
+ aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
621
+ aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
622
+ aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
623
+ aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
624
+ aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
625
+ aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,2,-0.07142857142857142,0.9048611111111111
626
+ aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
627
+ aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
628
+ aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
629
+ aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
630
+ aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
631
+ aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
632
+ aggregate,aggregate,Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
633
+ aggregate,aggregate,Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
634
+ aggregate,aggregate,Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066
635
+ aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
636
+ aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,1,0.21428571428571427,0.5484126984126985
637
+ aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
638
+ aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
639
+ aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
640
+ aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
641
+ aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
642
+ aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
643
+ aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
644
+ aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
645
+ aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,1,0.21428571428571427,0.5484126984126985
646
+ aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
647
+ aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
648
+ aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
649
+ aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
650
+ aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
651
+ aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
652
+ aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
653
+ aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
654
+ aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
655
+ aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
656
+ aggregate,aggregate,Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
657
+ aggregate,aggregate,Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,kendall,random,8,1,0.40006613209931935,0.17023995462900499
658
+ aggregate,aggregate,Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
659
+ aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
660
+ aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
661
+ aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,2,0.21428571428571427,0.5484126984126985
662
+ aggregate,aggregate,AgentBench,agenbench_240829.csv,kendall,random,8,0,0.32732683535398854,0.2618277009271762
663
+ aggregate,aggregate,AgentBench,agenbench_240829.csv,kendall,random,8,1,0.32732683535398854,0.2618277009271762
664
+ aggregate,aggregate,AgentBench,agenbench_240829.csv,kendall,random,8,2,0.32732683535398854,0.2618277009271762
665
+ aggregate,aggregate,MT-Bench,mtbench_240829_frozen.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
666
+ aggregate,aggregate,MT-Bench,mtbench_240829_frozen.csv,kendall,random,8,1,0.6182840223353117,0.0340492747686748
667
+ aggregate,aggregate,MT-Bench,mtbench_240829_frozen.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
668
+ aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
669
+ aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.21428571428571427,0.5484126984126985
670
+ aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111
671
+ aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
672
+ aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
673
+ aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111
674
+ aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
675
+ aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.0,1.0
676
+ aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508
677
+ aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
678
+ aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
679
+ aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.21428571428571427,0.5484126984126985
680
+ aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
681
+ aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
682
+ aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111
683
+ aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
684
+ aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
685
+ aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.0,1.0
686
+ aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
687
+ aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
688
+ aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
689
+ aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
690
+ aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
691
+ aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
692
+ aggregate,aggregate,eq_bench,eqbench_240912.csv,kendall,random,8,0,0.14285714285714285,0.7195436507936508
693
+ aggregate,aggregate,eq_bench,eqbench_240912.csv,kendall,random,8,1,0.07142857142857142,0.9048611111111111
694
+ aggregate,aggregate,eq_bench,eqbench_240912.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
695
+ aggregate,aggregate,magi_hard,eqbench_240912.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
696
+ aggregate,aggregate,magi_hard,eqbench_240912.csv,kendall,random,8,1,-0.07142857142857142,0.9048611111111111
697
+ aggregate,aggregate,magi_hard,eqbench_240912.csv,kendall,random,8,2,-0.07142857142857142,0.9048611111111111
698
+ aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
699
+ aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,1,-0.07142857142857142,0.9048611111111111
700
+ aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
701
+ aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
702
+ aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,1,0.07142857142857142,0.9048611111111111
703
+ aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
704
+ aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,0,0.47280542884465016,0.10506382347888965
705
+ aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,1,0.41576092031014994,0.1612822677790775
706
+ aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
707
+ aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
708
+ aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
709
+ aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
710
+ aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
711
+ aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,1,-0.21428571428571427,0.5484126984126985
712
+ aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
713
+ aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,0,0.6425396041156862,0.030400749685896046
714
+ aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,1,-0.036369648372665396,0.9007802600472398
715
+ aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
716
+ aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
717
+ aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,1,0.21428571428571427,0.5484126984126985
718
+ aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,2,0.47280542884465016,0.10506382347888965
719
+ aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
720
+ aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,1,0.14285714285714285,0.7195436507936508
721
+ aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
722
+ aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
723
+ aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,1,0.14285714285714285,0.7195436507936508
724
+ aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
725
+ aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111
726
+ aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,1,0.47280542884465016,0.10506382347888965
727
+ aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
728
+ aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
729
+ aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
730
+ aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
731
+ aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
732
+ aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
733
+ aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,2,0.7637626158259734,0.008839740160738534
734
+ aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
735
+ aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
736
+ aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
737
+ aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
738
+ aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
739
+ aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
740
+ aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
741
+ aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
742
+ aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
743
+ aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
744
+ aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
745
+ aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
746
+ aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
747
+ aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
748
+ aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
749
+ aggregate,aggregate,hydrox_integrity,hydrox_safety_241001.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
750
+ aggregate,aggregate,hydrox_integrity,hydrox_safety_241001.csv,kendall,random,8,1,0.0,1.0
751
+ aggregate,aggregate,hydrox_integrity,hydrox_safety_241001.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508
752
+ aggregate,aggregate,hydrox_overall_score,hydrox_safety_241001.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
753
+ aggregate,aggregate,hydrox_overall_score,hydrox_safety_241001.csv,kendall,random,8,1,0.0,1.0
754
+ aggregate,aggregate,hydrox_overall_score,hydrox_safety_241001.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508
755
+ aggregate,aggregate,hydrox_privacy,hydrox_safety_241001.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
756
+ aggregate,aggregate,hydrox_privacy,hydrox_safety_241001.csv,kendall,random,8,1,-0.07142857142857142,0.9048611111111111
757
+ aggregate,aggregate,hydrox_privacy,hydrox_safety_241001.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508
758
+ aggregate,aggregate,hydrox_safety,hydrox_safety_241001.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
759
+ aggregate,aggregate,hydrox_safety,hydrox_safety_241001.csv,kendall,random,8,1,-0.07142857142857142,0.9048611111111111
760
+ aggregate,aggregate,hydrox_safety,hydrox_safety_241001.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111
761
+ aggregate,aggregate,hydrox_security,hydrox_safety_241001.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
762
+ aggregate,aggregate,hydrox_security,hydrox_safety_241001.csv,kendall,random,8,1,0.0,1.0
763
+ aggregate,aggregate,hydrox_security,hydrox_safety_241001.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111
cache/agreements_cache_5e66a88dab42480065db47711c55c458.csv CHANGED
The diff for this file is too large to render. See raw diff
 
cache/allbenchs_cache_05c0405c5253dda90dc632e052accfd2.csv ADDED
The diff for this file is too large to render. See raw diff
 
cache/allbenchs_cache_5e66a88dab42480065db47711c55c458.csv CHANGED
The diff for this file is too large to render. See raw diff