Yotam-Perlitz commited on
Commit
871f49c
1 Parent(s): 697d2f9

re-write cache

Browse files

Signed-off-by: Yotam-Perlitz <y.perlitz@ibm.com>

cache/aggregate_scoress_cache_05c0405c5253dda90dc632e052accfd2.csv DELETED
@@ -1,130 +0,0 @@
1
- model,score
2
- gemini_1_5_pro_api_0409_preview,1.0
3
- gemini_1_5_pro_exp_0801,0.9921875
4
- chatgpt_4o_latest,0.984375
5
- gpt_3_5_turbo_0314,0.9765625
6
- bard_jan_24_gemini_pro,0.96875
7
- claude_1,0.9609375
8
- gemini_advanced_0514,0.953125
9
- llama3_1_70b_instruct,0.9453125
10
- gpt_4o_2024_05_13,0.9375
11
- gpt_4o_2024_08_06,0.9296875
12
- gpt_4o_mini_2024_07_18,0.921875
13
- claude_3_5_sonnet_20240620,0.9140625
14
- claude_3_opus_20240229,0.90625
15
- athene_70b_0725,0.8984375
16
- gemini_pro_dev_api,0.890625
17
- claude_2_0,0.8828125
18
- glm_4_0520,0.875
19
- nemotron_4_340b_instruct,0.8671875
20
- yi_large_preview,0.859375
21
- llama_2_70b_chat,0.8515625
22
- reka_core_20240722,0.84375
23
- gemini_1_5_pro_api_0514,0.8359375
24
- gemini_pro,0.828125
25
- llama3_1_405b_instruct,0.8203125
26
- mistral_large_2407,0.8125
27
- gpt_4_turbo_2024_04_09,0.8046875
28
- gpt_4_1106_preview,0.796875
29
- gpt_3_5_turbo_0613,0.7890625
30
- gpt_4_0125_preview,0.78125
31
- glm_4_0116,0.7734375
32
- zephyr_orpo_141b_a35b_v0_1,0.765625
33
- qwen_max_0428,0.7578125
34
- claude_instant_1,0.75
35
- yi_large,0.7421875
36
- deepseek_coder_v2_0724,0.734375
37
- deepseek_v2_api_0628,0.7265625
38
- gemini_1_5_flash_api_0514,0.71875
39
- llama3_70b_instruct,0.7109375
40
- command_r_plus,0.703125
41
- gpt_4_0314,0.6953125
42
- claude_2_1,0.6875
43
- wizardlm_70b,0.6796875
44
- gemma_2_27b_it,0.671875
45
- dolphin_2_2_1_mistral_7b,0.6640625
46
- guanaco_33b,0.65625
47
- nous_hermes_2_mixtral_8x7b_dpo,0.6484375
48
- wizardlm_13b,0.640625
49
- mpt_30b_chat,0.6328125
50
- qwen1_5_110b_chat,0.625
51
- claude_3_sonnet_20240229,0.6171875
52
- mistral_next,0.609375
53
- deepseek_coder_v2,0.6015625
54
- reka_flash_21b_20240226_online,0.59375
55
- starling_lm_7b_beta,0.5859375
56
- llama2_70b_steerlm_chat,0.578125
57
- mistral_medium,0.5703125
58
- llama_2_13b_chat,0.5625
59
- tulu_2_dpo_70b,0.5546875
60
- reka_core_20240501,0.546875
61
- gpt_4_0613,0.5390625
62
- deepseek_llm_67b_chat,0.53125
63
- solar_10_7b_instruct_v1_0,0.5234375
64
- openchat_3_5_0106,0.515625
65
- reka_flash_20240722,0.5078125
66
- gemma_2_9b_it,0.5
67
- llama3_1_8b_instruct,0.4921875
68
- openchat_3_5,0.484375
69
- pplx_7b_online,0.4765625
70
- qwen1_5_72b_chat,0.46875
71
- zephyr_7b_alpha,0.4609375
72
- claude_3_haiku_20240307,0.453125
73
- starling_lm_7b_alpha,0.4453125
74
- reka_flash_21b_20240226,0.4375
75
- mistral_large_2402,0.4296875
76
- gpt_3_5_turbo_1106,0.421875
77
- qwen1_5_7b_chat,0.4140625
78
- reka_flash_preview_20240611,0.40625
79
- yi_1_5_34b_chat,0.3984375
80
- openhermes_2_5_mistral_7b,0.390625
81
- codellama34b_instruct,0.3828125
82
- qwen1_5_14b_chat,0.375
83
- yi_34b_chat,0.3671875
84
- pplx_70b_online,0.359375
85
- qwen2_72b_instruct,0.3515625
86
- dbrx_instructruct_preview,0.34375
87
- llama3_8b_instruct,0.3359375
88
- falcon_180b_chat,0.328125
89
- palm_2,0.3203125
90
- qwen_14b_chat,0.3125
91
- stripedhyena_nous_7b,0.3046875
92
- qwen1_5_32b_chat,0.296875
93
- command_r,0.2890625
94
- gemma_7b_it,0.28125
95
- zephyr_7b_beta,0.2734375
96
- mixtral_8x22b_instruct_v0_1,0.265625
97
- vicuna_7b,0.2578125
98
- snowflake_arctic_instruct,0.25
99
- vicuna_33b,0.2421875
100
- gemma_2_2b_it,0.234375
101
- koala_13b,0.2265625
102
- gpt_3_5_turbo_0125,0.21875
103
- mistral_7b_instruct,0.2109375
104
- llama_2_7b_chat,0.203125
105
- mistral_7b_instruct_v0_2,0.1953125
106
- gemma_1_1_7b_it,0.1875
107
- gpt4all_13b_snoozy,0.1796875
108
- phi_3_small_8k_instruct,0.171875
109
- olmo_7b_instruct,0.1640625
110
- phi_3_mini_4k_instruct,0.15625
111
- phi_3_mini_128k_instruct,0.1484375
112
- rwkv_4_raven_14b,0.140625
113
- vicuna_13b,0.1328125
114
- codellama_70b_instruct,0.125
115
- mpt_7b_chat,0.1171875
116
- mixtral_8x7b_instruct_v0_1,0.109375
117
- phi_3_medium_4k_instruct,0.1015625
118
- gemma_2b_it,0.09375
119
- phi_3_mini_4k_instruct_june_2024,0.0859375
120
- qwen1_5_4b_chat,0.078125
121
- chatglm_6b,0.0703125
122
- alpaca_13b,0.0625
123
- gemma_1_1_2b_it,0.0546875
124
- chatglm2_6b,0.046875
125
- stablelm_tuned_alpha_7b,0.0390625
126
- chatglm3_6b,0.03125
127
- oasst_pythia_12b,0.0234375
128
- llama_13b,0.015625
129
- fastchat_t5_3b,0.0078125
130
- dolly_v2_12b,0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cache/agreements_cache_05c0405c5253dda90dc632e052accfd2.csv DELETED
@@ -1,763 +0,0 @@
1
- scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value
2
- Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5455447255899809,0.0614649096074132
3
- Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111
4
- Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132
5
- Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066
6
- Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
7
- Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.47280542884465016,0.10506382347888965
8
- Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
9
- Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
10
- Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
11
- Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
12
- Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
13
- Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
14
- Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066
15
- Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
16
- Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132
17
- Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
18
- Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
19
- Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
20
- Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
21
- Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
22
- Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.47280542884465016,0.10506382347888965
23
- Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.07142857142857142,0.9048611111111111
24
- Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
25
- Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066
26
- Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
27
- Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
28
- Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
29
- Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066
30
- Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
31
- Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
32
- Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.2545875386086578,0.38281014365989596
33
- Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
34
- Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132
35
- LMSys Arena,chatbot_arena_241104.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
36
- LMSys Arena,chatbot_arena_241104.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
37
- LMSys Arena,chatbot_arena_241104.csv,aggregate,aggregate,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05
38
- HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
39
- HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
40
- HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
41
- HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985
42
- HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
43
- HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
44
- HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
45
- HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.21428571428571427,0.5484126984126985
46
- HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111
47
- HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
48
- HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
49
- HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
50
- HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
51
- HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
52
- HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
53
- HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
54
- HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
55
- HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
56
- HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
57
- HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
58
- HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111
59
- tablebench_overall_dp,tablebench_241002.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111
60
- tablebench_overall_dp,tablebench_241002.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
61
- tablebench_overall_dp,tablebench_241002.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066
62
- trustworthy_average,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
63
- trustworthy_average,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
64
- trustworthy_average,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
65
- trustworthy_non_toxicity,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.14285714285714285,0.7195436507936508
66
- trustworthy_non_toxicity,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.14285714285714285,0.7195436507936508
67
- trustworthy_non_toxicity,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508
68
- trustworthy_non_stereotype,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
69
- trustworthy_non_stereotype,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
70
- trustworthy_non_stereotype,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
71
- trustworthy_advglue_pp,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
72
- trustworthy_advglue_pp,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
73
- trustworthy_advglue_pp,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
74
- trustworthy_ood,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
75
- trustworthy_ood,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
76
- trustworthy_ood,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
77
- trustworthy_adv_demo,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
78
- trustworthy_adv_demo,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
79
- trustworthy_adv_demo,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
80
- trustworthy_privacy,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.07142857142857142,0.9048611111111111
81
- trustworthy_privacy,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.07142857142857142,0.9048611111111111
82
- trustworthy_privacy,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111
83
- trustworthy_ethics,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
84
- trustworthy_ethics,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
85
- trustworthy_ethics,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
86
- trustworthy_fairness,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,-0.6910233190806424,0.017844011512848347
87
- trustworthy_fairness,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,-0.6910233190806424,0.017844011512848347
88
- trustworthy_fairness,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,-0.6910233190806424,0.017844011512848347
89
- OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
90
- OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
91
- OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
92
- OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
93
- OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.21428571428571427,0.5484126984126985
94
- OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
95
- OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
96
- OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
97
- OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
98
- OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
99
- OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.21428571428571427,0.5484126984126985
100
- OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066
101
- OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
102
- OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
103
- OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
104
- OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.036369648372665396,0.9007802600472398
105
- OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.07142857142857142,0.9048611111111111
106
- OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,-0.036369648372665396,0.9007802600472398
107
- OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4447495899966607,0.1315867602811863
108
- OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2545875386086578,0.38281014365989596
109
- OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.40006613209931935,0.17023995462900499
110
- OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066
111
- OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.21428571428571427,0.5484126984126985
112
- OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.21428571428571427,0.5484126984126985
113
- Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
114
- Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
115
- Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
116
- MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
117
- MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
118
- MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
119
- MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
120
- MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
121
- MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
122
- MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
123
- MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
124
- MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
125
- MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
126
- MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.14285714285714285,0.7195436507936508
127
- MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
128
- MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111
129
- MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
130
- MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
131
- MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
132
- MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
133
- MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
134
- MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111
135
- MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
136
- MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6910233190806425,0.017844011512848347
137
- MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
138
- MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.47280542884465016,0.10506382347888965
139
- MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6182840223353117,0.0340492747686748
140
- MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
141
- MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
142
- MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
143
- MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
144
- MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
145
- MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
146
- MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
147
- MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
148
- MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
149
- AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985
150
- AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
151
- AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2545875386086578,0.38281014365989596
152
- HELM AirBench Security Risks,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.42857142857142855,0.17886904761904762
153
- HELM AirBench Security Risks,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.2857142857142857,0.39875992063492066
154
- HELM AirBench Security Risks,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.21428571428571427,0.5484126984126985
155
- HELM AirBench Operational Misuses,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
156
- HELM AirBench Operational Misuses,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.18184824186332696,0.5330356744917513
157
- HELM AirBench Operational Misuses,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.42857142857142855,0.17886904761904762
158
- HELM AirBench Violence & Extremism,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
159
- HELM AirBench Violence & Extremism,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.7142857142857142,0.014136904761904762
160
- HELM AirBench Violence & Extremism,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.3571428571428571,0.27509920634920637
161
- HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
162
- HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.7142857142857142,0.014136904761904762
163
- HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.2857142857142857,0.39875992063492066
164
- HELM AirBench Sexual Content,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.42857142857142855,0.17886904761904762
165
- HELM AirBench Sexual Content,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
166
- HELM AirBench Sexual Content,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.42857142857142855,0.17886904761904762
167
- HELM AirBench Child Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
168
- HELM AirBench Child Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
169
- HELM AirBench Child Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.42857142857142855,0.17886904761904762
170
- HELM AirBench Self Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.40006613209931935,0.17023995462900499
171
- HELM AirBench Self Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
172
- HELM AirBench Self Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.40006613209931935,0.17023995462900499
173
- HELM AirBench Political Usage,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
174
- HELM AirBench Political Usage,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
175
- HELM AirBench Political Usage,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.3571428571428571,0.27509920634920637
176
- HELM AirBench Economic Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
177
- HELM AirBench Economic Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.6182840223353117,0.0340492747686748
178
- HELM AirBench Economic Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.3571428571428571,0.27509920634920637
179
- HELM AirBench Deception,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.4999999999999999,0.10868055555555556
180
- HELM AirBench Deception,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.6428571428571428,0.03115079365079365
181
- HELM AirBench Deception,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.40006613209931935,0.17023995462900499
182
- HELM AirBench Manipulation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
183
- HELM AirBench Manipulation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.6182840223353117,0.0340492747686748
184
- HELM AirBench Manipulation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.3571428571428571,0.27509920634920637
185
- HELM AirBench Defamation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.40006613209931935,0.17023995462900499
186
- HELM AirBench Defamation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.4999999999999999,0.10868055555555556
187
- HELM AirBench Defamation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.40006613209931935,0.17023995462900499
188
- HELM AirBench Fundamental Rights,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.6182840223353117,0.0340492747686748
189
- HELM AirBench Fundamental Rights,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
190
- HELM AirBench Fundamental Rights,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.41576092031014994,0.1612822677790775
191
- HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
192
- HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
193
- HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.4999999999999999,0.10868055555555556
194
- HELM AirBench Privacy,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
195
- HELM AirBench Privacy,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.3571428571428571,0.27509920634920637
196
- HELM AirBench Privacy,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.21428571428571427,0.5484126984126985
197
- HELM AirBench Criminal Activities,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
198
- HELM AirBench Criminal Activities,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
199
- HELM AirBench Criminal Activities,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.42857142857142855,0.17886904761904762
200
- HELM AirBench AIR Score,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.6428571428571428,0.03115079365079365
201
- HELM AirBench AIR Score,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
202
- HELM AirBench AIR Score,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.47280542884465016,0.10506382347888965
203
- OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
204
- OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
205
- OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
206
- OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985
207
- OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
208
- OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
209
- OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
210
- OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
211
- OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,-0.21428571428571427,0.5484126984126985
212
- OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
213
- OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
214
- OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
215
- OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
216
- OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
217
- OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508
218
- OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
219
- OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6910233190806425,0.017844011512848347
220
- OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.40006613209931935,0.17023995462900499
221
- OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
222
- OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
223
- OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
224
- OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
225
- OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.21428571428571427,0.5484126984126985
226
- OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,-0.14285714285714285,0.7195436507936508
227
- OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,0,0.32732683535398854,0.2618277009271762
228
- OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
229
- OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,2,0.21428571428571427,0.5484126984126985
230
- LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
231
- LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
232
- LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
233
- LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
234
- LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2545875386086578,0.38281014365989596
235
- LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
236
- LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
237
- LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
238
- LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066
239
- LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
240
- LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
241
- LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
242
- LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
243
- LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
244
- LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,-0.07142857142857142,0.9048611111111111
245
- LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
246
- LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
247
- LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
248
- LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
249
- LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
250
- LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
251
- Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
252
- Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
253
- Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066
254
- WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
255
- WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.21428571428571427,0.5484126984126985
256
- WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
257
- WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
258
- WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
259
- WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
260
- WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
261
- WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
262
- WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
263
- WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
264
- WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.21428571428571427,0.5484126984126985
265
- WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
266
- WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
267
- WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
268
- WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
269
- WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
270
- WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
271
- WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
272
- WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
273
- WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
274
- WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
275
- Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
276
- Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,aggregate,aggregate,kendall,random,8,1,0.40006613209931935,0.17023995462900499
277
- Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
278
- Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
279
- Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
280
- Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,2,0.21428571428571427,0.5484126984126985
281
- AgentBench,agenbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.32732683535398854,0.2618277009271762
282
- AgentBench,agenbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.32732683535398854,0.2618277009271762
283
- AgentBench,agenbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.32732683535398854,0.2618277009271762
284
- MT-Bench,mtbench_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
285
- MT-Bench,mtbench_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.6182840223353117,0.0340492747686748
286
- MT-Bench,mtbench_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
287
- HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
288
- HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.21428571428571427,0.5484126984126985
289
- HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111
290
- HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
291
- HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
292
- HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111
293
- HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
294
- HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.0,1.0
295
- HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508
296
- HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
297
- HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
298
- HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.21428571428571427,0.5484126984126985
299
- HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
300
- HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
301
- HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111
302
- HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
303
- HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
304
- HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.0,1.0
305
- HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
306
- HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
307
- HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
308
- BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
309
- BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
310
- BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
311
- eq_bench,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,0,0.14285714285714285,0.7195436507936508
312
- eq_bench,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,1,0.07142857142857142,0.9048611111111111
313
- eq_bench,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
314
- magi_hard,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
315
- magi_hard,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,1,-0.07142857142857142,0.9048611111111111
316
- magi_hard,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,2,-0.07142857142857142,0.9048611111111111
317
- BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
318
- BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,-0.07142857142857142,0.9048611111111111
319
- BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
320
- BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
321
- BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.07142857142857142,0.9048611111111111
322
- BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
323
- BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.47280542884465016,0.10506382347888965
324
- BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.41576092031014994,0.1612822677790775
325
- BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
326
- BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
327
- BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
328
- BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
329
- BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
330
- BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,-0.21428571428571427,0.5484126984126985
331
- BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
332
- BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6425396041156862,0.030400749685896046
333
- BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,-0.036369648372665396,0.9007802600472398
334
- BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
335
- BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
336
- BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.21428571428571427,0.5484126984126985
337
- BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.47280542884465016,0.10506382347888965
338
- BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
339
- BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.14285714285714285,0.7195436507936508
340
- BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
341
- BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
342
- BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.14285714285714285,0.7195436507936508
343
- BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
344
- BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111
345
- BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.47280542884465016,0.10506382347888965
346
- BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
347
- LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
348
- LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
349
- LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
350
- LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
351
- LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
352
- LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.7637626158259734,0.008839740160738534
353
- LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
354
- LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
355
- LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
356
- LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
357
- LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
358
- LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
359
- LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
360
- LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
361
- LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
362
- LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
363
- LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
364
- LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
365
- LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
366
- LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
367
- LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
368
- hydrox_integrity,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
369
- hydrox_integrity,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.0,1.0
370
- hydrox_integrity,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508
371
- hydrox_overall_score,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
372
- hydrox_overall_score,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.0,1.0
373
- hydrox_overall_score,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508
374
- hydrox_privacy,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
375
- hydrox_privacy,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,-0.07142857142857142,0.9048611111111111
376
- hydrox_privacy,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508
377
- hydrox_safety,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
378
- hydrox_safety,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,-0.07142857142857142,0.9048611111111111
379
- hydrox_safety,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111
380
- hydrox_security,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
381
- hydrox_security,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.0,1.0
382
- hydrox_security,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111
383
- aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,0,0.5455447255899809,0.0614649096074132
384
- aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111
385
- aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132
386
- aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066
387
- aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
388
- aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,2,0.47280542884465016,0.10506382347888965
389
- aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
390
- aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
391
- aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
392
- aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
393
- aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
394
- aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
395
- aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066
396
- aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
397
- aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132
398
- aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
399
- aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
400
- aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
401
- aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
402
- aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
403
- aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,2,0.47280542884465016,0.10506382347888965
404
- aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,0,0.07142857142857142,0.9048611111111111
405
- aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
406
- aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066
407
- aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
408
- aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
409
- aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
410
- aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066
411
- aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
412
- aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
413
- aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,0,0.2545875386086578,0.38281014365989596
414
- aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
415
- aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132
416
- aggregate,aggregate,LMSys Arena,chatbot_arena_241104.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
417
- aggregate,aggregate,LMSys Arena,chatbot_arena_241104.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
418
- aggregate,aggregate,LMSys Arena,chatbot_arena_241104.csv,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05
419
- aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
420
- aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
421
- aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
422
- aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985
423
- aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
424
- aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
425
- aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
426
- aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.21428571428571427,0.5484126984126985
427
- aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111
428
- aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
429
- aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
430
- aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
431
- aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
432
- aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
433
- aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
434
- aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
435
- aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
436
- aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
437
- aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
438
- aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
439
- aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111
440
- aggregate,aggregate,tablebench_overall_dp,tablebench_241002.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111
441
- aggregate,aggregate,tablebench_overall_dp,tablebench_241002.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
442
- aggregate,aggregate,tablebench_overall_dp,tablebench_241002.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066
443
- aggregate,aggregate,trustworthy_average,llm_trustworthy_241001.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
444
- aggregate,aggregate,trustworthy_average,llm_trustworthy_241001.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
445
- aggregate,aggregate,trustworthy_average,llm_trustworthy_241001.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
446
- aggregate,aggregate,trustworthy_non_toxicity,llm_trustworthy_241001.csv,kendall,random,8,0,0.14285714285714285,0.7195436507936508
447
- aggregate,aggregate,trustworthy_non_toxicity,llm_trustworthy_241001.csv,kendall,random,8,1,0.14285714285714285,0.7195436507936508
448
- aggregate,aggregate,trustworthy_non_toxicity,llm_trustworthy_241001.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508
449
- aggregate,aggregate,trustworthy_non_stereotype,llm_trustworthy_241001.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
450
- aggregate,aggregate,trustworthy_non_stereotype,llm_trustworthy_241001.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
451
- aggregate,aggregate,trustworthy_non_stereotype,llm_trustworthy_241001.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
452
- aggregate,aggregate,trustworthy_advglue_pp,llm_trustworthy_241001.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
453
- aggregate,aggregate,trustworthy_advglue_pp,llm_trustworthy_241001.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
454
- aggregate,aggregate,trustworthy_advglue_pp,llm_trustworthy_241001.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
455
- aggregate,aggregate,trustworthy_ood,llm_trustworthy_241001.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
456
- aggregate,aggregate,trustworthy_ood,llm_trustworthy_241001.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
457
- aggregate,aggregate,trustworthy_ood,llm_trustworthy_241001.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
458
- aggregate,aggregate,trustworthy_adv_demo,llm_trustworthy_241001.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
459
- aggregate,aggregate,trustworthy_adv_demo,llm_trustworthy_241001.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
460
- aggregate,aggregate,trustworthy_adv_demo,llm_trustworthy_241001.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
461
- aggregate,aggregate,trustworthy_privacy,llm_trustworthy_241001.csv,kendall,random,8,0,0.07142857142857142,0.9048611111111111
462
- aggregate,aggregate,trustworthy_privacy,llm_trustworthy_241001.csv,kendall,random,8,1,0.07142857142857142,0.9048611111111111
463
- aggregate,aggregate,trustworthy_privacy,llm_trustworthy_241001.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111
464
- aggregate,aggregate,trustworthy_ethics,llm_trustworthy_241001.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
465
- aggregate,aggregate,trustworthy_ethics,llm_trustworthy_241001.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
466
- aggregate,aggregate,trustworthy_ethics,llm_trustworthy_241001.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
467
- aggregate,aggregate,trustworthy_fairness,llm_trustworthy_241001.csv,kendall,random,8,0,-0.6910233190806424,0.017844011512848347
468
- aggregate,aggregate,trustworthy_fairness,llm_trustworthy_241001.csv,kendall,random,8,1,-0.6910233190806424,0.017844011512848347
469
- aggregate,aggregate,trustworthy_fairness,llm_trustworthy_241001.csv,kendall,random,8,2,-0.6910233190806424,0.017844011512848347
470
- aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
471
- aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
472
- aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
473
- aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
474
- aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,1,0.21428571428571427,0.5484126984126985
475
- aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
476
- aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
477
- aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
478
- aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
479
- aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
480
- aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,8,1,0.21428571428571427,0.5484126984126985
481
- aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066
482
- aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
483
- aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
484
- aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
485
- aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,0,0.036369648372665396,0.9007802600472398
486
- aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,1,0.07142857142857142,0.9048611111111111
487
- aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,2,-0.036369648372665396,0.9007802600472398
488
- aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,0,0.4447495899966607,0.1315867602811863
489
- aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,1,0.2545875386086578,0.38281014365989596
490
- aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,2,0.40006613209931935,0.17023995462900499
491
- aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066
492
- aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,8,1,0.21428571428571427,0.5484126984126985
493
- aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,8,2,0.21428571428571427,0.5484126984126985
494
- aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
495
- aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
496
- aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
497
- aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
498
- aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
499
- aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
500
- aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
501
- aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
502
- aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
503
- aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
504
- aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
505
- aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
506
- aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
507
- aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,1,0.14285714285714285,0.7195436507936508
508
- aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
509
- aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111
510
- aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
511
- aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
512
- aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
513
- aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
514
- aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
515
- aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111
516
- aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
517
- aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,2,0.6910233190806425,0.017844011512848347
518
- aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
519
- aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,1,0.47280542884465016,0.10506382347888965
520
- aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,2,0.6182840223353117,0.0340492747686748
521
- aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
522
- aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
523
- aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
524
- aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
525
- aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
526
- aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
527
- aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
528
- aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
529
- aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
530
- aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985
531
- aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
532
- aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,2,0.2545875386086578,0.38281014365989596
533
- aggregate,aggregate,HELM AirBench Security Risks,helm_airbench_240916.csv,kendall,random,8,0,-0.42857142857142855,0.17886904761904762
534
- aggregate,aggregate,HELM AirBench Security Risks,helm_airbench_240916.csv,kendall,random,8,1,-0.2857142857142857,0.39875992063492066
535
- aggregate,aggregate,HELM AirBench Security Risks,helm_airbench_240916.csv,kendall,random,8,2,-0.21428571428571427,0.5484126984126985
536
- aggregate,aggregate,HELM AirBench Operational Misuses,helm_airbench_240916.csv,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
537
- aggregate,aggregate,HELM AirBench Operational Misuses,helm_airbench_240916.csv,kendall,random,8,1,-0.18184824186332696,0.5330356744917513
538
- aggregate,aggregate,HELM AirBench Operational Misuses,helm_airbench_240916.csv,kendall,random,8,2,-0.42857142857142855,0.17886904761904762
539
- aggregate,aggregate,HELM AirBench Violence & Extremism,helm_airbench_240916.csv,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
540
- aggregate,aggregate,HELM AirBench Violence & Extremism,helm_airbench_240916.csv,kendall,random,8,1,-0.7142857142857142,0.014136904761904762
541
- aggregate,aggregate,HELM AirBench Violence & Extremism,helm_airbench_240916.csv,kendall,random,8,2,-0.3571428571428571,0.27509920634920637
542
- aggregate,aggregate,HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
543
- aggregate,aggregate,HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,kendall,random,8,1,-0.7142857142857142,0.014136904761904762
544
- aggregate,aggregate,HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,kendall,random,8,2,-0.2857142857142857,0.39875992063492066
545
- aggregate,aggregate,HELM AirBench Sexual Content,helm_airbench_240916.csv,kendall,random,8,0,-0.42857142857142855,0.17886904761904762
546
- aggregate,aggregate,HELM AirBench Sexual Content,helm_airbench_240916.csv,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
547
- aggregate,aggregate,HELM AirBench Sexual Content,helm_airbench_240916.csv,kendall,random,8,2,-0.42857142857142855,0.17886904761904762
548
- aggregate,aggregate,HELM AirBench Child Harm,helm_airbench_240916.csv,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
549
- aggregate,aggregate,HELM AirBench Child Harm,helm_airbench_240916.csv,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
550
- aggregate,aggregate,HELM AirBench Child Harm,helm_airbench_240916.csv,kendall,random,8,2,-0.42857142857142855,0.17886904761904762
551
- aggregate,aggregate,HELM AirBench Self Harm,helm_airbench_240916.csv,kendall,random,8,0,-0.40006613209931935,0.17023995462900499
552
- aggregate,aggregate,HELM AirBench Self Harm,helm_airbench_240916.csv,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
553
- aggregate,aggregate,HELM AirBench Self Harm,helm_airbench_240916.csv,kendall,random,8,2,-0.40006613209931935,0.17023995462900499
554
- aggregate,aggregate,HELM AirBench Political Usage,helm_airbench_240916.csv,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
555
- aggregate,aggregate,HELM AirBench Political Usage,helm_airbench_240916.csv,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
556
- aggregate,aggregate,HELM AirBench Political Usage,helm_airbench_240916.csv,kendall,random,8,2,-0.3571428571428571,0.27509920634920637
557
- aggregate,aggregate,HELM AirBench Economic Harm,helm_airbench_240916.csv,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
558
- aggregate,aggregate,HELM AirBench Economic Harm,helm_airbench_240916.csv,kendall,random,8,1,-0.6182840223353117,0.0340492747686748
559
- aggregate,aggregate,HELM AirBench Economic Harm,helm_airbench_240916.csv,kendall,random,8,2,-0.3571428571428571,0.27509920634920637
560
- aggregate,aggregate,HELM AirBench Deception,helm_airbench_240916.csv,kendall,random,8,0,-0.4999999999999999,0.10868055555555556
561
- aggregate,aggregate,HELM AirBench Deception,helm_airbench_240916.csv,kendall,random,8,1,-0.6428571428571428,0.03115079365079365
562
- aggregate,aggregate,HELM AirBench Deception,helm_airbench_240916.csv,kendall,random,8,2,-0.40006613209931935,0.17023995462900499
563
- aggregate,aggregate,HELM AirBench Manipulation,helm_airbench_240916.csv,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
564
- aggregate,aggregate,HELM AirBench Manipulation,helm_airbench_240916.csv,kendall,random,8,1,-0.6182840223353117,0.0340492747686748
565
- aggregate,aggregate,HELM AirBench Manipulation,helm_airbench_240916.csv,kendall,random,8,2,-0.3571428571428571,0.27509920634920637
566
- aggregate,aggregate,HELM AirBench Defamation,helm_airbench_240916.csv,kendall,random,8,0,-0.40006613209931935,0.17023995462900499
567
- aggregate,aggregate,HELM AirBench Defamation,helm_airbench_240916.csv,kendall,random,8,1,-0.4999999999999999,0.10868055555555556
568
- aggregate,aggregate,HELM AirBench Defamation,helm_airbench_240916.csv,kendall,random,8,2,-0.40006613209931935,0.17023995462900499
569
- aggregate,aggregate,HELM AirBench Fundamental Rights,helm_airbench_240916.csv,kendall,random,8,0,-0.6182840223353117,0.0340492747686748
570
- aggregate,aggregate,HELM AirBench Fundamental Rights,helm_airbench_240916.csv,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
571
- aggregate,aggregate,HELM AirBench Fundamental Rights,helm_airbench_240916.csv,kendall,random,8,2,-0.41576092031014994,0.1612822677790775
572
- aggregate,aggregate,HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
573
- aggregate,aggregate,HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
574
- aggregate,aggregate,HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,kendall,random,8,2,-0.4999999999999999,0.10868055555555556
575
- aggregate,aggregate,HELM AirBench Privacy,helm_airbench_240916.csv,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
576
- aggregate,aggregate,HELM AirBench Privacy,helm_airbench_240916.csv,kendall,random,8,1,-0.3571428571428571,0.27509920634920637
577
- aggregate,aggregate,HELM AirBench Privacy,helm_airbench_240916.csv,kendall,random,8,2,-0.21428571428571427,0.5484126984126985
578
- aggregate,aggregate,HELM AirBench Criminal Activities,helm_airbench_240916.csv,kendall,random,8,0,-0.5714285714285714,0.06101190476190476
579
- aggregate,aggregate,HELM AirBench Criminal Activities,helm_airbench_240916.csv,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
580
- aggregate,aggregate,HELM AirBench Criminal Activities,helm_airbench_240916.csv,kendall,random,8,2,-0.42857142857142855,0.17886904761904762
581
- aggregate,aggregate,HELM AirBench AIR Score,helm_airbench_240916.csv,kendall,random,8,0,-0.6428571428571428,0.03115079365079365
582
- aggregate,aggregate,HELM AirBench AIR Score,helm_airbench_240916.csv,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
583
- aggregate,aggregate,HELM AirBench AIR Score,helm_airbench_240916.csv,kendall,random,8,2,-0.47280542884465016,0.10506382347888965
584
- aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
585
- aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
586
- aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
587
- aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985
588
- aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
589
- aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
590
- aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
591
- aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
592
- aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,2,-0.21428571428571427,0.5484126984126985
593
- aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
594
- aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
595
- aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
596
- aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
597
- aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
598
- aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508
599
- aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
600
- aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,1,0.6910233190806425,0.017844011512848347
601
- aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,2,0.40006613209931935,0.17023995462900499
602
- aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
603
- aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
604
- aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
605
- aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
606
- aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,1,0.21428571428571427,0.5484126984126985
607
- aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,2,-0.14285714285714285,0.7195436507936508
608
- aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,0,0.32732683535398854,0.2618277009271762
609
- aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
610
- aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,2,0.21428571428571427,0.5484126984126985
611
- aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
612
- aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
613
- aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
614
- aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
615
- aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,1,0.2545875386086578,0.38281014365989596
616
- aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
617
- aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
618
- aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
619
- aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066
620
- aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
621
- aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
622
- aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
623
- aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
624
- aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
625
- aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,2,-0.07142857142857142,0.9048611111111111
626
- aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
627
- aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
628
- aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
629
- aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
630
- aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
631
- aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
632
- aggregate,aggregate,Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
633
- aggregate,aggregate,Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
634
- aggregate,aggregate,Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066
635
- aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
636
- aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,1,0.21428571428571427,0.5484126984126985
637
- aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
638
- aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
639
- aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
640
- aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
641
- aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
642
- aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
643
- aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
644
- aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
645
- aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,1,0.21428571428571427,0.5484126984126985
646
- aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
647
- aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
648
- aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
649
- aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
650
- aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
651
- aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
652
- aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
653
- aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
654
- aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
655
- aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
656
- aggregate,aggregate,Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
657
- aggregate,aggregate,Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,kendall,random,8,1,0.40006613209931935,0.17023995462900499
658
- aggregate,aggregate,Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
659
- aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
660
- aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
661
- aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,2,0.21428571428571427,0.5484126984126985
662
- aggregate,aggregate,AgentBench,agenbench_240829.csv,kendall,random,8,0,0.32732683535398854,0.2618277009271762
663
- aggregate,aggregate,AgentBench,agenbench_240829.csv,kendall,random,8,1,0.32732683535398854,0.2618277009271762
664
- aggregate,aggregate,AgentBench,agenbench_240829.csv,kendall,random,8,2,0.32732683535398854,0.2618277009271762
665
- aggregate,aggregate,MT-Bench,mtbench_240829_frozen.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
666
- aggregate,aggregate,MT-Bench,mtbench_240829_frozen.csv,kendall,random,8,1,0.6182840223353117,0.0340492747686748
667
- aggregate,aggregate,MT-Bench,mtbench_240829_frozen.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
668
- aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
669
- aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.21428571428571427,0.5484126984126985
670
- aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111
671
- aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
672
- aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
673
- aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111
674
- aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
675
- aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.0,1.0
676
- aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508
677
- aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
678
- aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
679
- aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.21428571428571427,0.5484126984126985
680
- aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
681
- aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
682
- aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111
683
- aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
684
- aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
685
- aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.0,1.0
686
- aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
687
- aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
688
- aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
689
- aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
690
- aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
691
- aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
692
- aggregate,aggregate,eq_bench,eqbench_240912.csv,kendall,random,8,0,0.14285714285714285,0.7195436507936508
693
- aggregate,aggregate,eq_bench,eqbench_240912.csv,kendall,random,8,1,0.07142857142857142,0.9048611111111111
694
- aggregate,aggregate,eq_bench,eqbench_240912.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
695
- aggregate,aggregate,magi_hard,eqbench_240912.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
696
- aggregate,aggregate,magi_hard,eqbench_240912.csv,kendall,random,8,1,-0.07142857142857142,0.9048611111111111
697
- aggregate,aggregate,magi_hard,eqbench_240912.csv,kendall,random,8,2,-0.07142857142857142,0.9048611111111111
698
- aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
699
- aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,1,-0.07142857142857142,0.9048611111111111
700
- aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
701
- aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
702
- aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,1,0.07142857142857142,0.9048611111111111
703
- aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
704
- aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,0,0.47280542884465016,0.10506382347888965
705
- aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,1,0.41576092031014994,0.1612822677790775
706
- aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
707
- aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
708
- aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
709
- aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
710
- aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
711
- aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,1,-0.21428571428571427,0.5484126984126985
712
- aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
713
- aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,0,0.6425396041156862,0.030400749685896046
714
- aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,1,-0.036369648372665396,0.9007802600472398
715
- aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
716
- aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
717
- aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,1,0.21428571428571427,0.5484126984126985
718
- aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,2,0.47280542884465016,0.10506382347888965
719
- aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
720
- aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,1,0.14285714285714285,0.7195436507936508
721
- aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
722
- aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
723
- aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,1,0.14285714285714285,0.7195436507936508
724
- aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
725
- aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111
726
- aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,1,0.47280542884465016,0.10506382347888965
727
- aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
728
- aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
729
- aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
730
- aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
731
- aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
732
- aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
733
- aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,2,0.7637626158259734,0.008839740160738534
734
- aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
735
- aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
736
- aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
737
- aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
738
- aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
739
- aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
740
- aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
741
- aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
742
- aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
743
- aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
744
- aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
745
- aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
746
- aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
747
- aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
748
- aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
749
- aggregate,aggregate,hydrox_integrity,hydrox_safety_241001.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
750
- aggregate,aggregate,hydrox_integrity,hydrox_safety_241001.csv,kendall,random,8,1,0.0,1.0
751
- aggregate,aggregate,hydrox_integrity,hydrox_safety_241001.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508
752
- aggregate,aggregate,hydrox_overall_score,hydrox_safety_241001.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
753
- aggregate,aggregate,hydrox_overall_score,hydrox_safety_241001.csv,kendall,random,8,1,0.0,1.0
754
- aggregate,aggregate,hydrox_overall_score,hydrox_safety_241001.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508
755
- aggregate,aggregate,hydrox_privacy,hydrox_safety_241001.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
756
- aggregate,aggregate,hydrox_privacy,hydrox_safety_241001.csv,kendall,random,8,1,-0.07142857142857142,0.9048611111111111
757
- aggregate,aggregate,hydrox_privacy,hydrox_safety_241001.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508
758
- aggregate,aggregate,hydrox_safety,hydrox_safety_241001.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
759
- aggregate,aggregate,hydrox_safety,hydrox_safety_241001.csv,kendall,random,8,1,-0.07142857142857142,0.9048611111111111
760
- aggregate,aggregate,hydrox_safety,hydrox_safety_241001.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111
761
- aggregate,aggregate,hydrox_security,hydrox_safety_241001.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
762
- aggregate,aggregate,hydrox_security,hydrox_safety_241001.csv,kendall,random,8,1,0.0,1.0
763
- aggregate,aggregate,hydrox_security,hydrox_safety_241001.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cache/agreements_cache_5e66a88dab42480065db47711c55c458.csv CHANGED
@@ -155,57 +155,57 @@ MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.74
155
  AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
156
  AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
157
  AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
158
- HELM AirBench Security Risks,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.42857142857142855,0.17886904761904762
159
- HELM AirBench Security Risks,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
160
- HELM AirBench Security Risks,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.3571428571428571,0.27509920634920637
161
- HELM AirBench Operational Misuses,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.3571428571428571,0.27509920634920637
162
- HELM AirBench Operational Misuses,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.4999999999999999,0.10868055555555556
163
- HELM AirBench Operational Misuses,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.42857142857142855,0.17886904761904762
164
- HELM AirBench Violence & Extremism,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.42857142857142855,0.17886904761904762
165
- HELM AirBench Violence & Extremism,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.6428571428571428,0.03115079365079365
166
- HELM AirBench Violence & Extremism,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.4999999999999999,0.10868055555555556
167
- HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.2857142857142857,0.39875992063492066
168
- HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.4999999999999999,0.10868055555555556
169
- HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.42857142857142855,0.17886904761904762
170
- HELM AirBench Sexual Content,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.2857142857142857,0.39875992063492066
171
- HELM AirBench Sexual Content,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.4999999999999999,0.10868055555555556
172
- HELM AirBench Sexual Content,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.2857142857142857,0.39875992063492066
173
- HELM AirBench Child Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.4999999999999999,0.10868055555555556
174
- HELM AirBench Child Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.6428571428571428,0.03115079365079365
175
- HELM AirBench Child Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.5714285714285714,0.06101190476190476
176
- HELM AirBench Self Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.32732683535398854,0.2618277009271762
177
- HELM AirBench Self Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.7142857142857142,0.014136904761904762
178
- HELM AirBench Self Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.4999999999999999,0.10868055555555556
179
- HELM AirBench Political Usage,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.3571428571428571,0.27509920634920637
180
- HELM AirBench Political Usage,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
181
- HELM AirBench Political Usage,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.5714285714285714,0.06101190476190476
182
- HELM AirBench Economic Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.3571428571428571,0.27509920634920637
183
- HELM AirBench Economic Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.6428571428571428,0.03115079365079365
184
- HELM AirBench Economic Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.5714285714285714,0.06101190476190476
185
- HELM AirBench Deception,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.47280542884465016,0.10506382347888965
186
- HELM AirBench Deception,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.7142857142857142,0.014136904761904762
187
- HELM AirBench Deception,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.5714285714285714,0.06101190476190476
188
- HELM AirBench Manipulation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.3571428571428571,0.27509920634920637
189
- HELM AirBench Manipulation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.6428571428571428,0.03115079365079365
190
- HELM AirBench Manipulation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.6182840223353117,0.0340492747686748
191
- HELM AirBench Defamation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.40006613209931935,0.17023995462900499
192
- HELM AirBench Defamation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.6428571428571428,0.03115079365079365
193
- HELM AirBench Defamation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.40006613209931935,0.17023995462900499
194
- HELM AirBench Fundamental Rights,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.40006613209931935,0.17023995462900499
195
- HELM AirBench Fundamental Rights,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.6428571428571428,0.03115079365079365
196
- HELM AirBench Fundamental Rights,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.6182840223353117,0.0340492747686748
197
- HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.3571428571428571,0.27509920634920637
198
- HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.5455447255899809,0.0614649096074132
199
- HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.47280542884465016,0.10506382347888965
200
- HELM AirBench Privacy,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.3571428571428571,0.27509920634920637
201
- HELM AirBench Privacy,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.4999999999999999,0.10868055555555556
202
- HELM AirBench Privacy,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.42857142857142855,0.17886904761904762
203
- HELM AirBench Criminal Activities,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.4999999999999999,0.10868055555555556
204
- HELM AirBench Criminal Activities,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.6428571428571428,0.03115079365079365
205
- HELM AirBench Criminal Activities,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.5714285714285714,0.06101190476190476
206
- HELM AirBench AIR Score,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.42857142857142855,0.17886904761904762
207
- HELM AirBench AIR Score,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.6428571428571428,0.03115079365079365
208
- HELM AirBench AIR Score,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.5455447255899809,0.0614649096074132
209
  OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
210
  OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
211
  OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066
@@ -365,21 +365,21 @@ LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,rand
365
  LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
366
  LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
367
  LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
368
- hydrox_integrity,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.07142857142857142,0.9048611111111111
369
- hydrox_integrity,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
370
- hydrox_integrity,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066
371
- hydrox_overall_score,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.0,1.0
372
- hydrox_overall_score,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
373
- hydrox_overall_score,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066
374
- hydrox_privacy,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.0,1.0
375
- hydrox_privacy,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
376
- hydrox_privacy,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.21428571428571427,0.5484126984126985
377
- hydrox_safety,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,-0.07142857142857142,0.9048611111111111
378
- hydrox_safety,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
379
- hydrox_safety,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066
380
- hydrox_security,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.07142857142857142,0.9048611111111111
381
- hydrox_security,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
382
- hydrox_security,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066
383
  aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985
384
  aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
385
  aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
@@ -536,57 +536,57 @@ aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,2,0.74
536
  aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
537
  aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
538
  aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
539
- aggregate,aggregate,HELM AirBench Security Risks,helm_airbench_240916.csv,kendall,random,8,0,-0.42857142857142855,0.17886904761904762
540
- aggregate,aggregate,HELM AirBench Security Risks,helm_airbench_240916.csv,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
541
- aggregate,aggregate,HELM AirBench Security Risks,helm_airbench_240916.csv,kendall,random,8,2,-0.3571428571428571,0.27509920634920637
542
- aggregate,aggregate,HELM AirBench Operational Misuses,helm_airbench_240916.csv,kendall,random,8,0,-0.3571428571428571,0.27509920634920637
543
- aggregate,aggregate,HELM AirBench Operational Misuses,helm_airbench_240916.csv,kendall,random,8,1,-0.4999999999999999,0.10868055555555556
544
- aggregate,aggregate,HELM AirBench Operational Misuses,helm_airbench_240916.csv,kendall,random,8,2,-0.42857142857142855,0.17886904761904762
545
- aggregate,aggregate,HELM AirBench Violence & Extremism,helm_airbench_240916.csv,kendall,random,8,0,-0.42857142857142855,0.17886904761904762
546
- aggregate,aggregate,HELM AirBench Violence & Extremism,helm_airbench_240916.csv,kendall,random,8,1,-0.6428571428571428,0.03115079365079365
547
- aggregate,aggregate,HELM AirBench Violence & Extremism,helm_airbench_240916.csv,kendall,random,8,2,-0.4999999999999999,0.10868055555555556
548
- aggregate,aggregate,HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,kendall,random,8,0,-0.2857142857142857,0.39875992063492066
549
- aggregate,aggregate,HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,kendall,random,8,1,-0.4999999999999999,0.10868055555555556
550
- aggregate,aggregate,HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,kendall,random,8,2,-0.42857142857142855,0.17886904761904762
551
- aggregate,aggregate,HELM AirBench Sexual Content,helm_airbench_240916.csv,kendall,random,8,0,-0.2857142857142857,0.39875992063492066
552
- aggregate,aggregate,HELM AirBench Sexual Content,helm_airbench_240916.csv,kendall,random,8,1,-0.4999999999999999,0.10868055555555556
553
- aggregate,aggregate,HELM AirBench Sexual Content,helm_airbench_240916.csv,kendall,random,8,2,-0.2857142857142857,0.39875992063492066
554
- aggregate,aggregate,HELM AirBench Child Harm,helm_airbench_240916.csv,kendall,random,8,0,-0.4999999999999999,0.10868055555555556
555
- aggregate,aggregate,HELM AirBench Child Harm,helm_airbench_240916.csv,kendall,random,8,1,-0.6428571428571428,0.03115079365079365
556
- aggregate,aggregate,HELM AirBench Child Harm,helm_airbench_240916.csv,kendall,random,8,2,-0.5714285714285714,0.06101190476190476
557
- aggregate,aggregate,HELM AirBench Self Harm,helm_airbench_240916.csv,kendall,random,8,0,-0.32732683535398854,0.2618277009271762
558
- aggregate,aggregate,HELM AirBench Self Harm,helm_airbench_240916.csv,kendall,random,8,1,-0.7142857142857142,0.014136904761904762
559
- aggregate,aggregate,HELM AirBench Self Harm,helm_airbench_240916.csv,kendall,random,8,2,-0.4999999999999999,0.10868055555555556
560
- aggregate,aggregate,HELM AirBench Political Usage,helm_airbench_240916.csv,kendall,random,8,0,-0.3571428571428571,0.27509920634920637
561
- aggregate,aggregate,HELM AirBench Political Usage,helm_airbench_240916.csv,kendall,random,8,1,-0.5714285714285714,0.06101190476190476
562
- aggregate,aggregate,HELM AirBench Political Usage,helm_airbench_240916.csv,kendall,random,8,2,-0.5714285714285714,0.06101190476190476
563
- aggregate,aggregate,HELM AirBench Economic Harm,helm_airbench_240916.csv,kendall,random,8,0,-0.3571428571428571,0.27509920634920637
564
- aggregate,aggregate,HELM AirBench Economic Harm,helm_airbench_240916.csv,kendall,random,8,1,-0.6428571428571428,0.03115079365079365
565
- aggregate,aggregate,HELM AirBench Economic Harm,helm_airbench_240916.csv,kendall,random,8,2,-0.5714285714285714,0.06101190476190476
566
- aggregate,aggregate,HELM AirBench Deception,helm_airbench_240916.csv,kendall,random,8,0,-0.47280542884465016,0.10506382347888965
567
- aggregate,aggregate,HELM AirBench Deception,helm_airbench_240916.csv,kendall,random,8,1,-0.7142857142857142,0.014136904761904762
568
- aggregate,aggregate,HELM AirBench Deception,helm_airbench_240916.csv,kendall,random,8,2,-0.5714285714285714,0.06101190476190476
569
- aggregate,aggregate,HELM AirBench Manipulation,helm_airbench_240916.csv,kendall,random,8,0,-0.3571428571428571,0.27509920634920637
570
- aggregate,aggregate,HELM AirBench Manipulation,helm_airbench_240916.csv,kendall,random,8,1,-0.6428571428571428,0.03115079365079365
571
- aggregate,aggregate,HELM AirBench Manipulation,helm_airbench_240916.csv,kendall,random,8,2,-0.6182840223353117,0.0340492747686748
572
- aggregate,aggregate,HELM AirBench Defamation,helm_airbench_240916.csv,kendall,random,8,0,-0.40006613209931935,0.17023995462900499
573
- aggregate,aggregate,HELM AirBench Defamation,helm_airbench_240916.csv,kendall,random,8,1,-0.6428571428571428,0.03115079365079365
574
- aggregate,aggregate,HELM AirBench Defamation,helm_airbench_240916.csv,kendall,random,8,2,-0.40006613209931935,0.17023995462900499
575
- aggregate,aggregate,HELM AirBench Fundamental Rights,helm_airbench_240916.csv,kendall,random,8,0,-0.40006613209931935,0.17023995462900499
576
- aggregate,aggregate,HELM AirBench Fundamental Rights,helm_airbench_240916.csv,kendall,random,8,1,-0.6428571428571428,0.03115079365079365
577
- aggregate,aggregate,HELM AirBench Fundamental Rights,helm_airbench_240916.csv,kendall,random,8,2,-0.6182840223353117,0.0340492747686748
578
- aggregate,aggregate,HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,kendall,random,8,0,-0.3571428571428571,0.27509920634920637
579
- aggregate,aggregate,HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,kendall,random,8,1,-0.5455447255899809,0.0614649096074132
580
- aggregate,aggregate,HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,kendall,random,8,2,-0.47280542884465016,0.10506382347888965
581
- aggregate,aggregate,HELM AirBench Privacy,helm_airbench_240916.csv,kendall,random,8,0,-0.3571428571428571,0.27509920634920637
582
- aggregate,aggregate,HELM AirBench Privacy,helm_airbench_240916.csv,kendall,random,8,1,-0.4999999999999999,0.10868055555555556
583
- aggregate,aggregate,HELM AirBench Privacy,helm_airbench_240916.csv,kendall,random,8,2,-0.42857142857142855,0.17886904761904762
584
- aggregate,aggregate,HELM AirBench Criminal Activities,helm_airbench_240916.csv,kendall,random,8,0,-0.4999999999999999,0.10868055555555556
585
- aggregate,aggregate,HELM AirBench Criminal Activities,helm_airbench_240916.csv,kendall,random,8,1,-0.6428571428571428,0.03115079365079365
586
- aggregate,aggregate,HELM AirBench Criminal Activities,helm_airbench_240916.csv,kendall,random,8,2,-0.5714285714285714,0.06101190476190476
587
- aggregate,aggregate,HELM AirBench AIR Score,helm_airbench_240916.csv,kendall,random,8,0,-0.42857142857142855,0.17886904761904762
588
- aggregate,aggregate,HELM AirBench AIR Score,helm_airbench_240916.csv,kendall,random,8,1,-0.6428571428571428,0.03115079365079365
589
- aggregate,aggregate,HELM AirBench AIR Score,helm_airbench_240916.csv,kendall,random,8,2,-0.5455447255899809,0.0614649096074132
590
  aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
591
  aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
592
  aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066
@@ -746,18 +746,18 @@ aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,rand
746
  aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
747
  aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
748
  aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
749
- aggregate,aggregate,hydrox_integrity,hydrox_safety_241001.csv,kendall,random,8,0,0.07142857142857142,0.9048611111111111
750
- aggregate,aggregate,hydrox_integrity,hydrox_safety_241001.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
751
- aggregate,aggregate,hydrox_integrity,hydrox_safety_241001.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066
752
- aggregate,aggregate,hydrox_overall_score,hydrox_safety_241001.csv,kendall,random,8,0,0.0,1.0
753
- aggregate,aggregate,hydrox_overall_score,hydrox_safety_241001.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
754
- aggregate,aggregate,hydrox_overall_score,hydrox_safety_241001.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066
755
- aggregate,aggregate,hydrox_privacy,hydrox_safety_241001.csv,kendall,random,8,0,0.0,1.0
756
- aggregate,aggregate,hydrox_privacy,hydrox_safety_241001.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
757
- aggregate,aggregate,hydrox_privacy,hydrox_safety_241001.csv,kendall,random,8,2,0.21428571428571427,0.5484126984126985
758
- aggregate,aggregate,hydrox_safety,hydrox_safety_241001.csv,kendall,random,8,0,-0.07142857142857142,0.9048611111111111
759
- aggregate,aggregate,hydrox_safety,hydrox_safety_241001.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
760
- aggregate,aggregate,hydrox_safety,hydrox_safety_241001.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066
761
- aggregate,aggregate,hydrox_security,hydrox_safety_241001.csv,kendall,random,8,0,0.07142857142857142,0.9048611111111111
762
- aggregate,aggregate,hydrox_security,hydrox_safety_241001.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
763
- aggregate,aggregate,hydrox_security,hydrox_safety_241001.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066
 
155
  AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
156
  AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
157
  AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
158
+ HELM AirBench Security Risks,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
159
+ HELM AirBench Security Risks,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
160
+ HELM AirBench Security Risks,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
161
+ HELM AirBench Operational Misuses,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
162
+ HELM AirBench Operational Misuses,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
163
+ HELM AirBench Operational Misuses,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
164
+ HELM AirBench Violence & Extremism,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
165
+ HELM AirBench Violence & Extremism,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
166
+ HELM AirBench Violence & Extremism,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
167
+ HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066
168
+ HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
169
+ HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
170
+ HELM AirBench Sexual Content,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066
171
+ HELM AirBench Sexual Content,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
172
+ HELM AirBench Sexual Content,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066
173
+ HELM AirBench Child Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
174
+ HELM AirBench Child Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
175
+ HELM AirBench Child Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
176
+ HELM AirBench Self Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.32732683535398854,0.2618277009271762
177
+ HELM AirBench Self Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
178
+ HELM AirBench Self Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
179
+ HELM AirBench Political Usage,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
180
+ HELM AirBench Political Usage,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
181
+ HELM AirBench Political Usage,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
182
+ HELM AirBench Economic Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
183
+ HELM AirBench Economic Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
184
+ HELM AirBench Economic Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
185
+ HELM AirBench Deception,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.47280542884465016,0.10506382347888965
186
+ HELM AirBench Deception,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
187
+ HELM AirBench Deception,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
188
+ HELM AirBench Manipulation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
189
+ HELM AirBench Manipulation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
190
+ HELM AirBench Manipulation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.6182840223353117,0.0340492747686748
191
+ HELM AirBench Defamation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.40006613209931935,0.17023995462900499
192
+ HELM AirBench Defamation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
193
+ HELM AirBench Defamation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.40006613209931935,0.17023995462900499
194
+ HELM AirBench Fundamental Rights,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.40006613209931935,0.17023995462900499
195
+ HELM AirBench Fundamental Rights,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
196
+ HELM AirBench Fundamental Rights,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.6182840223353117,0.0340492747686748
197
+ HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
198
+ HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.5455447255899809,0.0614649096074132
199
+ HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.47280542884465016,0.10506382347888965
200
+ HELM AirBench Privacy,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
201
+ HELM AirBench Privacy,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
202
+ HELM AirBench Privacy,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
203
+ HELM AirBench Criminal Activities,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
204
+ HELM AirBench Criminal Activities,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
205
+ HELM AirBench Criminal Activities,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
206
+ HELM AirBench AIR Score,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
207
+ HELM AirBench AIR Score,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
208
+ HELM AirBench AIR Score,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132
209
  OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
210
  OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
211
  OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066
 
365
  LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
366
  LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
367
  LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
368
+ Hydrox Integrity,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.07142857142857142,0.9048611111111111
369
+ Hydrox Integrity,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
370
+ Hydrox Integrity,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066
371
+ Hydrox Overall Score,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.0,1.0
372
+ Hydrox Overall Score,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
373
+ Hydrox Overall Score,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066
374
+ Hydrox Privacy,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.0,1.0
375
+ Hydrox Privacy,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
376
+ Hydrox Privacy,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.21428571428571427,0.5484126984126985
377
+ Hydrox Safety,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,-0.07142857142857142,0.9048611111111111
378
+ Hydrox Safety,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
379
+ Hydrox Safety,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066
380
+ Hydrox Security,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.07142857142857142,0.9048611111111111
381
+ Hydrox Security,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
382
+ Hydrox Security,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066
383
  aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985
384
  aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
385
  aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
 
536
  aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
537
  aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
538
  aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
539
+ aggregate,aggregate,HELM AirBench Security Risks,helm_airbench_240916.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
540
+ aggregate,aggregate,HELM AirBench Security Risks,helm_airbench_240916.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
541
+ aggregate,aggregate,HELM AirBench Security Risks,helm_airbench_240916.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
542
+ aggregate,aggregate,HELM AirBench Operational Misuses,helm_airbench_240916.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
543
+ aggregate,aggregate,HELM AirBench Operational Misuses,helm_airbench_240916.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
544
+ aggregate,aggregate,HELM AirBench Operational Misuses,helm_airbench_240916.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
545
+ aggregate,aggregate,HELM AirBench Violence & Extremism,helm_airbench_240916.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
546
+ aggregate,aggregate,HELM AirBench Violence & Extremism,helm_airbench_240916.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
547
+ aggregate,aggregate,HELM AirBench Violence & Extremism,helm_airbench_240916.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
548
+ aggregate,aggregate,HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066
549
+ aggregate,aggregate,HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
550
+ aggregate,aggregate,HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
551
+ aggregate,aggregate,HELM AirBench Sexual Content,helm_airbench_240916.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066
552
+ aggregate,aggregate,HELM AirBench Sexual Content,helm_airbench_240916.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
553
+ aggregate,aggregate,HELM AirBench Sexual Content,helm_airbench_240916.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066
554
+ aggregate,aggregate,HELM AirBench Child Harm,helm_airbench_240916.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
555
+ aggregate,aggregate,HELM AirBench Child Harm,helm_airbench_240916.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
556
+ aggregate,aggregate,HELM AirBench Child Harm,helm_airbench_240916.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
557
+ aggregate,aggregate,HELM AirBench Self Harm,helm_airbench_240916.csv,kendall,random,8,0,0.32732683535398854,0.2618277009271762
558
+ aggregate,aggregate,HELM AirBench Self Harm,helm_airbench_240916.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
559
+ aggregate,aggregate,HELM AirBench Self Harm,helm_airbench_240916.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
560
+ aggregate,aggregate,HELM AirBench Political Usage,helm_airbench_240916.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
561
+ aggregate,aggregate,HELM AirBench Political Usage,helm_airbench_240916.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
562
+ aggregate,aggregate,HELM AirBench Political Usage,helm_airbench_240916.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
563
+ aggregate,aggregate,HELM AirBench Economic Harm,helm_airbench_240916.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
564
+ aggregate,aggregate,HELM AirBench Economic Harm,helm_airbench_240916.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
565
+ aggregate,aggregate,HELM AirBench Economic Harm,helm_airbench_240916.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
566
+ aggregate,aggregate,HELM AirBench Deception,helm_airbench_240916.csv,kendall,random,8,0,0.47280542884465016,0.10506382347888965
567
+ aggregate,aggregate,HELM AirBench Deception,helm_airbench_240916.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
568
+ aggregate,aggregate,HELM AirBench Deception,helm_airbench_240916.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
569
+ aggregate,aggregate,HELM AirBench Manipulation,helm_airbench_240916.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
570
+ aggregate,aggregate,HELM AirBench Manipulation,helm_airbench_240916.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
571
+ aggregate,aggregate,HELM AirBench Manipulation,helm_airbench_240916.csv,kendall,random,8,2,0.6182840223353117,0.0340492747686748
572
+ aggregate,aggregate,HELM AirBench Defamation,helm_airbench_240916.csv,kendall,random,8,0,0.40006613209931935,0.17023995462900499
573
+ aggregate,aggregate,HELM AirBench Defamation,helm_airbench_240916.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
574
+ aggregate,aggregate,HELM AirBench Defamation,helm_airbench_240916.csv,kendall,random,8,2,0.40006613209931935,0.17023995462900499
575
+ aggregate,aggregate,HELM AirBench Fundamental Rights,helm_airbench_240916.csv,kendall,random,8,0,0.40006613209931935,0.17023995462900499
576
+ aggregate,aggregate,HELM AirBench Fundamental Rights,helm_airbench_240916.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
577
+ aggregate,aggregate,HELM AirBench Fundamental Rights,helm_airbench_240916.csv,kendall,random,8,2,0.6182840223353117,0.0340492747686748
578
+ aggregate,aggregate,HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
579
+ aggregate,aggregate,HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,kendall,random,8,1,0.5455447255899809,0.0614649096074132
580
+ aggregate,aggregate,HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,kendall,random,8,2,0.47280542884465016,0.10506382347888965
581
+ aggregate,aggregate,HELM AirBench Privacy,helm_airbench_240916.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
582
+ aggregate,aggregate,HELM AirBench Privacy,helm_airbench_240916.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
583
+ aggregate,aggregate,HELM AirBench Privacy,helm_airbench_240916.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
584
+ aggregate,aggregate,HELM AirBench Criminal Activities,helm_airbench_240916.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
585
+ aggregate,aggregate,HELM AirBench Criminal Activities,helm_airbench_240916.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
586
+ aggregate,aggregate,HELM AirBench Criminal Activities,helm_airbench_240916.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
587
+ aggregate,aggregate,HELM AirBench AIR Score,helm_airbench_240916.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
588
+ aggregate,aggregate,HELM AirBench AIR Score,helm_airbench_240916.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
589
+ aggregate,aggregate,HELM AirBench AIR Score,helm_airbench_240916.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132
590
  aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
591
  aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
592
  aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066
 
746
  aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
747
  aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
748
  aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
749
+ aggregate,aggregate,Hydrox Integrity,hydrox_safety_241001.csv,kendall,random,8,0,0.07142857142857142,0.9048611111111111
750
+ aggregate,aggregate,Hydrox Integrity,hydrox_safety_241001.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
751
+ aggregate,aggregate,Hydrox Integrity,hydrox_safety_241001.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066
752
+ aggregate,aggregate,Hydrox Overall Score,hydrox_safety_241001.csv,kendall,random,8,0,0.0,1.0
753
+ aggregate,aggregate,Hydrox Overall Score,hydrox_safety_241001.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
754
+ aggregate,aggregate,Hydrox Overall Score,hydrox_safety_241001.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066
755
+ aggregate,aggregate,Hydrox Privacy,hydrox_safety_241001.csv,kendall,random,8,0,0.0,1.0
756
+ aggregate,aggregate,Hydrox Privacy,hydrox_safety_241001.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
757
+ aggregate,aggregate,Hydrox Privacy,hydrox_safety_241001.csv,kendall,random,8,2,0.21428571428571427,0.5484126984126985
758
+ aggregate,aggregate,Hydrox Safety,hydrox_safety_241001.csv,kendall,random,8,0,-0.07142857142857142,0.9048611111111111
759
+ aggregate,aggregate,Hydrox Safety,hydrox_safety_241001.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
760
+ aggregate,aggregate,Hydrox Safety,hydrox_safety_241001.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066
761
+ aggregate,aggregate,Hydrox Security,hydrox_safety_241001.csv,kendall,random,8,0,0.07142857142857142,0.9048611111111111
762
+ aggregate,aggregate,Hydrox Security,hydrox_safety_241001.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
763
+ aggregate,aggregate,Hydrox Security,hydrox_safety_241001.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066
cache/allbenchs_cache_05c0405c5253dda90dc632e052accfd2.csv DELETED
The diff for this file is too large to render. See raw diff
 
cache/allbenchs_cache_5e66a88dab42480065db47711c55c458.csv CHANGED
The diff for this file is too large to render. See raw diff