win-rate aggregation results
Browse files- best_judges_single_agg.csv +48 -48
best_judges_single_agg.csv
CHANGED
@@ -1,49 +1,49 @@
|
|
1 |
Judge Model,full_name,Realization,Ranking Agreement,Decisiveness,Bias,ci_low,ci_high
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
llama-3-1-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#
|
16 |
-
|
17 |
-
gpt-4o-2024-11-20,gpt-4o-2024-11-20#0-100_verbalized-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
internlm2-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
llama-3-70b-instruct,llama-3-70b-instruct#
|
36 |
-
|
37 |
-
Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#
|
38 |
-
llama-3-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
mixtral-
|
45 |
-
|
46 |
-
|
47 |
-
mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#bad-good_textual-
|
48 |
-
|
49 |
-
|
|
|
1 |
Judge Model,full_name,Realization,Ranking Agreement,Decisiveness,Bias,ci_low,ci_high
|
2 |
+
Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#bad-good_textual-score_win_rate,Likert,0.8270017533606078,4.755366194422462,0.07924632786346093,,
|
3 |
+
URM-LLaMa-3.1-8B,URM-LLaMa-3.1-8B_win_rate,Reward,0.8164815897136176,1.8368566117313365,0.08485779417852375,,
|
4 |
+
gpt-4o-2024-11-20,gpt-4o-2024-11-20#comparative-anchor-gpt-4-0314_win_rate,Anchor,0.8141437755698421,3.0737000941858494,0.08468363327726611,,
|
5 |
+
Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#0-100_verbalized-score_win_rate,Numeric,0.8129748684979544,4.0878126854835,0.07925204683413378,,
|
6 |
+
llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#0-100_verbalized-score_win_rate,Numeric,0.8118059614260666,4.330580224795526,0.08713575870563035,,
|
7 |
+
gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#0-100_verbalized-score_win_rate,Numeric,0.8071303331385155,2.911340336840001,0.07690456122004496,,
|
8 |
+
mistral-large-instruct-2407,mistral-large-instruct-2407#0-100_verbalized-score_win_rate,Numeric,0.8024547048509642,3.0100161147388955,0.0820935147925871,,
|
9 |
+
llama-3-1-70b-instruct,llama-3-1-70b-instruct#0-100_verbalized-score_win_rate,Numeric,0.7977790765634132,2.6939668808867303,0.08683395704068392,,
|
10 |
+
mistral-large-instruct-2407,mistral-large-instruct-2407#bad-good_textual-score_win_rate,Likert,0.7977790765634132,5.471086170786369,0.08571761438483068,,
|
11 |
+
llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#bad-good_textual-score_win_rate,Likert,0.793103448275862,5.218423477156838,0.09726366816471475,,
|
12 |
+
llama-3-1-70b-instruct,llama-3-1-70b-instruct#good-yes-no_logprob-score_win_rate,TokenProbs,0.793103448275862,1.2599940887461256,0.06974800184734668,,
|
13 |
+
gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#bad-good_textual-score_win_rate,Likert,0.793103448275862,4.610807214374205,0.08715933303084132,,
|
14 |
+
mistral-large-instruct-2407,mistral-large-instruct-2407#comparative-anchor-gpt-4-0314_win_rate,Anchor,0.7860900058445353,2.1297623705935567,0.11110146913759139,,
|
15 |
+
Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#comparative-anchor-gpt-4-0314_win_rate,Anchor,0.7860900058445353,2.9295541683113755,0.08961997841795598,,
|
16 |
+
Skywork-Reward-Llama-3.1-8B-v0.2,Skywork-Reward-Llama-3.1-8B-v0.2_win_rate,Reward,0.7802454704850964,2.461196439206365,0.09968448349021375,,
|
17 |
+
gpt-4o-2024-11-20,gpt-4o-2024-11-20#0-100_verbalized-score_win_rate,Numeric,0.7708942139099941,2.147368211099292,0.07704891970422703,,
|
18 |
+
Llama-3-OffsetBias-RM-8B,Llama-3-OffsetBias-RM-8B_win_rate,Reward,0.7650496785505552,1.386859930640412,0.07566984800414184,,
|
19 |
+
ArmoRM-Llama3-8B-v0.1,ArmoRM-Llama3-8B-v0.1_win_rate,Reward,0.7615429573348919,1.8398700318743302,0.09237283513647337,,
|
20 |
+
gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#good-yes-no_logprob-score_win_rate,TokenProbs,0.7592051431911162,2.10259493695348,0.08401740959016844,,
|
21 |
+
gpt-4o-2024-11-20,gpt-4o-2024-11-20#bad-good_textual-score_win_rate,Likert,0.7580362361192284,5.485635896159162,0.08906791777991026,,
|
22 |
+
mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#0-100_verbalized-score_win_rate,Numeric,0.7556984219754529,2.123702380621778,0.08878149867714963,,
|
23 |
+
Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#good-yes-no_logprob-score_win_rate,TokenProbs,0.7475160724722384,2.689252148396911,0.08165561425906073,,
|
24 |
+
llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#comparative-anchor-gpt-4-0314_win_rate,Anchor,0.7475160724722384,3.577096074866618,0.11235985485344185,,
|
25 |
+
Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#good-yes-no_logprob-score_win_rate,TokenProbs,0.744009351256575,0.5983808410581415,0.06128229980875954,,
|
26 |
+
llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#good-yes-no_logprob-score_win_rate,TokenProbs,0.7405026300409117,1.5497071579540496,0.09227087986533976,,
|
27 |
+
llama-3-70b-instruct,llama-3-70b-instruct#0-100_verbalized-score_win_rate,Numeric,0.7334891876095849,1.2738290052706196,0.0843328604031163,,
|
28 |
+
llama-3-1-70b-instruct,llama-3-1-70b-instruct#bad-good_textual-score_win_rate,Likert,0.7323202805376972,3.901943147622847,0.12009332653938945,,
|
29 |
+
mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#bad-good_textual-score_win_rate,Likert,0.7323202805376972,2.534301904660848,0.10758560218545998,,
|
30 |
+
internlm2-7b-reward,internlm2-7b-reward_win_rate,Reward,0.7171244886031559,2.353664499796978,0.11336430399297745,,
|
31 |
+
mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#good-yes-no_logprob-score_win_rate,TokenProbs,0.7171244886031559,1.849571280249153,0.08844553785560817,,
|
32 |
+
llama-3-1-70b-instruct,llama-3-1-70b-instruct#comparative-anchor-gpt-4-0314_win_rate,Anchor,0.7159555815312681,2.711477731374198,0.1262116303875313,,
|
33 |
+
internlm2-20b-reward,internlm2-20b-reward_win_rate,Reward,0.7159555815312681,1.9003691605695128,0.09838917792721068,,
|
34 |
+
GRM-Llama3.2-3B-rewardmodel-ft,GRM-Llama3.2-3B-rewardmodel-ft_win_rate,Reward,0.7124488603156048,2.302320479143431,0.11380131384129795,,
|
35 |
+
llama-3-70b-instruct,llama-3-70b-instruct#good-yes-no_logprob-score_win_rate,TokenProbs,0.6960841613091758,0.774852442203426,0.0712068298228739,,
|
36 |
+
gpt-4o-2024-11-20,gpt-4o-2024-11-20#good-yes-no_logprob-score_win_rate,TokenProbs,0.6960841613091758,2.2241585234952765,0.09319651198050675,,
|
37 |
+
Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#bad-good_textual-score_win_rate,Likert,0.6937463471654003,0.9348572206360583,0.09020347195052465,,
|
38 |
+
llama-3-70b-instruct,llama-3-70b-instruct#bad-good_textual-score_win_rate,Likert,0.6808883693746346,2.4001241250045453,0.12200247480571864,,
|
39 |
+
gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#comparative-anchor-gpt-4-0314_win_rate,Anchor,0.668030391583869,1.4123201435751862,0.1108225332258117,,
|
40 |
+
Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#comparative-anchor-gpt-4-0314_win_rate,Anchor,0.6388077147866744,0.8684080176854654,0.08510985248268717,,
|
41 |
+
llama-3-70b-instruct,llama-3-70b-instruct#comparative-anchor-gpt-4-0314_win_rate,Anchor,0.6376388077147865,1.8202412271839277,0.13196822136679537,,
|
42 |
+
Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#0-100_verbalized-score_win_rate,Numeric,0.6317942723553477,1.2045215166190555,0.10431279729383011,,
|
43 |
+
Eurus-RM-7b,Eurus-RM-7b_win_rate,Reward,0.6294564582115721,2.492726583183693,0.13811267469299746,,
|
44 |
+
mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#0-100_verbalized-score_win_rate,Numeric,0.6259497369959087,1.2703499058409227,0.10233107571803857,,
|
45 |
+
mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#comparative-anchor-gpt-4-0314_win_rate,Anchor,0.6224430157802454,1.1679716230736195,0.10168601661101805,,
|
46 |
+
mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#comparative-anchor-gpt-4-0314_win_rate,Anchor,0.6119228521332554,1.4971790245204495,0.1398327898420888,,
|
47 |
+
mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#bad-good_textual-score_win_rate,Likert,0.542957334891876,0.8381223432288695,0.11017386542259801,,
|
48 |
+
mistral-large-instruct-2407,mistral-large-instruct-2407#good-yes-no_logprob-score_win_rate,TokenProbs,0.4167153711279952,1.1652168815452648,0.12258728493955592,,
|
49 |
+
mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#good-yes-no_logprob-score_win_rate,TokenProbs,0.3711279953243717,0.7389819834751149,0.1070750522871957,,
|