File size: 5,765 Bytes
9b2d9e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
Judge Model,full_name,Realization,Ranking Agreement,Decisiveness,Bias,ci_low,ci_high
URM-LLaMa-3.1-8B,URM-LLaMa-3.1-8B_BT,Reward,0.818819404,1.836856612,0.084857794,,
Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#bad-good_textual-score_BT,Likert,0.817304917,4.755366194,0.079246328,,
Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#0-100_verbalized-score_BT,Numeric,0.814143776,4.087812685,0.079252047,,
mistral-large-instruct-2407,mistral-large-instruct-2407#bad-good_textual-score_BT,Likert,0.810637054,5.471086171,0.085717614,,
gpt-4o-2024-11-20,gpt-4o-2024-11-20#comparative-anchor-gpt-4-0314_BT,Anchor,0.809468147,3.073700094,0.084683633,,
mistral-large-instruct-2407,mistral-large-instruct-2407#0-100_verbalized-score_BT,Numeric,0.809468147,3.010016115,0.082093515,,
llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#0-100_verbalized-score_BT,Numeric,0.804792519,4.330580225,0.087135759,,
gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#0-100_verbalized-score_BT,Numeric,0.803623612,2.911340337,0.076904561,,
gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#bad-good_textual-score_BT,Likert,0.797779077,4.610807214,0.087159333,,
llama-3-1-70b-instruct,llama-3-1-70b-instruct#0-100_verbalized-score_BT,Numeric,0.797779077,2.693966881,0.086833957,,
Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.794272355,2.929554168,0.089619978,,
llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#bad-good_textual-score_BT,Likert,0.787258913,5.218423477,0.097263668,,
Skywork-Reward-Llama-3.1-8B-v0.2,Skywork-Reward-Llama-3.1-8B-v0.2_BT,Reward,0.777907656,2.461196439,0.099684483,,
Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#good-yes-no_logprob-score_BT,TokenProbs,0.776738749,2.689252148,0.081655614,,
mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#0-100_verbalized-score_BT,Numeric,0.775569842,2.123702381,0.088781499,,
gpt-4o-2024-11-20,gpt-4o-2024-11-20#0-100_verbalized-score_BT,Numeric,0.774400935,2.147368211,0.07704892,,
gpt-4o-2024-11-20,gpt-4o-2024-11-20#bad-good_textual-score_BT,Likert,0.772873462,5.485635896,0.089067918,,
llama-3-1-70b-instruct,llama-3-1-70b-instruct#good-yes-no_logprob-score_BT,TokenProbs,0.765049679,1.259994089,0.069748002,,
Llama-3-OffsetBias-RM-8B,Llama-3-OffsetBias-RM-8B_BT,Reward,0.765049679,1.386859931,0.075669848,,
ArmoRM-Llama3-8B-v0.1,ArmoRM-Llama3-8B-v0.1_BT,Reward,0.762711864,1.839870032,0.092372835,,
gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#good-yes-no_logprob-score_BT,TokenProbs,0.752191701,2.102594937,0.08401741,,
llama-3-70b-instruct,llama-3-70b-instruct#0-100_verbalized-score_BT,Numeric,0.74868498,1.273829005,0.08433286,,
Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#good-yes-no_logprob-score_BT,TokenProbs,0.74050263,0.598380841,0.0612823,,
mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#bad-good_textual-score_BT,Likert,0.738164816,2.534301905,0.107585602,,
llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#comparative-anchor-gpt-4-0314_BT,Anchor,0.729982466,3.577096075,0.112359855,,
mistral-large-instruct-2407,mistral-large-instruct-2407#comparative-anchor-gpt-4-0314_BT,Anchor,0.725306838,2.129762371,0.111101469,,
Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#bad-good_textual-score_BT,Likert,0.722969024,0.934857221,0.090203472,,
llama-3-1-70b-instruct,llama-3-1-70b-instruct#bad-good_textual-score_BT,Likert,0.721800117,3.901943148,0.120093327,,
internlm2-20b-reward,internlm2-20b-reward_BT,Reward,0.717124489,1.900369161,0.098389178,,
internlm2-7b-reward,internlm2-7b-reward_BT,Reward,0.71244886,2.3536645,0.113364304,,
GRM-Llama3.2-3B-rewardmodel-ft,GRM-Llama3.2-3B-rewardmodel-ft_BT,Reward,0.711279953,2.302320479,0.113801314,,
mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#good-yes-no_logprob-score_BT,TokenProbs,0.701928697,1.84957128,0.088445538,,
gpt-4o-2024-11-20,gpt-4o-2024-11-20#good-yes-no_logprob-score_BT,TokenProbs,0.700380036,2.224158523,0.093196512,,
llama-3-70b-instruct,llama-3-70b-instruct#bad-good_textual-score_BT,Likert,0.698421975,2.400124125,0.122002475,,
llama-3-1-70b-instruct,llama-3-1-70b-instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.687901812,2.711477731,0.12621163,,
Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.677381648,0.868408018,0.085109852,,
llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#good-yes-no_logprob-score_BT,TokenProbs,0.671537113,1.549707158,0.09227088,,
Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#0-100_verbalized-score_BT,Numeric,0.668030392,1.204521517,0.104312797,,
llama-3-70b-instruct,llama-3-70b-instruct#good-yes-no_logprob-score_BT,TokenProbs,0.663354763,0.774852442,0.07120683,,
gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#comparative-anchor-gpt-4-0314_BT,Anchor,0.658679135,1.412320144,0.110822533,,
mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#0-100_verbalized-score_BT,Numeric,0.656341321,1.270349906,0.102331076,,
mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#comparative-anchor-gpt-4-0314_BT,Anchor,0.655172414,1.167971623,0.101686017,,
mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#comparative-anchor-gpt-4-0314_BT,Anchor,0.641145529,1.497179025,0.13983279,,
llama-3-70b-instruct,llama-3-70b-instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.632963179,1.820241227,0.131968221,,
Eurus-RM-7b,Eurus-RM-7b_BT,Reward,0.628287551,2.492726583,0.138112675,,
mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#bad-good_textual-score_BT,Likert,0.589713618,0.838122343,0.110173865,,
mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#good-yes-no_logprob-score_BT,TokenProbs,0.427235535,0.738981983,0.107075052,,
mistral-large-instruct-2407,mistral-large-instruct-2407#good-yes-no_logprob-score_BT,TokenProbs,0.368790181,1.165216882,0.122587285,,