justinxzhao committed on
Commit
ca1e7f4
1 Parent(s): f2a6ef6

Initial version of AlpacaEval Visualizations

Browse files
Files changed (5) hide show
  1. .gitignore +2 -1
  2. app.py +233 -1
  3. data/model_win_rates.json +1 -0
  4. prep_data.py +92 -0
  5. requirements.txt +5 -0
.gitignore CHANGED
@@ -1 +1,2 @@
1
- env/
 
 
1
+ env/
2
+ submodules/
app.py CHANGED
@@ -1,9 +1,241 @@
1
  import streamlit as st
2
  import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
 
5
  def app():
6
- st.title("Alpaca Evaluation")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
 
9
  if __name__ == "__main__":
 
1
  import streamlit as st
2
  import pandas as pd
3
+ import plotly.express as px
4
+ import plotly.graph_objects as go
5
+ import statsmodels.api as sm
6
+
7
+ # Set the layout to wide
8
+ st.set_page_config(layout="wide")
9
+
10
+
def prep_rankings_table(df, y_column):
    """Build a ranked leaderboard table for a single metric column.

    Args:
        df: DataFrame containing at least the columns ``model_name``,
            ``num_words_mean`` and ``y_column``. It is not modified.
        y_column: Name of the metric column to rank by; larger values
            rank first.

    Returns:
        A new DataFrame with the columns ``rank``, ``model_name``,
        ``y_column`` and ``num_words_mean`` (in that order), sorted by
        ``y_column`` in descending order. ``rank`` starts at 1 and
        ``y_column`` is rounded to 2 decimals.
    """
    columns = ["model_name", y_column, "num_words_mean"]

    # Selecting columns and sorting both return new frames, so the caller's
    # DataFrame is never touched and no explicit copy is needed.
    # drop=True discards the old index instead of inserting it as a column:
    # inserting is wasted work here and raises ValueError when the index is
    # named like one of the selected columns (e.g. "model_name").
    ranked = (
        df[columns]
        .sort_values(y_column, ascending=False)
        .reset_index(drop=True)
    )

    # Round only after sorting so ties are broken on the unrounded values.
    ranked[y_column] = ranked[y_column].round(2)

    # The fresh 0-based index is the sort position; shift it to a 1-based rank
    # and place it as the first column.
    ranked.insert(0, "rank", ranked.index + 1)
    return ranked
37
 
38
 
# Y-axis title per metric column; any other column falls back to
# "Discrete Win Rate".
_Y_AXIS_TITLES = {
    "win_rate": "Win rate",
    "length_controlled_winrate": "LC Win Rate",
}


def _marker_trace(data, x_column, y_column, name, size, color, visible=True):
    """Return a marker-only scatter trace with model names as hover text."""
    return go.Scatter(
        x=data[x_column],
        y=data[y_column],
        mode="markers",
        name=name,
        text=data["model_name"],
        marker=dict(size=size, color=color),
        showlegend=True,
        visible=visible,
    )


def _add_trendline(fig, df, x, y, name, color, visibility="legendonly"):
    """Fit an OLS line of ``df[y]`` on ``df[x]``, add it to ``fig``, return R²."""
    # Sort by x so the line trace is drawn left to right as a single segment;
    # the fitted coefficients do not depend on row order.
    ordered = df.sort_values(x)
    design = sm.add_constant(ordered[x])
    model = sm.OLS(ordered[y], design).fit()
    fig.add_trace(
        go.Scatter(
            x=ordered[x],
            y=model.predict(design),
            mode="lines",
            name=f"{name} trendline",
            line=dict(color=color, width=2),
            visible=visibility,  # Control the initial visibility
        )
    )
    return model.rsquared


def _create_scatter_plot(df, y_column, selected_models, title):
    """Build the metric-vs-length figure for one metric column.

    Returns:
        Tuple ``(fig, r_squared_words, r_squared_tokens)`` where the R²
        values come from the word-length and token-length trendlines.
    """
    fig = go.Figure()

    # All models. The word-based series start hidden ("legendonly") and can be
    # toggled from the legend; the token-based series are shown by default.
    fig.add_trace(
        _marker_trace(
            df, "num_words_mean", y_column, "words", 5, "skyblue",
            visible="legendonly",
        )
    )
    fig.add_trace(
        _marker_trace(df, "num_tokens_mean", y_column, "tokens", 5, "orange")
    )

    # Highlight selected models with larger, darker markers.
    if selected_models:
        selected_data = df[df["model_name"].isin(selected_models)]
        fig.add_trace(
            _marker_trace(
                selected_data, "num_words_mean", y_column, "selected words",
                10, "blue", visible="legendonly",
            )
        )
        fig.add_trace(
            _marker_trace(
                selected_data, "num_tokens_mean", y_column, "selected tokens",
                10, "orangered",
            )
        )

    # Trendlines are always fitted on the full data set, not the selection.
    r_squared_words = _add_trendline(
        fig, df, "num_words_mean", y_column, "words", "blue"
    )
    r_squared_tokens = _add_trendline(
        fig, df, "num_tokens_mean", y_column, "tokens", "orangered",
        visibility=True,
    )

    fig.update_layout(
        xaxis_title="Mean length",
        yaxis_title=_Y_AXIS_TITLES.get(y_column, "Discrete Win Rate"),
        title=title,
        legend_title="Legend",
    )
    return fig, r_squared_words, r_squared_tokens


def _render_metric_tab(tab, df, fig, y_column, r_squared_words, r_squared_tokens):
    """Render one tab: chart on the left, rankings on the right, R² below."""
    with tab:
        chart_col, table_col = st.columns([3, 2])
        chart_col.plotly_chart(fig)
        table_col.markdown("#### Rankings")
        table_col.dataframe(
            prep_rankings_table(df, y_column),
            hide_index=True,
        )
        with st.expander("Trendline R²"):
            st.markdown(
                f"- R² (Words vs {y_column}): {r_squared_words:.2f}\n"
                f"- R² (Tokens vs {y_column}): {r_squared_tokens:.2f}"
            )


def app():
    """Render the AlpacaEval visualization page.

    Loads ``data/model_win_rates.json``, lets the user highlight a preset or
    custom group of models, and shows one tab per win-rate metric with a
    scatter plot against mean response length, a rankings table and the
    trendline R² values. The raw data is shown in a final expander.
    """
    st.title("AlpacaEval Visualizations")

    st.markdown("## Win rate vs. overall mean length")

    # Load the data
    df = pd.read_json("data/model_win_rates.json")

    # Add a model name column (taken from the index) for hover labels
    df["model_name"] = df.index.astype(str)

    # Preset groups: label -> case-insensitive regex matched on the model name.
    preset_patterns = {
        "gpt": "openai|gpt",
        "claude": "claude",
        "moa": "moa",
        "llama": "llama",
    }
    presets = {
        label: df.loc[
            df["model_name"].str.contains(pattern, case=False), "model_name"
        ].tolist()
        for label, pattern in preset_patterns.items()
    }

    # Radio button for preset groups; "custom" comes first so it is the default.
    preset_selection = st.radio(
        "Select a preset group of models or choose 'custom' to select manually",
        options=["custom", *presets],
    )

    # "custom" defers to a manual multiselect; presets use their matched names.
    if preset_selection == "custom":
        selected_models = st.multiselect(
            "Select models to highlight", options=df["model_name"].unique()
        )
    else:
        selected_models = presets[preset_selection]

    # (metric column, chart title, tab label), in display order.
    metrics = [
        ("length_controlled_winrate", "Length-Controlled Win Rate", "LC Win Rate"),
        ("win_rate", "Win Rate", "Win Rate"),
        ("discrete_win_rate", "Discrete Win Rate", "Discrete Win Rate"),
    ]

    # Create tabs for each chart
    tabs = st.tabs([tab_label for _, _, tab_label in metrics])

    for tab, (y_column, chart_title, _) in zip(tabs, metrics):
        fig, r_squared_words, r_squared_tokens = _create_scatter_plot(
            df, y_column, selected_models, chart_title
        )
        _render_metric_tab(
            tab, df, fig, y_column, r_squared_words, r_squared_tokens
        )

    with st.expander("Raw data"):
        st.dataframe(df)
239
 
240
 
241
  if __name__ == "__main__":
data/model_win_rates.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"num_words_mean":{"gpt-3.5-turbo-0301":136,"gpt-3.5-turbo-1106_verbose":166,"vicuna-13b-v1.5-togetherai":177,"Qwen1.5-1.8B-Chat":426,"recycled-wizardlm-7b-v1.0":235,"aligner-2b_claude-3-opus-20240229":257,"Qwen1.5-110B-Chat":253,"claude-3-opus-20240229":216,"llama-2-7b-chat-hf":241,"mistral-medium":241,"vicuna-33b-v1.3":237,"cohere":315,"claude-2":174,"guanaco-65b":203,"Mixtral-8x7B-Instruct-v0.1":238,"openchat-v2-w-13b":249,"falcon-7b-instruct":74,"wizardlm-13b-v1.1":244,"Meta-Llama-3-8B-Instruct":301,"FsfairX-Zephyr-Chat-v0.1":342,"Infinity-Instruct-3M-0613-Mistral-7B":79,"Qwen1.5-72B-Chat":243,"xwinlm-7b-v0.1":304,"Mixtral-8x22B-Instruct-v0.1":229,"vicuna-13b-v1.5":177,"dbrx-instruct":235,"zephyr-7b-alpha":208,"tulu-2-dpo-13b":257,"Qwen1.5-7B-Chat":254,"Together-MoA-Lite":297,"cut-13b":269,"Meta-Llama-3-70B-Instruct":299,"vicuna-13b-v1.3":189,"claude-instant-1.2":179,"airoboros-65b":236,"openbuddy-llama2-13b-v11.1":177,"phi-2":102,"Together-MoA":272,"mistral-large-2402":218,"openbuddy-llama-30b-v7.1":162,"TempNet-LLaMA2-Chat-70B-v0.1":296,"pairrm-tulu-2-13b":236,"recycled-wizardlm-7b-v2.0":251,"Storm-7B-best-of-64":336,"vicuna-7b":175,"claude-3-sonnet-20240229":221,"Mistral-7B-Instruct-v0.2":261,"Samba-CoE-v0.1":190,"claude":176,"Nanbeige2-8B-Chat":415,"REBEL-Llama-3-8B-Instruct":341,"chatglm2-6b":175,"gpt-4o-2024-05-13":286,"gpt4_1106_preview_verbose":378,"TempNet-LLaMA2-Chat-13B-v0.1":253,"text_davinci_001":50,"Mixtral-8x7B-Instruct-v0.1_verbose":336,"baize-v2-7b":189,"phi-2-dpo":270,"alpaca-farm-ppo-human":135,"Nanbeige2-16B-Chat":284,"gpt4_0613":183,"pythia-12b-mix-sft":147,"alpaca-7b-neft":170,"Qwen1.5-14B-Chat":252,"gpt-4-0125-preview":313,"guanaco-33b":220,"oasst-sft-llama-33b":125,"gpt4_0613_verbose":237,"llama-2-chat-7b-evol70k-neft":260,"gpt35_turbo_instruct":166,"platolm-7b":209,"llama-2-13b-chat-hf":249,"Nanbeige-Plus-Chat-v0.1":391,"openchat-v2-13b":249,"mistral-orpo-beta":263,"Snorkel-Mistral-PairRM-DPO-best-of-16":393,"tulu-2-dpo-7b":278,"alpaca
-7b_verbose":90,"OpenHermes-2.5-Mistral-7B":176,"claude-2.1_verbose":228,"ultralm-13b-v2.0":231,"deita-7b-v1.0":228,"minichat-1.5-3b":242,"Qwen-14B-Chat":167,"airoboros-33b":238,"alpaca-farm-ppo-sim-gpt4-20k":82,"ultralm-13b":181,"openbuddy-falcon-40b-v9":182,"openchat8192-13b":268,"wizardlm-13b":163,"vicuna-13b":175,"merlinite-7B-AOT":281,"gpt4_0314":215,"gpt4_0613_concise":99,"jina-chat":106,"Contextual-KTO-Mistral-PairRM":381,"xwinlm-13b-v0.1":300,"LMCocktail-10.7B-v1":182,"SPPO-Mistral7B-PairRM-ExPO":344,"Mixtral-8x7B-Instruct-v0.1_concise":144,"gpt4_1106_preview_concise":177,"Mistral-7B-ReMax-v0.1":234,"Llama-3-Instruct-8B-SimPO-ExPO":261,"dolphin-2.2.1-mistral-7b":182,"humpback-llama2-70b":178,"openpipe-moa-gpt-4-turbo-v1":272,"vicuna-7b-v1.5":181,"Starling-LM-7B-alpha":301,"falcon-40b-instruct":109,"Samba-CoE-v0.2-best-of-16":225,"opencoderplus-15b":262,"xwinlm-70b-v0.1":282,"wizardlm-13b-v1.2":264,"aligner-2b_qwen1.5-72b-chat":280,"internlm2-chat-7b-ExPO":364,"claude-2.1":177,"vicuna-7b-v1.3":184,"oasst-rlhf-llama-33b":181,"zephyr-7b-alpha-ExPO":201,"openchat-v3.1-13b":235,"SPPO-Llama-3-Instruct-8B-PairRM":317,"minotaur-13b":138,"tulu-2-dpo-13b-ExPO":264,"zephyr-7b-beta-ExPO":224,"tulu-2-dpo-7b-ExPO":277,"Llama-3-Instruct-8B-SimPO":272,"baize-v2-13b":155,"guanaco-7b":233,"ultralm-13b-v2.0-best-of-16":273,"claude-2.1_concise":91,"openchat-13b":260,"tulu-2-dpo-70b":231,"deepseek-llm-67b-chat":189,"humpback-llama-65b":196,"tulu-2-dpo-70b-ExPO":276,"TempNet-LLaMA2-Chat-7B-v0.1":246,"nous-hermes-13b":139,"gpt-3.5-turbo-0613":207,"alpaca-7b_concise":58,"baichuan-13b-chat":225,"claude-3-5-sonnet-20240620":228,"gpt-3.5-turbo-1106":126,"minichat-3b":145,"Storm-7B":300,"oasst-sft-pythia-12b":118,"Conifer-7B-DPO":205,"Snorkel-Mistral-PairRM-DPO":414,"internlm2-chat-20b-ExPO":507,"Samba-CoE-v0.2":210,"gemini-pro":228,"pairrm-tulu-2-70b":264,"text_davinci_003":52,"gpt4":215,"Yi-34B-Chat":339,"Starling-LM-7B-beta-ExPO":336,"pairrm-Yi-34B-Chat":349,"gpt4_1106_preview":323,
"evo-7b":280,"zephyr-7b-beta":229,"guanaco-13b":308,"alpaca-7b":66,"internlm2-chat-20b-ppo":371,"gemma-2b-it":165,"pairrm-zephyr-7b-beta":236,"evo-v2-7b":274,"causallm-14b":228,"SPPO-Mistral7B-PairRM":322,"gpt-3.5-turbo-1106_concise":68,"openbuddy-llama-65b-v8":194,"claude2-alpaca-13b":186,"Starling-LM-7B-alpha-ExPO":288,"openbuddy-falcon-7b-v6":192,"gemma-7b-it":176,"phi-2-sft":175,"gpt4_gamed":11,"llama-2-70b-chat-hf":292,"openbuddy-llama2-70b-v10.1":178,"wizardlm-70b":249,"ultralm-13b-best-of-16":311},"num_words_std":{"gpt-3.5-turbo-0301":109,"gpt-3.5-turbo-1106_verbose":110,"vicuna-13b-v1.5-togetherai":119,"Qwen1.5-1.8B-Chat":358,"recycled-wizardlm-7b-v1.0":129,"aligner-2b_claude-3-opus-20240229":145,"Qwen1.5-110B-Chat":153,"claude-3-opus-20240229":113,"llama-2-7b-chat-hf":134,"mistral-medium":166,"vicuna-33b-v1.3":149,"cohere":179,"claude-2":90,"guanaco-65b":157,"Mixtral-8x7B-Instruct-v0.1":163,"openchat-v2-w-13b":151,"falcon-7b-instruct":92,"wizardlm-13b-v1.1":156,"Meta-Llama-3-8B-Instruct":167,"FsfairX-Zephyr-Chat-v0.1":187,"Infinity-Instruct-3M-0613-Mistral-7B":78,"Qwen1.5-72B-Chat":144,"xwinlm-7b-v0.1":207,"Mixtral-8x22B-Instruct-v0.1":168,"vicuna-13b-v1.5":122,"dbrx-instruct":143,"zephyr-7b-alpha":189,"tulu-2-dpo-13b":309,"Qwen1.5-7B-Chat":143,"Together-MoA-Lite":161,"cut-13b":159,"Meta-Llama-3-70B-Instruct":171,"vicuna-13b-v1.3":119,"claude-instant-1.2":104,"airoboros-65b":288,"openbuddy-llama2-13b-v11.1":122,"phi-2":209,"Together-MoA":151,"mistral-large-2402":148,"openbuddy-llama-30b-v7.1":115,"TempNet-LLaMA2-Chat-70B-v0.1":196,"pairrm-tulu-2-13b":135,"recycled-wizardlm-7b-v2.0":134,"Storm-7B-best-of-64":195,"vicuna-7b":124,"claude-3-sonnet-20240229":129,"Mistral-7B-Instruct-v0.2":201,"Samba-CoE-v0.1":138,"claude":97,"Nanbeige2-8B-Chat":173,"REBEL-Llama-3-8B-Instruct":273,"chatglm2-6b":225,"gpt-4o-2024-05-13":190,"gpt4_1106_preview_verbose":190,"TempNet-LLaMA2-Chat-13B-v0.1":136,"text_davinci_001":51,"Mixtral-8x7B-Instruct-v0.1_verbose":169,"baize-v2-7b"
:133,"phi-2-dpo":137,"alpaca-farm-ppo-human":125,"Nanbeige2-16B-Chat":151,"gpt4_0613":124,"pythia-12b-mix-sft":122,"alpaca-7b-neft":82,"Qwen1.5-14B-Chat":145,"gpt-4-0125-preview":180,"guanaco-33b":170,"oasst-sft-llama-33b":115,"gpt4_0613_verbose":127,"llama-2-chat-7b-evol70k-neft":113,"gpt35_turbo_instruct":157,"platolm-7b":145,"llama-2-13b-chat-hf":135,"Nanbeige-Plus-Chat-v0.1":159,"openchat-v2-13b":151,"mistral-orpo-beta":161,"Snorkel-Mistral-PairRM-DPO-best-of-16":185,"tulu-2-dpo-7b":415,"alpaca-7b_verbose":68,"OpenHermes-2.5-Mistral-7B":154,"claude-2.1_verbose":94,"ultralm-13b-v2.0":137,"deita-7b-v1.0":160,"minichat-1.5-3b":157,"Qwen-14B-Chat":124,"airoboros-33b":246,"alpaca-farm-ppo-sim-gpt4-20k":53,"ultralm-13b":108,"openbuddy-falcon-40b-v9":124,"openchat8192-13b":182,"wizardlm-13b":117,"vicuna-13b":115,"merlinite-7B-AOT":128,"gpt4_0314":160,"gpt4_0613_concise":79,"jina-chat":76,"Contextual-KTO-Mistral-PairRM":205,"xwinlm-13b-v0.1":188,"LMCocktail-10.7B-v1":118,"SPPO-Mistral7B-PairRM-ExPO":144,"Mixtral-8x7B-Instruct-v0.1_concise":131,"gpt4_1106_preview_concise":130,"Mistral-7B-ReMax-v0.1":132,"Llama-3-Instruct-8B-SimPO-ExPO":134,"dolphin-2.2.1-mistral-7b":167,"humpback-llama2-70b":133,"openpipe-moa-gpt-4-turbo-v1":137,"vicuna-7b-v1.5":117,"Starling-LM-7B-alpha":194,"falcon-40b-instruct":111,"Samba-CoE-v0.2-best-of-16":140,"opencoderplus-15b":194,"xwinlm-70b-v0.1":169,"wizardlm-13b-v1.2":188,"aligner-2b_qwen1.5-72b-chat":151,"internlm2-chat-7b-ExPO":172,"claude-2.1":92,"vicuna-7b-v1.3":117,"oasst-rlhf-llama-33b":174,"zephyr-7b-alpha-ExPO":149,"openchat-v3.1-13b":156,"SPPO-Llama-3-Instruct-8B-PairRM":148,"minotaur-13b":109,"tulu-2-dpo-13b-ExPO":139,"zephyr-7b-beta-ExPO":157,"tulu-2-dpo-7b-ExPO":144,"Llama-3-Instruct-8B-SimPO":141,"baize-v2-13b":110,"guanaco-7b":200,"ultralm-13b-v2.0-best-of-16":131,"claude-2.1_concise":68,"openchat-13b":168,"tulu-2-dpo-70b":151,"deepseek-llm-67b-chat":122,"humpback-llama-65b":122,"tulu-2-dpo-70b-ExPO":140,"TempNet-LLaMA2-Chat-7B
-v0.1":140,"nous-hermes-13b":130,"gpt-3.5-turbo-0613":149,"alpaca-7b_concise":45,"baichuan-13b-chat":301,"claude-3-5-sonnet-20240620":142,"gpt-3.5-turbo-1106":103,"minichat-3b":119,"Storm-7B":135,"oasst-sft-pythia-12b":120,"Conifer-7B-DPO":126,"Snorkel-Mistral-PairRM-DPO":241,"internlm2-chat-20b-ExPO":206,"Samba-CoE-v0.2":138,"gemini-pro":164,"pairrm-tulu-2-70b":212,"text_davinci_003":66,"gpt4":156,"Yi-34B-Chat":178,"Starling-LM-7B-beta-ExPO":106,"pairrm-Yi-34B-Chat":189,"gpt4_1106_preview":181,"evo-7b":176,"zephyr-7b-beta":160,"guanaco-13b":288,"alpaca-7b":52,"internlm2-chat-20b-ppo":268,"gemma-2b-it":128,"pairrm-zephyr-7b-beta":178,"evo-v2-7b":170,"causallm-14b":166,"SPPO-Mistral7B-PairRM":148,"gpt-3.5-turbo-1106_concise":57,"openbuddy-llama-65b-v8":113,"claude2-alpaca-13b":127,"Starling-LM-7B-alpha-ExPO":174,"openbuddy-falcon-7b-v6":126,"gemma-7b-it":118,"phi-2-sft":123,"gpt4_gamed":60,"llama-2-70b-chat-hf":174,"openbuddy-llama2-70b-v10.1":118,"wizardlm-70b":144,"ultralm-13b-best-of-16":132},"win_rate":{"gpt-3.5-turbo-0301":9.6224532951,"gpt-3.5-turbo-1106_verbose":12.7631698103,"vicuna-13b-v1.5-togetherai":6.9582753694,"Qwen1.5-1.8B-Chat":3.7055568157,"recycled-wizardlm-7b-v1.0":6.6327499605,"aligner-2b_claude-3-opus-20240229":34.4633736232,"Qwen1.5-110B-Chat":33.7770952757,"claude-3-opus-20240229":29.1052695333,"llama-2-7b-chat-hf":4.9613395472,"mistral-medium":21.8557725437,"vicuna-33b-v1.3":12.7059479215,"cohere":12.9014552097,"claude-2":17.1882403567,"guanaco-65b":6.8584945134,"Mixtral-8x7B-Instruct-v0.1":18.2553176264,"openchat-v2-w-13b":9.6153441584,"falcon-7b-instruct":2.1466175532,"wizardlm-13b-v1.1":11.2339095729,"Meta-Llama-3-8B-Instruct":22.5699026093,"FsfairX-Zephyr-Chat-v0.1":35.9464864409,"Infinity-Instruct-3M-0613-Mistral-7B":15.7478281307,"Qwen1.5-72B-Chat":26.4982833956,"xwinlm-7b-v0.1":11.2456517378,"Mixtral-8x22B-Instruct-v0.1":22.2101705475,"vicuna-13b-v1.5":6.7221220149,"dbrx-instruct":19.7553327319,"zephyr-7b-alpha":8.3526639682,"tulu-2-dpo
-13b":10.1197883883,"Qwen1.5-7B-Chat":11.7709270696,"Together-MoA-Lite":56.5930456223,"cut-13b":10.7790892025,"Meta-Llama-3-70B-Instruct":33.1778569588,"vicuna-13b-v1.3":7.1372403865,"claude-instant-1.2":16.1273996216,"airoboros-65b":9.3889501497,"openbuddy-llama2-13b-v11.1":6.1747164895,"phi-2":2.350209543,"Together-MoA":59.8688062333,"mistral-large-2402":21.4387759814,"openbuddy-llama-30b-v7.1":6.130014614,"TempNet-LLaMA2-Chat-70B-v0.1":15.0518944202,"pairrm-tulu-2-13b":13.8319010168,"recycled-wizardlm-7b-v2.0":7.3371293705,"Storm-7B-best-of-64":63.0409907519,"vicuna-7b":4.1626111623,"claude-3-sonnet-20240229":25.5563252923,"Mistral-7B-Instruct-v0.2":14.7227726577,"Samba-CoE-v0.1":16.8355018701,"claude":16.9853436124,"Nanbeige2-8B-Chat":39.354502072,"REBEL-Llama-3-8B-Instruct":34.3064238313,"chatglm2-6b":2.7621847965,"gpt-4o-2024-05-13":51.3275757825,"gpt4_1106_preview_verbose":64.303601471,"TempNet-LLaMA2-Chat-13B-v0.1":7.7284050659,"text_davinci_001":2.7640052311,"Mixtral-8x7B-Instruct-v0.1_verbose":24.6140630502,"baize-v2-7b":3.4048149775,"phi-2-dpo":7.7570957018,"alpaca-farm-ppo-human":4.100426815,"Nanbeige2-16B-Chat":37.0360860499,"gpt4_0613":15.7550380876,"pythia-12b-mix-sft":2.578090281,"alpaca-7b-neft":3.1321786695,"Qwen1.5-14B-Chat":18.6458143619,"gpt-4-0125-preview":54.9665397329,"guanaco-33b":5.002493725,"oasst-sft-llama-33b":4.7703909916,"gpt4_0613_verbose":23.2373600435,"llama-2-chat-7b-evol70k-neft":7.6023835122,"gpt35_turbo_instruct":8.4624465044,"platolm-7b":6.3208280585,"llama-2-13b-chat-hf":7.7023099579,"Nanbeige-Plus-Chat-v0.1":56.7030097302,"openchat-v2-13b":8.4350756447,"mistral-orpo-beta":12.5654087946,"Snorkel-Mistral-PairRM-DPO-best-of-16":34.8601328913,"tulu-2-dpo-7b":8.1975153845,"alpaca-7b_verbose":2.9331016025,"OpenHermes-2.5-Mistral-7B":10.3404157058,"claude-2.1_verbose":24.3540710901,"ultralm-13b-v2.0":7.5046229557,"deita-7b-v1.0":12.6466394724,"minichat-1.5-3b":6.5534430528,"Qwen-14B-Chat":7.5023334847,"airoboros-33b":9.0531603961,"a
lpaca-farm-ppo-sim-gpt4-20k":3.4503419871,"ultralm-13b":5.0745903805,"openbuddy-falcon-40b-v9":5.9557428463,"openchat8192-13b":7.472766808,"wizardlm-13b":5.8781525894,"vicuna-13b":5.8311031845,"merlinite-7B-AOT":29.8963508407,"gpt4_0314":22.0732589287,"gpt4_0613_concise":9.4003205746,"jina-chat":7.7861303934,"Contextual-KTO-Mistral-PairRM":33.2273552,"xwinlm-13b-v0.1":17.4279347502,"LMCocktail-10.7B-v1":13.1534309174,"SPPO-Mistral7B-PairRM-ExPO":35.4431306717,"Mixtral-8x7B-Instruct-v0.1_concise":13.7440401548,"gpt4_1106_preview_concise":22.9201944405,"Mistral-7B-ReMax-v0.1":15.999331369,"Llama-3-Instruct-8B-SimPO-ExPO":40.6328540086,"dolphin-2.2.1-mistral-7b":9.0397997282,"humpback-llama2-70b":10.1217715026,"openpipe-moa-gpt-4-turbo-v1":63.1549345123,"vicuna-7b-v1.5":4.7974939392,"Starling-LM-7B-alpha":14.2459235216,"falcon-40b-instruct":3.3429188225,"Samba-CoE-v0.2-best-of-16":26.9882543183,"opencoderplus-15b":7.406222451,"xwinlm-70b-v0.1":21.8129570739,"wizardlm-13b-v1.2":12.0274803428,"aligner-2b_qwen1.5-72b-chat":31.773037737,"internlm2-chat-7b-ExPO":28.0678174371,"claude-2.1":15.7335067364,"vicuna-7b-v1.3":4.6425118575,"oasst-rlhf-llama-33b":6.2964347858,"zephyr-7b-alpha-ExPO":10.5593543457,"openchat-v3.1-13b":11.0822304894,"SPPO-Llama-3-Instruct-8B-PairRM":39.6728609061,"minotaur-13b":5.7389636691,"tulu-2-dpo-13b-ExPO":15.5514054294,"zephyr-7b-beta-ExPO":11.0611168323,"tulu-2-dpo-7b-ExPO":11.529221039,"Llama-3-Instruct-8B-SimPO":40.5297749846,"baize-v2-13b":4.5905453306,"guanaco-7b":2.8800022662,"ultralm-13b-v2.0-best-of-16":13.8533734712,"claude-2.1_concise":9.2271252406,"openchat-13b":8.0223860109,"tulu-2-dpo-70b":15.9828543741,"deepseek-llm-67b-chat":12.0934222649,"humpback-llama-65b":9.4251390478,"tulu-2-dpo-70b-ExPO":22.9806197059,"TempNet-LLaMA2-Chat-7B-v0.1":5.4301432647,"nous-hermes-13b":5.4118789332,"gpt-3.5-turbo-0613":14.0957985739,"alpaca-7b_concise":1.9911763835,"baichuan-13b-chat":1.9921455615,"claude-3-5-sonnet-20240620":40.5602140968,"gpt-3.5-t
urbo-1106":9.177964562,"minichat-3b":3.0071507064,"Storm-7B":50.2688690553,"oasst-sft-pythia-12b":1.7901140832,"Conifer-7B-DPO":11.3135856492,"Snorkel-Mistral-PairRM-DPO":30.2200527007,"internlm2-chat-20b-ExPO":46.1853674689,"Samba-CoE-v0.2":21.8473786693,"gemini-pro":18.1776445406,"pairrm-tulu-2-70b":18.6389629674,"text_davinci_003":1.9621476654,"gpt4":23.5767893148,"Yi-34B-Chat":29.6599467188,"Starling-LM-7B-beta-ExPO":29.6008518479,"pairrm-Yi-34B-Chat":31.2412829468,"gpt4_1106_preview":50.0,"evo-7b":15.5774373995,"zephyr-7b-beta":10.9928857554,"guanaco-13b":3.4695968597,"alpaca-7b":2.5914505402,"internlm2-chat-20b-ppo":21.7491545005,"gemma-2b-it":3.4019714381,"pairrm-zephyr-7b-beta":12.8412782556,"evo-v2-7b":20.8341130226,"causallm-14b":11.14616087,"SPPO-Mistral7B-PairRM":32.2453123638,"gpt-3.5-turbo-1106_concise":7.4158649776,"openbuddy-llama-65b-v8":8.7706501509,"claude2-alpaca-13b":7.4373513248,"Starling-LM-7B-alpha-ExPO":18.1797559203,"openbuddy-falcon-7b-v6":3.521174372,"gemma-7b-it":6.9372943797,"phi-2-sft":3.9775677752,"gpt4_gamed":3.7383373714,"llama-2-70b-chat-hf":13.8882583437,"openbuddy-llama2-70b-v10.1":8.0964220963,"wizardlm-70b":14.3838960868,"ultralm-13b-best-of-16":11.3073149478},"standard_error":{"gpt-3.5-turbo-0301":0.9129656687,"gpt-3.5-turbo-1106_verbose":1.0442468192,"vicuna-13b-v1.5-togetherai":0.7825381738,"Qwen1.5-1.8B-Chat":0.5811750995,"recycled-wizardlm-7b-v1.0":0.7713329914,"aligner-2b_claude-3-opus-20240229":1.3146665263,"Qwen1.5-110B-Chat":1.3776163154,"claude-3-opus-20240229":1.3941539442,"llama-2-7b-chat-hf":0.6691754517,"mistral-medium":1.2682402187,"vicuna-33b-v1.3":0.9992557843,"cohere":1.0141034031,"claude-2":1.1748282562,"guanaco-65b":0.8048449272,"Mixtral-8x7B-Instruct-v0.1":1.1885585969,"openchat-v2-w-13b":0.8908241711,"falcon-7b-instruct":0.4542257929,"wizardlm-13b-v1.1":0.9502711246,"Meta-Llama-3-8B-Instruct":1.2575802331,"FsfairX-Zephyr-Chat-v0.1":1.4410058098,"Infinity-Instruct-3M-0613-Mistral-7B":1.1194852006,"Qwen1.5-7
2B-Chat":1.3042361649,"xwinlm-7b-v0.1":0.9455447881,"Mixtral-8x22B-Instruct-v0.1":1.2780740057,"vicuna-13b-v1.5":0.7674173991,"dbrx-instruct":1.2063251121,"zephyr-7b-alpha":0.8664491645,"tulu-2-dpo-13b":0.929813366,"Qwen1.5-7B-Chat":0.9544463489,"Together-MoA-Lite":1.4464848562,"cut-13b":0.9428953579,"Meta-Llama-3-70B-Instruct":1.3886514096,"vicuna-13b-v1.3":0.7846846272,"claude-instant-1.2":1.1341036838,"airoboros-65b":0.8816208133,"openbuddy-llama2-13b-v11.1":0.753544387,"phi-2":0.4496590406,"Together-MoA":1.4343056045,"mistral-large-2402":1.2485232545,"openbuddy-llama-30b-v7.1":0.7645283386,"TempNet-LLaMA2-Chat-70B-v0.1":1.0801507581,"pairrm-tulu-2-13b":1.0835284665,"recycled-wizardlm-7b-v2.0":0.8012012288,"Storm-7B-best-of-64":1.4253258915,"vicuna-7b":0.6135107768,"claude-3-sonnet-20240229":1.3419811052,"Mistral-7B-Instruct-v0.2":1.0785266447,"Samba-CoE-v0.1":1.1180386125,"claude":1.1687959793,"Nanbeige2-8B-Chat":1.4524224246,"REBEL-Llama-3-8B-Instruct":1.3914900256,"chatglm2-6b":0.5020758951,"gpt-4o-2024-05-13":1.470009459,"gpt4_1106_preview_verbose":1.3348590089,"TempNet-LLaMA2-Chat-13B-v0.1":0.8268032188,"text_davinci_001":0.5177668864,"Mixtral-8x7B-Instruct-v0.1_verbose":1.2975757386,"baize-v2-7b":0.5826293992,"phi-2-dpo":0.8357079426,"alpaca-farm-ppo-human":0.6304721407,"Nanbeige2-16B-Chat":1.4340261273,"gpt4_0613":1.0754642482,"pythia-12b-mix-sft":0.5127326717,"alpaca-7b-neft":0.5522241753,"Qwen1.5-14B-Chat":1.1351340211,"gpt-4-0125-preview":1.4286740089,"guanaco-33b":0.6697115752,"oasst-sft-llama-33b":0.6385940189,"gpt4_0613_verbose":1.2835395056,"llama-2-chat-7b-evol70k-neft":0.8110538776,"gpt35_turbo_instruct":0.8724086934,"platolm-7b":0.7405704765,"llama-2-13b-chat-hf":0.8286143394,"Nanbeige-Plus-Chat-v0.1":1.482841875,"openchat-v2-13b":0.8235980231,"mistral-orpo-beta":0.9929774686,"Snorkel-Mistral-PairRM-DPO-best-of-16":1.3599450437,"tulu-2-dpo-7b":0.8749615125,"alpaca-7b_verbose":0.5302092824,"OpenHermes-2.5-Mistral-7B":0.9356553899,"claude-2.1_verbo
se":1.29358621,"ultralm-13b-v2.0":0.8150376948,"deita-7b-v1.0":1.0352555321,"minichat-1.5-3b":0.7674159339,"Qwen-14B-Chat":0.8147265702,"airoboros-33b":0.8607792116,"alpaca-farm-ppo-sim-gpt4-20k":0.5834901038,"ultralm-13b":0.6707048924,"openbuddy-falcon-40b-v9":0.7388621614,"openchat8192-13b":0.8038094305,"wizardlm-13b":0.704420227,"vicuna-13b":0.7422829864,"merlinite-7B-AOT":1.3666520485,"gpt4_0314":1.2466725495,"gpt4_0613_concise":0.9010212759,"jina-chat":0.8398450576,"Contextual-KTO-Mistral-PairRM":1.3779687478,"xwinlm-13b-v0.1":1.1450161467,"LMCocktail-10.7B-v1":1.0457195357,"SPPO-Mistral7B-PairRM-ExPO":1.3981308966,"Mixtral-8x7B-Instruct-v0.1_concise":1.0718682992,"gpt4_1106_preview_concise":1.2325177143,"Mistral-7B-ReMax-v0.1":1.1288683901,"Llama-3-Instruct-8B-SimPO-ExPO":1.4439449942,"dolphin-2.2.1-mistral-7b":0.8892901247,"humpback-llama2-70b":0.9401806122,"openpipe-moa-gpt-4-turbo-v1":1.4229800988,"vicuna-7b-v1.5":0.6655960677,"Starling-LM-7B-alpha":1.0685460609,"falcon-40b-instruct":0.5541127159,"Samba-CoE-v0.2-best-of-16":1.318903,"opencoderplus-15b":0.8024858021,"xwinlm-70b-v0.1":1.2303274476,"wizardlm-13b-v1.2":0.9717618177,"aligner-2b_qwen1.5-72b-chat":1.2392772646,"internlm2-chat-7b-ExPO":1.3159792318,"claude-2.1":1.1203158654,"vicuna-7b-v1.3":0.6420919828,"oasst-rlhf-llama-33b":0.7417944201,"zephyr-7b-alpha-ExPO":0.9774634449,"openchat-v3.1-13b":0.9501308701,"SPPO-Llama-3-Instruct-8B-PairRM":1.4247223562,"minotaur-13b":0.7271241247,"tulu-2-dpo-13b-ExPO":1.1714853384,"zephyr-7b-beta-ExPO":1.0204784889,"tulu-2-dpo-7b-ExPO":1.0497814893,"Llama-3-Instruct-8B-SimPO":1.4225744647,"baize-v2-13b":0.6497033227,"guanaco-7b":0.5202924149,"ultralm-13b-v2.0-best-of-16":1.049344706,"claude-2.1_concise":0.8921752289,"openchat-13b":0.8368334957,"tulu-2-dpo-70b":1.1457861368,"deepseek-llm-67b-chat":1.0173843633,"humpback-llama-65b":0.9300866723,"tulu-2-dpo-70b-ExPO":1.3591734083,"TempNet-LLaMA2-Chat-7B-v0.1":0.7210775889,"nous-hermes-13b":0.7081240036,"gpt-3.5-turbo-
0613":1.0371186215,"alpaca-7b_concise":0.4437510224,"baichuan-13b-chat":0.4176985079,"claude-3-5-sonnet-20240620":1.4679655404,"gpt-3.5-turbo-1106":0.8904117512,"minichat-3b":0.5041245962,"Storm-7B":1.4728176781,"oasst-sft-pythia-12b":0.3985580883,"Conifer-7B-DPO":0.9870897936,"Snorkel-Mistral-PairRM-DPO":1.3328273013,"internlm2-chat-20b-ExPO":1.4638315246,"Samba-CoE-v0.2":1.2171089783,"gemini-pro":1.1588503791,"pairrm-tulu-2-70b":1.19249667,"text_davinci_003":0.4346747594,"gpt4":1.2757042012,"Yi-34B-Chat":1.3225712598,"Starling-LM-7B-beta-ExPO":1.3252049543,"pairrm-Yi-34B-Chat":1.3482437399,"gpt4_1106_preview":0.0,"evo-7b":1.0835570389,"zephyr-7b-beta":0.9617876718,"guanaco-13b":0.5518606726,"alpaca-7b":0.4870855383,"internlm2-chat-20b-ppo":1.244366241,"gemma-2b-it":0.538998125,"pairrm-zephyr-7b-beta":1.0535874942,"evo-v2-7b":1.2159901798,"causallm-14b":0.9544127301,"SPPO-Mistral7B-PairRM":1.390800011,"gpt-3.5-turbo-1106_concise":0.8374438114,"openbuddy-llama-65b-v8":0.8871992619,"claude2-alpaca-13b":0.8249428868,"Starling-LM-7B-alpha-ExPO":1.2498324796,"openbuddy-falcon-7b-v6":0.5655836443,"gemma-7b-it":0.7869665732,"phi-2-sft":0.6098271417,"gpt4_gamed":0.6278799634,"llama-2-70b-chat-hf":1.0799847727,"openbuddy-llama2-70b-v10.1":0.8498371494,"wizardlm-70b":1.0395048913,"ultralm-13b-best-of-16":0.9418434059},"n_wins":{"gpt-3.5-turbo-0301":71.0,"gpt-3.5-turbo-1106_verbose":94.0,"vicuna-13b-v1.5-togetherai":53.0,"Qwen1.5-1.8B-Chat":27.0,"recycled-wizardlm-7b-v1.0":53.0,"aligner-2b_claude-3-opus-20240229":225.0,"Qwen1.5-110B-Chat":255.0,"claude-3-opus-20240229":223.0,"llama-2-7b-chat-hf":38.0,"mistral-medium":164.0,"vicuna-33b-v1.3":90.0,"cohere":96.0,"claude-2":131.0,"guanaco-65b":54.0,"Mixtral-8x7B-Instruct-v0.1":135.0,"openchat-v2-w-13b":67.0,"falcon-7b-instruct":16.0,"wizardlm-13b-v1.1":79.0,"Meta-Llama-3-8B-Instruct":176.0,"FsfairX-Zephyr-Chat-v0.1":285.0,"Infinity-Instruct-3M-0613-Mistral-7B":118.0,"Qwen1.5-72B-Chat":201.0,"xwinlm-7b-v0.1":77.0,"Mixtral-8x22B-In
struct-v0.1":174.0,"vicuna-13b-v1.5":48.0,"dbrx-instruct":147.0,"zephyr-7b-alpha":59.0,"tulu-2-dpo-13b":75.0,"Qwen1.5-7B-Chat":80.0,"Together-MoA-Lite":456.0,"cut-13b":83.0,"Meta-Llama-3-70B-Instruct":266.0,"vicuna-13b-v1.3":50.0,"claude-instant-1.2":120.0,"airoboros-65b":67.0,"openbuddy-llama2-13b-v11.1":42.0,"phi-2":15.0,"Together-MoA":490.0,"mistral-large-2402":166.0,"openbuddy-llama-30b-v7.1":47.0,"TempNet-LLaMA2-Chat-70B-v0.1":111.0,"pairrm-tulu-2-13b":110.0,"recycled-wizardlm-7b-v2.0":50.0,"Storm-7B-best-of-64":519.0,"vicuna-7b":28.0,"claude-3-sonnet-20240229":193.0,"Mistral-7B-Instruct-v0.2":113.0,"Samba-CoE-v0.1":124.0,"claude":129.0,"Nanbeige2-8B-Chat":323.0,"REBEL-Llama-3-8B-Instruct":268.0,"chatglm2-6b":19.0,"gpt-4o-2024-05-13":429.0,"gpt4_1106_preview_verbose":525.0,"TempNet-LLaMA2-Chat-13B-v0.1":56.0,"text_davinci_001":23.0,"Mixtral-8x7B-Instruct-v0.1_verbose":194.0,"baize-v2-7b":26.0,"phi-2-dpo":57.0,"alpaca-farm-ppo-human":32.0,"Nanbeige2-16B-Chat":288.0,"gpt4_0613":117.0,"pythia-12b-mix-sft":19.0,"alpaca-7b-neft":22.0,"Qwen1.5-14B-Chat":137.0,"gpt-4-0125-preview":446.0,"guanaco-33b":37.0,"oasst-sft-llama-33b":36.0,"gpt4_0613_verbose":171.0,"llama-2-chat-7b-evol70k-neft":57.0,"gpt35_turbo_instruct":66.0,"platolm-7b":42.0,"llama-2-13b-chat-hf":60.0,"Nanbeige-Plus-Chat-v0.1":456.0,"openchat-v2-13b":56.0,"mistral-orpo-beta":95.0,"Snorkel-Mistral-PairRM-DPO-best-of-16":270.0,"tulu-2-dpo-7b":64.0,"alpaca-7b_verbose":22.0,"OpenHermes-2.5-Mistral-7B":75.0,"claude-2.1_verbose":191.0,"ultralm-13b-v2.0":51.0,"deita-7b-v1.0":96.0,"minichat-1.5-3b":48.0,"Qwen-14B-Chat":57.0,"airoboros-33b":64.0,"alpaca-farm-ppo-sim-gpt4-20k":26.0,"ultralm-13b":38.0,"openbuddy-falcon-40b-v9":45.0,"openchat8192-13b":51.0,"wizardlm-13b":42.0,"vicuna-13b":44.0,"merlinite-7B-AOT":234.0,"gpt4_0314":172.0,"gpt4_0613_concise":71.0,"jina-chat":59.0,"Contextual-KTO-Mistral-PairRM":260.0,"xwinlm-13b-v0.1":129.0,"LMCocktail-10.7B-v1":104.0,"SPPO-Mistral7B-PairRM-ExPO":274.0,"Mixtral-8x7B-Ins
truct-v0.1_concise":105.0,"gpt4_1106_preview_concise":172.0,"Mistral-7B-ReMax-v0.1":120.0,"Llama-3-Instruct-8B-SimPO-ExPO":325.0,"dolphin-2.2.1-mistral-7b":68.0,"humpback-llama2-70b":77.0,"openpipe-moa-gpt-4-turbo-v1":515.0,"vicuna-7b-v1.5":35.0,"Starling-LM-7B-alpha":102.0,"falcon-40b-instruct":27.0,"Samba-CoE-v0.2-best-of-16":201.0,"opencoderplus-15b":52.0,"xwinlm-70b-v0.1":166.0,"wizardlm-13b-v1.2":82.0,"aligner-2b_qwen1.5-72b-chat":180.0,"internlm2-chat-7b-ExPO":209.0,"claude-2.1":115.0,"vicuna-7b-v1.3":31.0,"oasst-rlhf-llama-33b":44.0,"zephyr-7b-alpha-ExPO":79.0,"openchat-v3.1-13b":80.0,"SPPO-Llama-3-Instruct-8B-PairRM":310.0,"minotaur-13b":42.0,"tulu-2-dpo-13b-ExPO":121.0,"zephyr-7b-beta-ExPO":89.0,"tulu-2-dpo-7b-ExPO":91.0,"Llama-3-Instruct-8B-SimPO":319.0,"baize-v2-13b":32.0,"guanaco-7b":21.0,"ultralm-13b-v2.0-best-of-16":98.0,"claude-2.1_concise":72.0,"openchat-13b":58.0,"tulu-2-dpo-70b":119.0,"deepseek-llm-67b-chat":90.0,"humpback-llama-65b":70.0,"tulu-2-dpo-70b-ExPO":184.0,"TempNet-LLaMA2-Chat-7B-v0.1":39.0,"nous-hermes-13b":43.0,"gpt-3.5-turbo-0613":99.0,"alpaca-7b_concise":15.0,"baichuan-13b-chat":14.0,"claude-3-5-sonnet-20240620":312.0,"gpt-3.5-turbo-1106":64.0,"minichat-3b":22.0,"Storm-7B":397.0,"oasst-sft-pythia-12b":13.0,"Conifer-7B-DPO":87.0,"Snorkel-Mistral-PairRM-DPO":231.0,"internlm2-chat-20b-ExPO":375.0,"Samba-CoE-v0.2":159.0,"gemini-pro":135.0,"pairrm-tulu-2-70b":140.0,"text_davinci_003":14.0,"gpt4":179.0,"Yi-34B-Chat":219.0,"Starling-LM-7B-beta-ExPO":225.0,"pairrm-Yi-34B-Chat":239.0,"gpt4_1106_preview":0.0,"evo-7b":112.0,"zephyr-7b-beta":78.0,"guanaco-13b":22.0,"alpaca-7b":17.0,"internlm2-chat-20b-ppo":170.0,"gemma-2b-it":23.0,"pairrm-zephyr-7b-beta":98.0,"evo-v2-7b":158.0,"causallm-14b":81.0,"SPPO-Mistral7B-PairRM":249.0,"gpt-3.5-turbo-1106_concise":57.0,"openbuddy-llama-65b-v8":64.0,"claude2-alpaca-13b":59.0,"Starling-LM-7B-alpha-ExPO":148.0,"openbuddy-falcon-7b-v6":27.0,"gemma-7b-it":50.0,"phi-2-sft":28.0,"gpt4_gamed":32.0,"llama-2-70b-cha
t-hf":104.0,"openbuddy-llama2-70b-v10.1":57.0,"wizardlm-70b":106.0,"ultralm-13b-best-of-16":80.0},"n_wins_base":{"gpt-3.5-turbo-0301":733.0,"gpt-3.5-turbo-1106_verbose":709.0,"vicuna-13b-v1.5-togetherai":747.0,"Qwen1.5-1.8B-Chat":774.0,"recycled-wizardlm-7b-v1.0":752.0,"aligner-2b_claude-3-opus-20240229":475.0,"Qwen1.5-110B-Chat":545.0,"claude-3-opus-20240229":579.0,"llama-2-7b-chat-hf":766.0,"mistral-medium":639.0,"vicuna-33b-v1.3":711.0,"cohere":709.0,"claude-2":673.0,"guanaco-65b":751.0,"Mixtral-8x7B-Instruct-v0.1":668.0,"openchat-v2-w-13b":736.0,"falcon-7b-instruct":787.0,"wizardlm-13b-v1.1":723.0,"Meta-Llama-3-8B-Instruct":626.0,"FsfairX-Zephyr-Chat-v0.1":517.0,"Infinity-Instruct-3M-0613-Mistral-7B":687.0,"Qwen1.5-72B-Chat":600.0,"xwinlm-7b-v0.1":727.0,"Mixtral-8x22B-Instruct-v0.1":628.0,"vicuna-13b-v1.5":753.0,"dbrx-instruct":657.0,"zephyr-7b-alpha":745.0,"tulu-2-dpo-13b":728.0,"Qwen1.5-7B-Chat":721.0,"Together-MoA-Lite":347.0,"cut-13b":721.0,"Meta-Llama-3-70B-Instruct":537.0,"vicuna-13b-v1.3":751.0,"claude-instant-1.2":682.0,"airoboros-65b":735.0,"openbuddy-llama2-13b-v11.1":761.0,"phi-2":785.0,"Together-MoA":314.0,"mistral-large-2402":638.0,"openbuddy-llama-30b-v7.1":755.0,"TempNet-LLaMA2-Chat-70B-v0.1":691.0,"pairrm-tulu-2-13b":694.0,"recycled-wizardlm-7b-v2.0":755.0,"Storm-7B-best-of-64":286.0,"vicuna-7b":775.0,"claude-3-sonnet-20240229":608.0,"Mistral-7B-Instruct-v0.2":691.0,"Samba-CoE-v0.1":680.0,"claude":676.0,"Nanbeige2-8B-Chat":480.0,"REBEL-Llama-3-8B-Instruct":537.0,"chatglm2-6b":781.0,"gpt-4o-2024-05-13":369.0,"gpt4_1106_preview_verbose":268.0,"TempNet-LLaMA2-Chat-13B-v0.1":749.0,"text_davinci_001":777.0,"Mixtral-8x7B-Instruct-v0.1_verbose":609.0,"baize-v2-7b":779.0,"phi-2-dpo":748.0,"alpaca-farm-ppo-human":770.0,"Nanbeige2-16B-Chat":514.0,"gpt4_0613":684.0,"pythia-12b-mix-sft":786.0,"alpaca-7b-neft":783.0,"Qwen1.5-14B-Chat":664.0,"gpt-4-0125-preview":347.0,"guanaco-33b":768.0,"oasst-sft-llama-33b":764.0,"gpt4_0613_verbose":630.0,"llama-2-chat-7b-ev
ol70k-neft":748.0,"gpt35_turbo_instruct":735.0,"platolm-7b":759.0,"llama-2-13b-chat-hf":744.0,"Nanbeige-Plus-Chat-v0.1":347.0,"openchat-v2-13b":746.0,"mistral-orpo-beta":707.0,"Snorkel-Mistral-PairRM-DPO-best-of-16":533.0,"tulu-2-dpo-7b":740.0,"alpaca-7b_verbose":778.0,"OpenHermes-2.5-Mistral-7B":727.0,"claude-2.1_verbose":613.0,"ultralm-13b-v2.0":754.0,"deita-7b-v1.0":708.0,"minichat-1.5-3b":757.0,"Qwen-14B-Chat":742.0,"airoboros-33b":740.0,"alpaca-farm-ppo-sim-gpt4-20k":776.0,"ultralm-13b":765.0,"openbuddy-falcon-40b-v9":758.0,"openchat8192-13b":754.0,"wizardlm-13b":759.0,"vicuna-13b":759.0,"merlinite-7B-AOT":571.0,"gpt4_0314":627.0,"gpt4_0613_concise":729.0,"jina-chat":743.0,"Contextual-KTO-Mistral-PairRM":544.0,"xwinlm-13b-v0.1":672.0,"LMCocktail-10.7B-v1":700.0,"SPPO-Mistral7B-PairRM-ExPO":531.0,"Mixtral-8x7B-Instruct-v0.1_concise":700.0,"gpt4_1106_preview_concise":622.0,"Mistral-7B-ReMax-v0.1":683.0,"Llama-3-Instruct-8B-SimPO-ExPO":479.0,"dolphin-2.2.1-mistral-7b":734.0,"humpback-llama2-70b":727.0,"openpipe-moa-gpt-4-turbo-v1":283.0,"vicuna-7b-v1.5":767.0,"Starling-LM-7B-alpha":702.0,"falcon-40b-instruct":777.0,"Samba-CoE-v0.2-best-of-16":601.0,"opencoderplus-15b":750.0,"xwinlm-70b-v0.1":635.0,"wizardlm-13b-v1.2":720.0,"aligner-2b_qwen1.5-72b-chat":473.0,"internlm2-chat-7b-ExPO":595.0,"claude-2.1":688.0,"vicuna-7b-v1.3":771.0,"oasst-rlhf-llama-33b":759.0,"zephyr-7b-alpha-ExPO":725.0,"openchat-v3.1-13b":720.0,"SPPO-Llama-3-Instruct-8B-PairRM":494.0,"minotaur-13b":758.0,"tulu-2-dpo-13b-ExPO":679.0,"zephyr-7b-beta-ExPO":716.0,"tulu-2-dpo-7b-ExPO":714.0,"Llama-3-Instruct-8B-SimPO":485.0,"baize-v2-13b":770.0,"guanaco-7b":783.0,"ultralm-13b-v2.0-best-of-16":705.0,"claude-2.1_concise":730.0,"openchat-13b":746.0,"tulu-2-dpo-70b":683.0,"deepseek-llm-67b-chat":713.0,"humpback-llama-65b":734.0,"tulu-2-dpo-70b-ExPO":620.0,"TempNet-LLaMA2-Chat-7B-v0.1":765.0,"nous-hermes-13b":761.0,"gpt-3.5-turbo-0613":700.0,"alpaca-7b_concise":787.0,"baichuan-13b-chat":790.0,"claude-3-5-s
onnet-20240620":493.0,"gpt-3.5-turbo-1106":737.0,"minichat-3b":779.0,"Storm-7B":408.0,"oasst-sft-pythia-12b":790.0,"Conifer-7B-DPO":717.0,"Snorkel-Mistral-PairRM-DPO":572.0,"internlm2-chat-20b-ExPO":430.0,"Samba-CoE-v0.2":645.0,"gemini-pro":665.0,"pairrm-tulu-2-70b":665.0,"text_davinci_003":787.0,"gpt4":618.0,"Yi-34B-Chat":582.0,"Starling-LM-7B-beta-ExPO":580.0,"pairrm-Yi-34B-Chat":563.0,"gpt4_1106_preview":0.0,"evo-7b":689.0,"zephyr-7b-beta":725.0,"guanaco-13b":780.0,"alpaca-7b":785.0,"internlm2-chat-20b-ppo":632.0,"gemma-2b-it":782.0,"pairrm-zephyr-7b-beta":706.0,"evo-v2-7b":644.0,"causallm-14b":720.0,"SPPO-Mistral7B-PairRM":556.0,"gpt-3.5-turbo-1106_concise":744.0,"openbuddy-llama-65b-v8":738.0,"claude2-alpaca-13b":746.0,"Starling-LM-7B-alpha-ExPO":657.0,"openbuddy-falcon-7b-v6":778.0,"gemma-7b-it":754.0,"phi-2-sft":777.0,"gpt4_gamed":771.0,"llama-2-70b-chat-hf":700.0,"openbuddy-llama2-70b-v10.1":744.0,"wizardlm-70b":697.0,"ultralm-13b-best-of-16":723.0},"n_draws":{"gpt-3.5-turbo-0301":1.0,"gpt-3.5-turbo-1106_verbose":2.0,"vicuna-13b-v1.5-togetherai":5.0,"Qwen1.5-1.8B-Chat":3.0,"recycled-wizardlm-7b-v1.0":0.0,"aligner-2b_claude-3-opus-20240229":105.0,"Qwen1.5-110B-Chat":5.0,"claude-3-opus-20240229":3.0,"llama-2-7b-chat-hf":1.0,"mistral-medium":2.0,"vicuna-33b-v1.3":4.0,"cohere":0.0,"claude-2":1.0,"guanaco-65b":0.0,"Mixtral-8x7B-Instruct-v0.1":2.0,"openchat-v2-w-13b":2.0,"falcon-7b-instruct":2.0,"wizardlm-13b-v1.1":3.0,"Meta-Llama-3-8B-Instruct":3.0,"FsfairX-Zephyr-Chat-v0.1":3.0,"Infinity-Instruct-3M-0613-Mistral-7B":0.0,"Qwen1.5-72B-Chat":4.0,"xwinlm-7b-v0.1":1.0,"Mixtral-8x22B-Instruct-v0.1":3.0,"vicuna-13b-v1.5":4.0,"dbrx-instruct":1.0,"zephyr-7b-alpha":1.0,"tulu-2-dpo-13b":2.0,"Qwen1.5-7B-Chat":4.0,"Together-MoA-Lite":2.0,"cut-13b":1.0,"Meta-Llama-3-70B-Instruct":2.0,"vicuna-13b-v1.3":4.0,"claude-instant-1.2":3.0,"airoboros-65b":3.0,"openbuddy-llama2-13b-v11.1":2.0,"phi-2":3.0,"Together-MoA":1.0,"mistral-large-2402":1.0,"openbuddy-llama-30b-v7.1":3.0,"TempNet
-LLaMA2-Chat-70B-v0.1":2.0,"pairrm-tulu-2-13b":1.0,"recycled-wizardlm-7b-v2.0":0.0,"Storm-7B-best-of-64":0.0,"vicuna-7b":2.0,"claude-3-sonnet-20240229":4.0,"Mistral-7B-Instruct-v0.2":1.0,"Samba-CoE-v0.1":1.0,"claude":0.0,"Nanbeige2-8B-Chat":2.0,"REBEL-Llama-3-8B-Instruct":0.0,"chatglm2-6b":5.0,"gpt-4o-2024-05-13":7.0,"gpt4_1106_preview_verbose":12.0,"TempNet-LLaMA2-Chat-13B-v0.1":0.0,"text_davinci_001":3.0,"Mixtral-8x7B-Instruct-v0.1_verbose":2.0,"baize-v2-7b":0.0,"phi-2-dpo":0.0,"alpaca-farm-ppo-human":3.0,"Nanbeige2-16B-Chat":3.0,"gpt4_0613":4.0,"pythia-12b-mix-sft":0.0,"alpaca-7b-neft":0.0,"Qwen1.5-14B-Chat":4.0,"gpt-4-0125-preview":12.0,"guanaco-33b":0.0,"oasst-sft-llama-33b":5.0,"gpt4_0613_verbose":4.0,"llama-2-chat-7b-evol70k-neft":0.0,"gpt35_turbo_instruct":3.0,"platolm-7b":2.0,"llama-2-13b-chat-hf":1.0,"Nanbeige-Plus-Chat-v0.1":2.0,"openchat-v2-13b":3.0,"mistral-orpo-beta":3.0,"Snorkel-Mistral-PairRM-DPO-best-of-16":2.0,"tulu-2-dpo-7b":1.0,"alpaca-7b_verbose":2.0,"OpenHermes-2.5-Mistral-7B":3.0,"claude-2.1_verbose":1.0,"ultralm-13b-v2.0":0.0,"deita-7b-v1.0":1.0,"minichat-1.5-3b":0.0,"Qwen-14B-Chat":6.0,"airoboros-33b":1.0,"alpaca-farm-ppo-sim-gpt4-20k":3.0,"ultralm-13b":2.0,"openbuddy-falcon-40b-v9":2.0,"openchat8192-13b":0.0,"wizardlm-13b":4.0,"vicuna-13b":2.0,"merlinite-7B-AOT":0.0,"gpt4_0314":6.0,"gpt4_0613_concise":5.0,"jina-chat":3.0,"Contextual-KTO-Mistral-PairRM":1.0,"xwinlm-13b-v0.1":4.0,"LMCocktail-10.7B-v1":1.0,"SPPO-Mistral7B-PairRM-ExPO":0.0,"Mixtral-8x7B-Instruct-v0.1_concise":0.0,"gpt4_1106_preview_concise":11.0,"Mistral-7B-ReMax-v0.1":2.0,"Llama-3-Instruct-8B-SimPO-ExPO":1.0,"dolphin-2.2.1-mistral-7b":3.0,"humpback-llama2-70b":1.0,"openpipe-moa-gpt-4-turbo-v1":7.0,"vicuna-7b-v1.5":3.0,"Starling-LM-7B-alpha":1.0,"falcon-40b-instruct":1.0,"Samba-CoE-v0.2-best-of-16":3.0,"opencoderplus-15b":3.0,"xwinlm-70b-v0.1":4.0,"wizardlm-13b-v1.2":3.0,"aligner-2b_qwen1.5-72b-chat":152.0,"internlm2-chat-7b-ExPO":1.0,"claude-2.1":2.0,"vicuna-7b-v1.3":3.0,"oass
t-rlhf-llama-33b":2.0,"zephyr-7b-alpha-ExPO":1.0,"openchat-v3.1-13b":5.0,"SPPO-Llama-3-Instruct-8B-PairRM":1.0,"minotaur-13b":4.0,"tulu-2-dpo-13b-ExPO":5.0,"zephyr-7b-beta-ExPO":0.0,"tulu-2-dpo-7b-ExPO":0.0,"Llama-3-Instruct-8B-SimPO":1.0,"baize-v2-13b":3.0,"guanaco-7b":1.0,"ultralm-13b-v2.0-best-of-16":2.0,"claude-2.1_concise":3.0,"openchat-13b":1.0,"tulu-2-dpo-70b":3.0,"deepseek-llm-67b-chat":2.0,"humpback-llama-65b":1.0,"tulu-2-dpo-70b-ExPO":1.0,"TempNet-LLaMA2-Chat-7B-v0.1":1.0,"nous-hermes-13b":1.0,"gpt-3.5-turbo-0613":6.0,"alpaca-7b_concise":2.0,"baichuan-13b-chat":1.0,"claude-3-5-sonnet-20240620":0.0,"gpt-3.5-turbo-1106":4.0,"minichat-3b":4.0,"Storm-7B":0.0,"oasst-sft-pythia-12b":2.0,"Conifer-7B-DPO":1.0,"Snorkel-Mistral-PairRM-DPO":1.0,"internlm2-chat-20b-ExPO":0.0,"Samba-CoE-v0.2":1.0,"gemini-pro":5.0,"pairrm-tulu-2-70b":0.0,"text_davinci_003":4.0,"gpt4":8.0,"Yi-34B-Chat":4.0,"Starling-LM-7B-beta-ExPO":0.0,"pairrm-Yi-34B-Chat":3.0,"gpt4_1106_preview":805.0,"evo-7b":4.0,"zephyr-7b-beta":2.0,"guanaco-13b":3.0,"alpaca-7b":3.0,"internlm2-chat-20b-ppo":3.0,"gemma-2b-it":0.0,"pairrm-zephyr-7b-beta":1.0,"evo-v2-7b":3.0,"causallm-14b":4.0,"SPPO-Mistral7B-PairRM":0.0,"gpt-3.5-turbo-1106_concise":4.0,"openbuddy-llama-65b-v8":3.0,"claude2-alpaca-13b":0.0,"Starling-LM-7B-alpha-ExPO":0.0,"openbuddy-falcon-7b-v6":0.0,"gemma-7b-it":1.0,"phi-2-sft":0.0,"gpt4_gamed":2.0,"llama-2-70b-chat-hf":0.0,"openbuddy-llama2-70b-v10.1":4.0,"wizardlm-70b":2.0,"ultralm-13b-best-of-16":2.0},"n_total":{"gpt-3.5-turbo-0301":805.0,"gpt-3.5-turbo-1106_verbose":805.0,"vicuna-13b-v1.5-togetherai":805.0,"Qwen1.5-1.8B-Chat":804.0,"recycled-wizardlm-7b-v1.0":805.0,"aligner-2b_claude-3-opus-20240229":805.0,"Qwen1.5-110B-Chat":805.0,"claude-3-opus-20240229":805.0,"llama-2-7b-chat-hf":805.0,"mistral-medium":805.0,"vicuna-33b-v1.3":805.0,"cohere":805.0,"claude-2":805.0,"guanaco-65b":805.0,"Mixtral-8x7B-Instruct-v0.1":805.0,"openchat-v2-w-13b":805.0,"falcon-7b-instruct":805.0,"wizardlm-13b-v1.1":805.0,
"Meta-Llama-3-8B-Instruct":805.0,"FsfairX-Zephyr-Chat-v0.1":805.0,"Infinity-Instruct-3M-0613-Mistral-7B":805.0,"Qwen1.5-72B-Chat":805.0,"xwinlm-7b-v0.1":805.0,"Mixtral-8x22B-Instruct-v0.1":805.0,"vicuna-13b-v1.5":805.0,"dbrx-instruct":805.0,"zephyr-7b-alpha":805.0,"tulu-2-dpo-13b":805.0,"Qwen1.5-7B-Chat":805.0,"Together-MoA-Lite":805.0,"cut-13b":805.0,"Meta-Llama-3-70B-Instruct":805.0,"vicuna-13b-v1.3":805.0,"claude-instant-1.2":805.0,"airoboros-65b":805.0,"openbuddy-llama2-13b-v11.1":805.0,"phi-2":803.0,"Together-MoA":805.0,"mistral-large-2402":805.0,"openbuddy-llama-30b-v7.1":805.0,"TempNet-LLaMA2-Chat-70B-v0.1":804.0,"pairrm-tulu-2-13b":805.0,"recycled-wizardlm-7b-v2.0":805.0,"Storm-7B-best-of-64":805.0,"vicuna-7b":805.0,"claude-3-sonnet-20240229":805.0,"Mistral-7B-Instruct-v0.2":805.0,"Samba-CoE-v0.1":805.0,"claude":805.0,"Nanbeige2-8B-Chat":805.0,"REBEL-Llama-3-8B-Instruct":805.0,"chatglm2-6b":805.0,"gpt-4o-2024-05-13":805.0,"gpt4_1106_preview_verbose":805.0,"TempNet-LLaMA2-Chat-13B-v0.1":805.0,"text_davinci_001":803.0,"Mixtral-8x7B-Instruct-v0.1_verbose":805.0,"baize-v2-7b":805.0,"phi-2-dpo":805.0,"alpaca-farm-ppo-human":805.0,"Nanbeige2-16B-Chat":805.0,"gpt4_0613":805.0,"pythia-12b-mix-sft":805.0,"alpaca-7b-neft":805.0,"Qwen1.5-14B-Chat":805.0,"gpt-4-0125-preview":805.0,"guanaco-33b":805.0,"oasst-sft-llama-33b":805.0,"gpt4_0613_verbose":805.0,"llama-2-chat-7b-evol70k-neft":805.0,"gpt35_turbo_instruct":804.0,"platolm-7b":803.0,"llama-2-13b-chat-hf":805.0,"Nanbeige-Plus-Chat-v0.1":805.0,"openchat-v2-13b":805.0,"mistral-orpo-beta":805.0,"Snorkel-Mistral-PairRM-DPO-best-of-16":805.0,"tulu-2-dpo-7b":805.0,"alpaca-7b_verbose":802.0,"OpenHermes-2.5-Mistral-7B":805.0,"claude-2.1_verbose":805.0,"ultralm-13b-v2.0":805.0,"deita-7b-v1.0":805.0,"minichat-1.5-3b":805.0,"Qwen-14B-Chat":805.0,"airoboros-33b":805.0,"alpaca-farm-ppo-sim-gpt4-20k":805.0,"ultralm-13b":805.0,"openbuddy-falcon-40b-v9":805.0,"openchat8192-13b":805.0,"wizardlm-13b":805.0,"vicuna-13b":805.0,"merlinit
e-7B-AOT":805.0,"gpt4_0314":805.0,"gpt4_0613_concise":805.0,"jina-chat":805.0,"Contextual-KTO-Mistral-PairRM":805.0,"xwinlm-13b-v0.1":805.0,"LMCocktail-10.7B-v1":805.0,"SPPO-Mistral7B-PairRM-ExPO":805.0,"Mixtral-8x7B-Instruct-v0.1_concise":805.0,"gpt4_1106_preview_concise":805.0,"Mistral-7B-ReMax-v0.1":805.0,"Llama-3-Instruct-8B-SimPO-ExPO":805.0,"dolphin-2.2.1-mistral-7b":805.0,"humpback-llama2-70b":805.0,"openpipe-moa-gpt-4-turbo-v1":805.0,"vicuna-7b-v1.5":805.0,"Starling-LM-7B-alpha":805.0,"falcon-40b-instruct":805.0,"Samba-CoE-v0.2-best-of-16":805.0,"opencoderplus-15b":805.0,"xwinlm-70b-v0.1":805.0,"wizardlm-13b-v1.2":805.0,"aligner-2b_qwen1.5-72b-chat":805.0,"internlm2-chat-7b-ExPO":805.0,"claude-2.1":805.0,"vicuna-7b-v1.3":805.0,"oasst-rlhf-llama-33b":805.0,"zephyr-7b-alpha-ExPO":805.0,"openchat-v3.1-13b":805.0,"SPPO-Llama-3-Instruct-8B-PairRM":805.0,"minotaur-13b":804.0,"tulu-2-dpo-13b-ExPO":805.0,"zephyr-7b-beta-ExPO":805.0,"tulu-2-dpo-7b-ExPO":805.0,"Llama-3-Instruct-8B-SimPO":805.0,"baize-v2-13b":805.0,"guanaco-7b":805.0,"ultralm-13b-v2.0-best-of-16":805.0,"claude-2.1_concise":805.0,"openchat-13b":805.0,"tulu-2-dpo-70b":805.0,"deepseek-llm-67b-chat":805.0,"humpback-llama-65b":805.0,"tulu-2-dpo-70b-ExPO":805.0,"TempNet-LLaMA2-Chat-7B-v0.1":805.0,"nous-hermes-13b":805.0,"gpt-3.5-turbo-0613":805.0,"alpaca-7b_concise":804.0,"baichuan-13b-chat":805.0,"claude-3-5-sonnet-20240620":805.0,"gpt-3.5-turbo-1106":805.0,"minichat-3b":805.0,"Storm-7B":805.0,"oasst-sft-pythia-12b":805.0,"Conifer-7B-DPO":805.0,"Snorkel-Mistral-PairRM-DPO":804.0,"internlm2-chat-20b-ExPO":805.0,"Samba-CoE-v0.2":805.0,"gemini-pro":805.0,"pairrm-tulu-2-70b":805.0,"text_davinci_003":805.0,"gpt4":805.0,"Yi-34B-Chat":805.0,"Starling-LM-7B-beta-ExPO":805.0,"pairrm-Yi-34B-Chat":805.0,"gpt4_1106_preview":805.0,"evo-7b":805.0,"zephyr-7b-beta":805.0,"guanaco-13b":805.0,"alpaca-7b":805.0,"internlm2-chat-20b-ppo":805.0,"gemma-2b-it":805.0,"pairrm-zephyr-7b-beta":805.0,"evo-v2-7b":805.0,"causallm-14b":80
5.0,"SPPO-Mistral7B-PairRM":805.0,"gpt-3.5-turbo-1106_concise":805.0,"openbuddy-llama-65b-v8":805.0,"claude2-alpaca-13b":805.0,"Starling-LM-7B-alpha-ExPO":805.0,"openbuddy-falcon-7b-v6":805.0,"gemma-7b-it":805.0,"phi-2-sft":805.0,"gpt4_gamed":805.0,"llama-2-70b-chat-hf":804.0,"openbuddy-llama2-70b-v10.1":805.0,"wizardlm-70b":805.0,"ultralm-13b-best-of-16":805.0},"discrete_win_rate":{"gpt-3.5-turbo-0301":8.8819875776,"gpt-3.5-turbo-1106_verbose":11.801242236,"vicuna-13b-v1.5-togetherai":6.8944099379,"Qwen1.5-1.8B-Chat":3.5447761194,"recycled-wizardlm-7b-v1.0":6.5838509317,"aligner-2b_claude-3-opus-20240229":34.4720496894,"Qwen1.5-110B-Chat":31.9875776398,"claude-3-opus-20240229":27.8881987578,"llama-2-7b-chat-hf":4.7826086957,"mistral-medium":20.4968944099,"vicuna-33b-v1.3":11.4285714286,"cohere":11.9254658385,"claude-2":16.3354037267,"guanaco-65b":6.7080745342,"Mixtral-8x7B-Instruct-v0.1":16.8944099379,"openchat-v2-w-13b":8.4472049689,"falcon-7b-instruct":2.1118012422,"wizardlm-13b-v1.1":10.0,"Meta-Llama-3-8B-Instruct":22.049689441,"FsfairX-Zephyr-Chat-v0.1":35.5900621118,"Infinity-Instruct-3M-0613-Mistral-7B":14.6583850932,"Qwen1.5-72B-Chat":25.2173913043,"xwinlm-7b-v0.1":9.6273291925,"Mixtral-8x22B-Instruct-v0.1":21.801242236,"vicuna-13b-v1.5":6.2111801242,"dbrx-instruct":18.3229813665,"zephyr-7b-alpha":7.3913043478,"tulu-2-dpo-13b":9.4409937888,"Qwen1.5-7B-Chat":10.1863354037,"Together-MoA-Lite":56.7701863354,"cut-13b":10.3726708075,"Meta-Llama-3-70B-Instruct":33.1677018634,"vicuna-13b-v1.3":6.4596273292,"claude-instant-1.2":15.0931677019,"airoboros-65b":8.5093167702,"openbuddy-llama2-13b-v11.1":5.3416149068,"phi-2":2.0547945205,"Together-MoA":60.9316770186,"mistral-large-2402":20.6832298137,"openbuddy-llama-30b-v7.1":6.0248447205,"TempNet-LLaMA2-Chat-70B-v0.1":13.9303482587,"pairrm-tulu-2-13b":13.7267080745,"recycled-wizardlm-7b-v2.0":6.2111801242,"Storm-7B-best-of-64":64.4720496894,"vicuna-7b":3.602484472,"claude-3-sonnet-20240229":24.2236024845,"Mistral-7B-Ins
truct-v0.2":14.099378882,"Samba-CoE-v0.1":15.4658385093,"claude":16.0248447205,"Nanbeige2-8B-Chat":40.248447205,"REBEL-Llama-3-8B-Instruct":33.2919254658,"chatglm2-6b":2.6708074534,"gpt-4o-2024-05-13":53.7267080745,"gpt4_1106_preview_verbose":65.9627329193,"TempNet-LLaMA2-Chat-13B-v0.1":6.9565217391,"text_davinci_001":3.0510585305,"Mixtral-8x7B-Instruct-v0.1_verbose":24.2236024845,"baize-v2-7b":3.2298136646,"phi-2-dpo":7.0807453416,"alpaca-farm-ppo-human":4.1614906832,"Nanbeige2-16B-Chat":35.9627329193,"gpt4_0613":14.7826086957,"pythia-12b-mix-sft":2.3602484472,"alpaca-7b-neft":2.7329192547,"Qwen1.5-14B-Chat":17.2670807453,"gpt-4-0125-preview":56.149068323,"guanaco-33b":4.5962732919,"oasst-sft-llama-33b":4.7826086957,"gpt4_0613_verbose":21.4906832298,"llama-2-chat-7b-evol70k-neft":7.0807453416,"gpt35_turbo_instruct":8.3955223881,"platolm-7b":5.3549190535,"llama-2-13b-chat-hf":7.5155279503,"Nanbeige-Plus-Chat-v0.1":56.7701863354,"openchat-v2-13b":7.1428571429,"mistral-orpo-beta":11.9875776398,"Snorkel-Mistral-PairRM-DPO-best-of-16":33.6645962733,"tulu-2-dpo-7b":8.0124223602,"alpaca-7b_verbose":2.8678304239,"OpenHermes-2.5-Mistral-7B":9.5031055901,"claude-2.1_verbose":23.7888198758,"ultralm-13b-v2.0":6.3354037267,"deita-7b-v1.0":11.9875776398,"minichat-1.5-3b":5.9627329193,"Qwen-14B-Chat":7.4534161491,"airoboros-33b":8.0124223602,"alpaca-farm-ppo-sim-gpt4-20k":3.4161490683,"ultralm-13b":4.8447204969,"openbuddy-falcon-40b-v9":5.7142857143,"openchat8192-13b":6.3354037267,"wizardlm-13b":5.4658385093,"vicuna-13b":5.5900621118,"merlinite-7B-AOT":29.0683229814,"gpt4_0314":21.7391304348,"gpt4_0613_concise":9.1304347826,"jina-chat":7.5155279503,"Contextual-KTO-Mistral-PairRM":32.3602484472,"xwinlm-13b-v0.1":16.2732919255,"LMCocktail-10.7B-v1":12.9813664596,"SPPO-Mistral7B-PairRM-ExPO":34.0372670807,"Mixtral-8x7B-Instruct-v0.1_concise":13.0434782609,"gpt4_1106_preview_concise":22.049689441,"Mistral-7B-ReMax-v0.1":15.0310559006,"Llama-3-Instruct-8B-SimPO-ExPO":40.4347826087,"do
lphin-2.2.1-mistral-7b":8.6335403727,"humpback-llama2-70b":9.6273291925,"openpipe-moa-gpt-4-turbo-v1":64.4099378882,"vicuna-7b-v1.5":4.5341614907,"Starling-LM-7B-alpha":12.7329192547,"falcon-40b-instruct":3.4161490683,"Samba-CoE-v0.2-best-of-16":25.1552795031,"opencoderplus-15b":6.6459627329,"xwinlm-70b-v0.1":20.8695652174,"wizardlm-13b-v1.2":10.3726708075,"aligner-2b_qwen1.5-72b-chat":31.801242236,"internlm2-chat-7b-ExPO":26.0248447205,"claude-2.1":14.4099378882,"vicuna-7b-v1.3":4.0372670807,"oasst-rlhf-llama-33b":5.5900621118,"zephyr-7b-alpha-ExPO":9.8757763975,"openchat-v3.1-13b":10.248447205,"SPPO-Llama-3-Instruct-8B-PairRM":38.5714285714,"minotaur-13b":5.4726368159,"tulu-2-dpo-13b-ExPO":15.3416149068,"zephyr-7b-beta-ExPO":11.0559006211,"tulu-2-dpo-7b-ExPO":11.3043478261,"Llama-3-Instruct-8B-SimPO":39.6894409938,"baize-v2-13b":4.1614906832,"guanaco-7b":2.6708074534,"ultralm-13b-v2.0-best-of-16":12.298136646,"claude-2.1_concise":9.1304347826,"openchat-13b":7.2670807453,"tulu-2-dpo-70b":14.9689440994,"deepseek-llm-67b-chat":11.3043478261,"humpback-llama-65b":8.7577639752,"tulu-2-dpo-70b-ExPO":22.9192546584,"TempNet-LLaMA2-Chat-7B-v0.1":4.9068322981,"nous-hermes-13b":5.4037267081,"gpt-3.5-turbo-0613":12.6708074534,"alpaca-7b_concise":1.9900497512,"baichuan-13b-chat":1.801242236,"claude-3-5-sonnet-20240620":38.7577639752,"gpt-3.5-turbo-1106":8.198757764,"minichat-3b":2.9813664596,"Storm-7B":49.3167701863,"oasst-sft-pythia-12b":1.7391304348,"Conifer-7B-DPO":10.8695652174,"Snorkel-Mistral-PairRM-DPO":28.7935323383,"internlm2-chat-20b-ExPO":46.5838509317,"Samba-CoE-v0.2":19.8136645963,"gemini-pro":17.0807453416,"pairrm-tulu-2-70b":17.3913043478,"text_davinci_003":1.9875776398,"gpt4":22.7329192547,"Yi-34B-Chat":27.4534161491,"Starling-LM-7B-beta-ExPO":27.950310559,"pairrm-Yi-34B-Chat":29.8757763975,"gpt4_1106_preview":50.0,"evo-7b":14.1614906832,"zephyr-7b-beta":9.8136645963,"guanaco-13b":2.9192546584,"alpaca-7b":2.298136646,"internlm2-chat-20b-ppo":21.3043478261,"gemma
-2b-it":2.8571428571,"pairrm-zephyr-7b-beta":12.2360248447,"evo-v2-7b":19.8136645963,"causallm-14b":10.3105590062,"SPPO-Mistral7B-PairRM":30.9316770186,"gpt-3.5-turbo-1106_concise":7.3291925466,"openbuddy-llama-65b-v8":8.1366459627,"claude2-alpaca-13b":7.3291925466,"Starling-LM-7B-alpha-ExPO":18.3850931677,"openbuddy-falcon-7b-v6":3.3540372671,"gemma-7b-it":6.2732919255,"phi-2-sft":3.4782608696,"gpt4_gamed":4.099378882,"llama-2-70b-chat-hf":12.9353233831,"openbuddy-llama2-70b-v10.1":7.3291925466,"wizardlm-70b":13.2919254658,"ultralm-13b-best-of-16":10.0621118012},"length_controlled_winrate":{"gpt-3.5-turbo-0301":18.093241552,"gpt-3.5-turbo-1106_verbose":22.0009370217,"vicuna-13b-v1.5-togetherai":11.6853569655,"Qwen1.5-1.8B-Chat":2.5884988492,"recycled-wizardlm-7b-v1.0":6.901477322,"aligner-2b_claude-3-opus-20240229":41.8230717152,"Qwen1.5-110B-Chat":43.905552211,"claude-3-opus-20240229":40.5095080124,"llama-2-7b-chat-hf":5.3548212795,"mistral-medium":28.6143374017,"vicuna-33b-v1.3":17.5745753109,"cohere":10.8930208866,"claude-2":28.1551961416,"guanaco-65b":8.2529169916,"Mixtral-8x7B-Instruct-v0.1":23.6884826013,"openchat-v2-w-13b":12.030427771,"falcon-7b-instruct":4.0369375668,"wizardlm-13b-v1.1":13.9157205928,"Meta-Llama-3-8B-Instruct":22.9187846731,"FsfairX-Zephyr-Chat-v0.1":34.787447623,"Infinity-Instruct-3M-0613-Mistral-7B":25.5015577947,"Qwen1.5-72B-Chat":36.571754112,"xwinlm-7b-v0.1":10.8122056273,"Mixtral-8x22B-Instruct-v0.1":30.8788102941,"vicuna-13b-v1.5":10.4844382985,"dbrx-instruct":25.1853410397,"zephyr-7b-alpha":10.2897608887,"tulu-2-dpo-13b":11.5544794281,"Qwen1.5-7B-Chat":14.7484310443,"Together-MoA-Lite":59.1415240989,"cut-13b":12.1547817539,"Meta-Llama-3-70B-Instruct":34.4245971745,"vicuna-13b-v1.3":10.8431649437,"claude-instant-1.2":25.6122590254,"airoboros-65b":11.0076424064,"openbuddy-llama2-13b-v11.1":9.159089775,"phi-2":4.3986822709,"Together-MoA":65.3799697685,"mistral-large-2402":32.6520799853,"openbuddy-llama-30b-v7.1":10.2144949912,"TempNet
-LLaMA2-Chat-70B-v0.1":15.8311627784,"pairrm-tulu-2-13b":17.405203698,"recycled-wizardlm-7b-v2.0":7.5216099553,"Storm-7B-best-of-64":61.637895572,"vicuna-7b":6.2772177385,"claude-3-sonnet-20240229":34.8724743624,"Mistral-7B-Instruct-v0.2":17.111251846,"Samba-CoE-v0.1":22.8658373348,"claude":27.2895044437,"Nanbeige2-8B-Chat":25.2417704867,"REBEL-Llama-3-8B-Instruct":31.4699427971,"chatglm2-6b":4.3592829268,"gpt-4o-2024-05-13":57.4568288333,"gpt4_1106_preview_verbose":51.5750079797,"TempNet-LLaMA2-Chat-13B-v0.1":8.5783553109,"text_davinci_001":9.0257288521,"Mixtral-8x7B-Instruct-v0.1_verbose":23.2231207809,"baize-v2-7b":4.382564905,"phi-2-dpo":7.7708946203,"alpaca-farm-ppo-human":6.4186032949,"Nanbeige2-16B-Chat":40.5912863493,"gpt4_0613":30.1833223167,"pythia-12b-mix-sft":4.2213618614,"alpaca-7b-neft":3.5091458375,"Qwen1.5-14B-Chat":23.8966467702,"gpt-4-0125-preview":56.3562938462,"guanaco-33b":5.6900190909,"oasst-sft-llama-33b":9.8664121438,"gpt4_0613_verbose":33.8212668866,"llama-2-chat-7b-evol70k-neft":7.5330526555,"gpt35_turbo_instruct":17.7278010829,"platolm-7b":10.5434020728,"llama-2-13b-chat-hf":8.4360145489,"Nanbeige-Plus-Chat-v0.1":44.4596624034,"openchat-v2-13b":10.3996073385,"mistral-orpo-beta":14.7167494307,"Snorkel-Mistral-PairRM-DPO-best-of-16":29.9743216131,"tulu-2-dpo-7b":9.2002656115,"alpaca-7b_verbose":6.8163068164,"OpenHermes-2.5-Mistral-7B":16.2485776967,"claude-2.1_verbose":30.2911791666,"ultralm-13b-v2.0":9.1290184442,"deita-7b-v1.0":16.0590135397,"minichat-1.5-3b":7.7016328215,"Qwen-14B-Chat":12.3787417907,"airoboros-33b":10.7190026781,"alpaca-farm-ppo-sim-gpt4-20k":7.1218081016,"ultralm-13b":7.1081913613,"openbuddy-falcon-40b-v9":8.9889364779,"openchat8192-13b":7.8970617346,"wizardlm-13b":9.8281507688,"vicuna-13b":9.2220600237,"merlinite-7B-AOT":31.721885287,"gpt4_0314":35.3070612164,"gpt4_0613_concise":21.5779909145,"jina-chat":15.8660040495,"Contextual-KTO-Mistral-PairRM":29.7058089397,"xwinlm-13b-v0.1":17.9189378982,"LMCocktail-10.7B-v1":18
.9507103867,"SPPO-Mistral7B-PairRM-ExPO":31.9003876312,"Mixtral-8x7B-Instruct-v0.1_concise":22.9626094728,"gpt4_1106_preview_concise":41.8966015912,"Mistral-7B-ReMax-v0.1":20.5513677023,"Llama-3-Instruct-8B-SimPO-ExPO":45.807978034,"dolphin-2.2.1-mistral-7b":13.1214776504,"humpback-llama2-70b":16.2491642314,"openpipe-moa-gpt-4-turbo-v1":68.3786625033,"vicuna-7b-v1.5":7.6168927319,"Starling-LM-7B-alpha":14.6904710794,"falcon-40b-instruct":5.6075325447,"Samba-CoE-v0.2-best-of-16":31.5065442681,"opencoderplus-15b":8.1524101557,"xwinlm-70b-v0.1":24.6496860571,"wizardlm-13b-v1.2":14.4625906943,"aligner-2b_qwen1.5-72b-chat":36.7258688784,"internlm2-chat-7b-ExPO":22.6674802488,"claude-2.1":25.2519438861,"vicuna-7b-v1.3":7.1564609564,"oasst-rlhf-llama-33b":7.9709218373,"zephyr-7b-alpha-ExPO":13.6232522647,"openchat-v3.1-13b":14.5033879568,"SPPO-Llama-3-Instruct-8B-PairRM":38.5628066368,"minotaur-13b":11.4652513168,"tulu-2-dpo-13b-ExPO":17.6509979624,"zephyr-7b-beta-ExPO":14.0012119801,"tulu-2-dpo-7b-ExPO":11.6088057579,"Llama-3-Instruct-8B-SimPO":44.6804680926,"baize-v2-13b":7.012247205,"guanaco-7b":2.8711168131,"ultralm-13b-v2.0-best-of-16":14.1989875666,"claude-2.1_concise":18.2084579084,"openchat-13b":8.8060534912,"tulu-2-dpo-70b":21.2386100384,"deepseek-llm-67b-chat":17.8433840899,"humpback-llama-65b":12.7998599959,"tulu-2-dpo-70b-ExPO":25.7233081711,"TempNet-LLaMA2-Chat-7B-v0.1":5.7396138367,"nous-hermes-13b":9.7178634178,"gpt-3.5-turbo-0613":22.3525129805,"alpaca-7b_concise":4.4672516799,"baichuan-13b-chat":2.0621702536,"claude-3-5-sonnet-20240620":52.3667542714,"gpt-3.5-turbo-1106":19.300589035,"minichat-3b":5.7293328759,"Storm-7B":50.4080792281,"oasst-sft-pythia-12b":3.2701021145,"Conifer-7B-DPO":17.1124958828,"Snorkel-Mistral-PairRM-DPO":26.3914464573,"internlm2-chat-20b-ExPO":27.2257594808,"Samba-CoE-v0.2":27.6242673501,"gemini-pro":24.381776108,"pairrm-tulu-2-70b":21.4284039755,"text_davinci_003":4.5664105675,"gpt4":38.1280897444,"Yi-34B-Chat":27.1905478776,"Star
ling-LM-7B-beta-ExPO":26.4869564984,"pairrm-Yi-34B-Chat":28.8148408668,"gpt4_1106_preview":50.0,"evo-7b":16.4893860042,"zephyr-7b-beta":13.2031984931,"guanaco-13b":3.0037873296,"alpaca-7b":5.8754871633,"internlm2-chat-20b-ppo":18.7487394854,"gemma-2b-it":5.4374536204,"pairrm-zephyr-7b-beta":15.529867295,"evo-v2-7b":23.357705702,"causallm-14b":15.720325189,"SPPO-Mistral7B-PairRM":30.4941379652,"gpt-3.5-turbo-1106_concise":15.7695209839,"openbuddy-llama-65b-v8":12.4693562891,"claude2-alpaca-13b":11.4988982132,"Starling-LM-7B-alpha-ExPO":19.4741654606,"openbuddy-falcon-7b-v6":4.8261244822,"gemma-7b-it":10.4257604037,"phi-2-sft":5.8537876906,"gpt4_gamed":12.1887640576,"llama-2-70b-chat-hf":14.6896485884,"openbuddy-llama2-70b-v10.1":12.5721732723,"wizardlm-70b":17.5750607375,"ultralm-13b-best-of-16":9.8760888169},"lc_standard_error":{"gpt-3.5-turbo-0301":0.7864976807,"gpt-3.5-turbo-1106_verbose":0.8544953416,"vicuna-13b-v1.5-togetherai":0.6243797898,"Qwen1.5-1.8B-Chat":0.2021610274,"recycled-wizardlm-7b-v1.0":0.4105893841,"aligner-2b_claude-3-opus-20240229":0.7776876699,"Qwen1.5-110B-Chat":0.8945807936,"claude-3-opus-20240229":0.8837504763,"llama-2-7b-chat-hf":0.3326400931,"mistral-medium":0.9075464438,"vicuna-33b-v1.3":0.7099362877,"cohere":0.5206791146,"claude-2":0.8779084794,"guanaco-65b":0.46281361,"Mixtral-8x7B-Instruct-v0.1":0.9011105015,"openchat-v2-w-13b":0.5657607148,"falcon-7b-instruct":0.268726544,"wizardlm-13b-v1.1":0.6712555976,"Meta-Llama-3-8B-Instruct":0.849800882,"FsfairX-Zephyr-Chat-v0.1":0.7594505141,"Infinity-Instruct-3M-0613-Mistral-7B":0.7760697229,"Qwen1.5-72B-Chat":0.9357421321,"xwinlm-7b-v0.1":0.5519849159,"Mixtral-8x22B-Instruct-v0.1":0.9518125819,"vicuna-13b-v1.5":0.5980193852,"dbrx-instruct":0.8999456518,"zephyr-7b-alpha":0.5879820221,"tulu-2-dpo-13b":0.6494943093,"Qwen1.5-7B-Chat":0.6490365375,"Together-MoA-Lite":0.7580510219,"cut-13b":0.6383138465,"Meta-Llama-3-70B-Instruct":0.8691832384,"vicuna-13b-v1.3":0.6100962742,"claude-instant-1.2":0.8
7464248,"airoboros-65b":0.6004520879,"openbuddy-llama2-13b-v11.1":0.5636847159,"phi-2":0.1293627793,"Together-MoA":0.7392392837,"mistral-large-2402":0.9044632955,"openbuddy-llama-30b-v7.1":0.6099418552,"TempNet-LLaMA2-Chat-70B-v0.1":0.7195404924,"pairrm-tulu-2-13b":0.7958946232,"recycled-wizardlm-7b-v2.0":0.4355543699,"Storm-7B-best-of-64":0.6799412402,"vicuna-7b":0.3964740967,"claude-3-sonnet-20240229":0.949844689,"Mistral-7B-Instruct-v0.2":0.7875592102,"Samba-CoE-v0.1":0.7405123259,"claude":0.858614564,"Nanbeige2-8B-Chat":0.5909370499,"REBEL-Llama-3-8B-Instruct":0.8138922262,"chatglm2-6b":0.2913010016,"gpt-4o-2024-05-13":0.7774399385,"gpt4_1106_preview_verbose":0.8313707608,"TempNet-LLaMA2-Chat-13B-v0.1":0.4783538284,"text_davinci_001":0.2169278281,"Mixtral-8x7B-Instruct-v0.1_verbose":0.7975932103,"baize-v2-7b":0.3307775329,"phi-2-dpo":0.420015191,"alpaca-farm-ppo-human":0.4202234849,"Nanbeige2-16B-Chat":0.8504106275,"gpt4_0613":0.7874508454,"pythia-12b-mix-sft":0.2932467883,"alpaca-7b-neft":0.2516233369,"Qwen1.5-14B-Chat":0.7729838609,"gpt-4-0125-preview":0.7731843456,"guanaco-33b":0.3195322556,"oasst-sft-llama-33b":0.5204539206,"gpt4_0613_verbose":0.8842151461,"llama-2-chat-7b-evol70k-neft":0.4277221418,"gpt35_turbo_instruct":0.3748783811,"platolm-7b":0.3937696385,"llama-2-13b-chat-hf":0.5161956367,"Nanbeige-Plus-Chat-v0.1":0.7209678864,"openchat-v2-13b":0.5398936504,"mistral-orpo-beta":0.6895695724,"Snorkel-Mistral-PairRM-DPO-best-of-16":0.7464891533,"tulu-2-dpo-7b":0.5465634637,"alpaca-7b_verbose":0.2437107339,"OpenHermes-2.5-Mistral-7B":0.7206735233,"claude-2.1_verbose":0.6612722747,"ultralm-13b-v2.0":0.5248779977,"deita-7b-v1.0":0.7398615266,"minichat-1.5-3b":0.4364271175,"Qwen-14B-Chat":0.6714412819,"airoboros-33b":0.5566576337,"alpaca-farm-ppo-sim-gpt4-20k":0.456168214,"ultralm-13b":0.4337345632,"openbuddy-falcon-40b-v9":0.545771106,"openchat8192-13b":0.4356316711,"wizardlm-13b":0.5385026234,"vicuna-13b":0.5388256266,"merlinite-7B-AOT":0.8150560619,"gpt4_0
314":0.8997916758,"gpt4_0613_concise":0.7524372534,"jina-chat":0.6805565304,"Contextual-KTO-Mistral-PairRM":0.7122554396,"xwinlm-13b-v0.1":0.7513299972,"LMCocktail-10.7B-v1":0.8369176162,"SPPO-Mistral7B-PairRM-ExPO":0.7655500294,"Mixtral-8x7B-Instruct-v0.1_concise":0.8710401023,"gpt4_1106_preview_concise":0.7406558917,"Mistral-7B-ReMax-v0.1":0.807838924,"Llama-3-Instruct-8B-SimPO-ExPO":0.8703329817,"dolphin-2.2.1-mistral-7b":0.6251596825,"humpback-llama2-70b":0.6984941388,"openpipe-moa-gpt-4-turbo-v1":0.7309418615,"vicuna-7b-v1.5":0.4868743581,"Starling-LM-7B-alpha":0.658381614,"falcon-40b-instruct":0.3565968022,"Samba-CoE-v0.2-best-of-16":0.7338723477,"opencoderplus-15b":0.4567320517,"xwinlm-70b-v0.1":0.9059240217,"wizardlm-13b-v1.2":0.6741078562,"aligner-2b_qwen1.5-72b-chat":0.6787999003,"internlm2-chat-7b-ExPO":0.629923982,"claude-2.1":0.7515108894,"vicuna-7b-v1.3":0.4355620786,"oasst-rlhf-llama-33b":0.4061516205,"zephyr-7b-alpha-ExPO":0.7160268998,"openchat-v3.1-13b":0.6974328561,"SPPO-Llama-3-Instruct-8B-PairRM":0.8694594533,"minotaur-13b":0.368757115,"tulu-2-dpo-13b-ExPO":0.5166082438,"zephyr-7b-beta-ExPO":0.5303710259,"tulu-2-dpo-7b-ExPO":0.4355576278,"Llama-3-Instruct-8B-SimPO":0.8789917177,"baize-v2-13b":0.4685705196,"guanaco-7b":0.2018869696,"ultralm-13b-v2.0-best-of-16":0.6555243163,"claude-2.1_concise":0.6338526283,"openchat-13b":0.4470052867,"tulu-2-dpo-70b":0.8610574163,"deepseek-llm-67b-chat":0.7439504148,"humpback-llama-65b":0.6567402094,"tulu-2-dpo-70b-ExPO":0.4593179402,"TempNet-LLaMA2-Chat-7B-v0.1":0.3407340673,"nous-hermes-13b":0.5572824918,"gpt-3.5-turbo-0613":0.8045156377,"alpaca-7b_concise":0.2820018938,"baichuan-13b-chat":0.1525670221,"claude-3-5-sonnet-20240620":0.7976856335,"gpt-3.5-turbo-1106":0.7682908268,"minichat-3b":0.3565910812,"Storm-7B":0.7188927916,"oasst-sft-pythia-12b":0.2064079261,"Conifer-7B-DPO":0.7602280224,"Snorkel-Mistral-PairRM-DPO":0.6739888325,"internlm2-chat-20b-ExPO":0.5877331102,"Samba-CoE-v0.2":0.6875926799,"gemini-p
ro":0.8158961767,"pairrm-tulu-2-70b":0.8359305763,"text_davinci_003":0.3109387936,"gpt4":0.9069675584,"Yi-34B-Chat":0.7470363322,"Starling-LM-7B-beta-ExPO":0.7549415682,"pairrm-Yi-34B-Chat":0.8310750322,"gpt4_1106_preview":0.0,"evo-7b":0.502828858,"zephyr-7b-beta":0.6521227924,"guanaco-13b":0.2069624951,"alpaca-7b":0.3755224975,"internlm2-chat-20b-ppo":0.7522583795,"gemma-2b-it":0.3236386036,"pairrm-zephyr-7b-beta":0.7455357676,"evo-v2-7b":0.6353106561,"causallm-14b":0.7103430968,"SPPO-Mistral7B-PairRM":0.8458266977,"gpt-3.5-turbo-1106_concise":0.7318554971,"openbuddy-llama-65b-v8":0.6457736922,"claude2-alpaca-13b":0.6646440129,"Starling-LM-7B-alpha-ExPO":0.4701002864,"openbuddy-falcon-7b-v6":0.3350353845,"gemma-7b-it":0.4807679381,"phi-2-sft":0.3931141644,"gpt4_gamed":0.3987510662,"llama-2-70b-chat-hf":0.6625475757,"openbuddy-llama2-70b-v10.1":0.6740810303,"wizardlm-70b":0.7233004015,"ultralm-13b-best-of-16":0.4814959281},"num_tokens_mean":{"gpt-3.5-turbo-0301":179,"gpt-3.5-turbo-1106_verbose":214,"vicuna-13b-v1.5-togetherai":231,"Qwen1.5-1.8B-Chat":586,"recycled-wizardlm-7b-v1.0":299,"aligner-2b_claude-3-opus-20240229":354,"Qwen1.5-110B-Chat":346,"claude-3-opus-20240229":292,"llama-2-7b-chat-hf":302,"mistral-medium":327,"vicuna-33b-v1.3":315,"cohere":396,"claude-2":227,"guanaco-65b":262,"Mixtral-8x7B-Instruct-v0.1":311,"openchat-v2-w-13b":335,"falcon-7b-instruct":106,"wizardlm-13b-v1.1":325,"Meta-Llama-3-8B-Instruct":412,"FsfairX-Zephyr-Chat-v0.1":505,"Infinity-Instruct-3M-0613-Mistral-7B":106,"Qwen1.5-72B-Chat":342,"xwinlm-7b-v0.1":407,"Mixtral-8x22B-Instruct-v0.1":307,"vicuna-13b-v1.5":229,"dbrx-instruct":310,"zephyr-7b-alpha":283,"tulu-2-dpo-13b":370,"Qwen1.5-7B-Chat":342,"Together-MoA-Lite":420,"cut-13b":348,"Meta-Llama-3-70B-Instruct":416,"vicuna-13b-v1.3":244,"claude-instant-1.2":233,"airoboros-65b":312,"openbuddy-llama2-13b-v11.1":228,"phi-2":142,"Together-MoA":386,"mistral-large-2402":290,"openbuddy-llama-30b-v7.1":208,"TempNet-LLaMA2-Chat-70B-v0.1":384,"p
airrm-tulu-2-13b":313,"recycled-wizardlm-7b-v2.0":317,"Storm-7B-best-of-64":486,"vicuna-7b":224,"claude-3-sonnet-20240229":297,"Mistral-7B-Instruct-v0.2":362,"Samba-CoE-v0.1":258,"claude":229,"Nanbeige2-8B-Chat":561,"REBEL-Llama-3-8B-Instruct":509,"chatglm2-6b":234,"gpt-4o-2024-05-13":406,"gpt4_1106_preview_verbose":505,"TempNet-LLaMA2-Chat-13B-v0.1":320,"text_davinci_001":67,"Mixtral-8x7B-Instruct-v0.1_verbose":438,"baize-v2-7b":247,"phi-2-dpo":356,"alpaca-farm-ppo-human":169,"Nanbeige2-16B-Chat":394,"gpt4_0613":245,"pythia-12b-mix-sft":197,"alpaca-7b-neft":216,"Qwen1.5-14B-Chat":348,"gpt-4-0125-preview":417,"guanaco-33b":283,"oasst-sft-llama-33b":161,"gpt4_0613_verbose":313,"llama-2-chat-7b-evol70k-neft":330,"gpt35_turbo_instruct":219,"platolm-7b":272,"llama-2-13b-chat-hf":316,"Nanbeige-Plus-Chat-v0.1":529,"openchat-v2-13b":330,"mistral-orpo-beta":344,"Snorkel-Mistral-PairRM-DPO-best-of-16":526,"tulu-2-dpo-7b":399,"alpaca-7b_verbose":119,"OpenHermes-2.5-Mistral-7B":237,"claude-2.1_verbose":292,"ultralm-13b-v2.0":299,"deita-7b-v1.0":303,"minichat-1.5-3b":322,"Qwen-14B-Chat":223,"airoboros-33b":315,"alpaca-farm-ppo-sim-gpt4-20k":104,"ultralm-13b":233,"openbuddy-falcon-40b-v9":238,"openchat8192-13b":359,"wizardlm-13b":211,"vicuna-13b":224,"merlinite-7B-AOT":385,"gpt4_0314":289,"gpt4_0613_concise":138,"jina-chat":138,"Contextual-KTO-Mistral-PairRM":506,"xwinlm-13b-v0.1":400,"LMCocktail-10.7B-v1":244,"SPPO-Mistral7B-PairRM-ExPO":459,"Mixtral-8x7B-Instruct-v0.1_concise":195,"gpt4_1106_preview_concise":244,"Mistral-7B-ReMax-v0.1":314,"Llama-3-Instruct-8B-SimPO-ExPO":370,"dolphin-2.2.1-mistral-7b":243,"humpback-llama2-70b":234,"openpipe-moa-gpt-4-turbo-v1":377,"vicuna-7b-v1.5":234,"Starling-LM-7B-alpha":392,"falcon-40b-instruct":148,"Samba-CoE-v0.2-best-of-16":312,"opencoderplus-15b":354,"xwinlm-70b-v0.1":381,"wizardlm-13b-v1.2":354,"aligner-2b_qwen1.5-72b-chat":383,"internlm2-chat-7b-ExPO":494,"claude-2.1":228,"vicuna-7b-v1.3":238,"oasst-rlhf-llama-33b":243,"zephyr-7b-al
pha-ExPO":267,"openchat-v3.1-13b":318,"SPPO-Llama-3-Instruct-8B-PairRM":443,"minotaur-13b":186,"tulu-2-dpo-13b-ExPO":350,"zephyr-7b-beta-ExPO":297,"tulu-2-dpo-7b-ExPO":373,"Llama-3-Instruct-8B-SimPO":385,"baize-v2-13b":201,"guanaco-7b":311,"ultralm-13b-v2.0-best-of-16":354,"claude-2.1_concise":119,"openchat-13b":349,"tulu-2-dpo-70b":313,"deepseek-llm-67b-chat":250,"humpback-llama-65b":254,"tulu-2-dpo-70b-ExPO":368,"TempNet-LLaMA2-Chat-7B-v0.1":308,"nous-hermes-13b":187,"gpt-3.5-turbo-0613":270,"alpaca-7b_concise":77,"baichuan-13b-chat":310,"claude-3-5-sonnet-20240620":315,"gpt-3.5-turbo-1106":164,"minichat-3b":186,"Storm-7B":429,"oasst-sft-pythia-12b":158,"Conifer-7B-DPO":266,"Snorkel-Mistral-PairRM-DPO":566,"internlm2-chat-20b-ExPO":693,"Samba-CoE-v0.2":292,"gemini-pro":313,"pairrm-tulu-2-70b":405,"text_davinci_003":69,"gpt4":287,"Yi-34B-Chat":447,"Starling-LM-7B-beta-ExPO":447,"pairrm-Yi-34B-Chat":464,"gpt4_1106_preview":431,"evo-7b":382,"zephyr-7b-beta":311,"guanaco-13b":416,"alpaca-7b":85,"internlm2-chat-20b-ppo":513,"gemma-2b-it":222,"pairrm-zephyr-7b-beta":315,"evo-v2-7b":375,"causallm-14b":302,"SPPO-Mistral7B-PairRM":431,"gpt-3.5-turbo-1106_concise":90,"openbuddy-llama-65b-v8":249,"claude2-alpaca-13b":243,"Starling-LM-7B-alpha-ExPO":376,"openbuddy-falcon-7b-v6":248,"gemma-7b-it":241,"phi-2-sft":225,"gpt4_gamed":15,"llama-2-70b-chat-hf":377,"openbuddy-llama2-70b-v10.1":230,"wizardlm-70b":332,"ultralm-13b-best-of-16":400},"num_tokens_std":{"gpt-3.5-turbo-0301":150,"gpt-3.5-turbo-1106_verbose":144,"vicuna-13b-v1.5-togetherai":163,"Qwen1.5-1.8B-Chat":611,"recycled-wizardlm-7b-v1.0":186,"aligner-2b_claude-3-opus-20240229":206,"Qwen1.5-110B-Chat":211,"claude-3-opus-20240229":156,"llama-2-7b-chat-hf":170,"mistral-medium":305,"vicuna-33b-v1.3":198,"cohere":225,"claude-2":114,"guanaco-65b":212,"Mixtral-8x7B-Instruct-v0.1":219,"openchat-v2-w-13b":210,"falcon-7b-instruct":177,"wizardlm-13b-v1.1":213,"Meta-Llama-3-8B-Instruct":231,"FsfairX-Zephyr-Chat-v0.1":311,"Infinity
-Instruct-3M-0613-Mistral-7B":102,"Qwen1.5-72B-Chat":346,"xwinlm-7b-v0.1":308,"Mixtral-8x22B-Instruct-v0.1":253,"vicuna-13b-v1.5":164,"dbrx-instruct":193,"zephyr-7b-alpha":272,"tulu-2-dpo-13b":592,"Qwen1.5-7B-Chat":199,"Together-MoA-Lite":233,"cut-13b":208,"Meta-Llama-3-70B-Instruct":238,"vicuna-13b-v1.3":167,"claude-instant-1.2":132,"airoboros-65b":372,"openbuddy-llama2-13b-v11.1":175,"phi-2":312,"Together-MoA":217,"mistral-large-2402":191,"openbuddy-llama-30b-v7.1":162,"TempNet-LLaMA2-Chat-70B-v0.1":258,"pairrm-tulu-2-13b":194,"recycled-wizardlm-7b-v2.0":173,"Storm-7B-best-of-64":287,"vicuna-7b":161,"claude-3-sonnet-20240229":178,"Mistral-7B-Instruct-v0.2":360,"Samba-CoE-v0.1":194,"claude":123,"Nanbeige2-8B-Chat":239,"REBEL-Llama-3-8B-Instruct":463,"chatglm2-6b":352,"gpt-4o-2024-05-13":273,"gpt4_1106_preview_verbose":261,"TempNet-LLaMA2-Chat-13B-v0.1":178,"text_davinci_001":78,"Mixtral-8x7B-Instruct-v0.1_verbose":225,"baize-v2-7b":183,"phi-2-dpo":188,"alpaca-farm-ppo-human":171,"Nanbeige2-16B-Chat":214,"gpt4_0613":164,"pythia-12b-mix-sft":181,"alpaca-7b-neft":97,"Qwen1.5-14B-Chat":224,"gpt-4-0125-preview":242,"guanaco-33b":236,"oasst-sft-llama-33b":145,"gpt4_0613_verbose":166,"llama-2-chat-7b-evol70k-neft":143,"gpt35_turbo_instruct":241,"platolm-7b":187,"llama-2-13b-chat-hf":175,"Nanbeige-Plus-Chat-v0.1":216,"openchat-v2-13b":205,"mistral-orpo-beta":205,"Snorkel-Mistral-PairRM-DPO-best-of-16":255,"tulu-2-dpo-7b":738,"alpaca-7b_verbose":111,"OpenHermes-2.5-Mistral-7B":225,"claude-2.1_verbose":120,"ultralm-13b-v2.0":188,"deita-7b-v1.0":221,"minichat-1.5-3b":215,"Qwen-14B-Chat":227,"airoboros-33b":324,"alpaca-farm-ppo-sim-gpt4-20k":68,"ultralm-13b":158,"openbuddy-falcon-40b-v9":198,"openchat8192-13b":257,"wizardlm-13b":155,"vicuna-13b":153,"merlinite-7B-AOT":180,"gpt4_0314":212,"gpt4_0613_concise":110,"jina-chat":101,"Contextual-KTO-Mistral-PairRM":272,"xwinlm-13b-v0.1":268,"LMCocktail-10.7B-v1":168,"SPPO-Mistral7B-PairRM-ExPO":193,"Mixtral-8x7B-Instruct-v0.1_concise
":185,"gpt4_1106_preview_concise":184,"Mistral-7B-ReMax-v0.1":188,"Llama-3-Instruct-8B-SimPO-ExPO":191,"dolphin-2.2.1-mistral-7b":218,"humpback-llama2-70b":214,"openpipe-moa-gpt-4-turbo-v1":193,"vicuna-7b-v1.5":165,"Starling-LM-7B-alpha":250,"falcon-40b-instruct":182,"Samba-CoE-v0.2-best-of-16":199,"opencoderplus-15b":272,"xwinlm-70b-v0.1":267,"wizardlm-13b-v1.2":308,"aligner-2b_qwen1.5-72b-chat":210,"internlm2-chat-7b-ExPO":232,"claude-2.1":114,"vicuna-7b-v1.3":159,"oasst-rlhf-llama-33b":257,"zephyr-7b-alpha-ExPO":204,"openchat-v3.1-13b":224,"SPPO-Llama-3-Instruct-8B-PairRM":208,"minotaur-13b":164,"tulu-2-dpo-13b-ExPO":203,"zephyr-7b-beta-ExPO":217,"tulu-2-dpo-7b-ExPO":217,"Llama-3-Instruct-8B-SimPO":200,"baize-v2-13b":149,"guanaco-7b":292,"ultralm-13b-v2.0-best-of-16":169,"claude-2.1_concise":88,"openchat-13b":241,"tulu-2-dpo-70b":281,"deepseek-llm-67b-chat":206,"humpback-llama-65b":156,"tulu-2-dpo-70b-ExPO":203,"TempNet-LLaMA2-Chat-7B-v0.1":178,"nous-hermes-13b":201,"gpt-3.5-turbo-0613":194,"alpaca-7b_concise":70,"baichuan-13b-chat":400,"claude-3-5-sonnet-20240620":200,"gpt-3.5-turbo-1106":134,"minichat-3b":156,"Storm-7B":188,"oasst-sft-pythia-12b":186,"Conifer-7B-DPO":163,"Snorkel-Mistral-PairRM-DPO":342,"internlm2-chat-20b-ExPO":283,"Samba-CoE-v0.2":196,"gemini-pro":227,"pairrm-tulu-2-70b":438,"text_davinci_003":89,"gpt4":203,"Yi-34B-Chat":235,"Starling-LM-7B-beta-ExPO":143,"pairrm-Yi-34B-Chat":276,"gpt4_1106_preview":249,"evo-7b":241,"zephyr-7b-beta":238,"guanaco-13b":402,"alpaca-7b":69,"internlm2-chat-20b-ppo":403,"gemma-2b-it":168,"pairrm-zephyr-7b-beta":250,"evo-v2-7b":235,"causallm-14b":234,"SPPO-Mistral7B-PairRM":198,"gpt-3.5-turbo-1106_concise":80,"openbuddy-llama-65b-v8":152,"claude2-alpaca-13b":176,"Starling-LM-7B-alpha-ExPO":220,"openbuddy-falcon-7b-v6":189,"gemma-7b-it":160,"phi-2-sft":164,"gpt4_gamed":81,"llama-2-70b-chat-hf":222,"openbuddy-llama2-70b-v10.1":161,"wizardlm-70b":231,"ultralm-13b-best-of-16":164}}
prep_data.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import tiktoken
4
+ from alpaca_eval import utils, metrics, annotators, constants, analyze, plotting, main
5
+ from alpaca_eval.metrics.glm_winrate import get_length_controlled_winrate
6
+ import os
7
+ import pandas as pd
8
+ import json
9
+
10
+
11
# Root directory holding one results sub-directory per evaluated model.
TOP_LEVEL_DIRECTORY = "submodules/alpaca_eval/results"

# Model name -> dataframe parsed from that model's model_outputs.json.
model_dataframes_outputs = {}

# Walk the entries of the results directory; anything that is not a directory
# or that lacks a model_outputs.json file is skipped.
for model_name in os.listdir(TOP_LEVEL_DIRECTORY):
    model_dir = os.path.join(TOP_LEVEL_DIRECTORY, model_name)
    if not os.path.isdir(model_dir):
        continue
    model_output_file = os.path.join(model_dir, "model_outputs.json")
    if not os.path.exists(model_output_file):
        continue
    model_dataframes_outputs[model_name] = pd.read_json(model_output_file)
24
+
25
+
26
def get_num_words(text):
    """Return the number of whitespace-separated words in ``text``.

    The value is coerced with ``str()`` before splitting, mirroring what
    get_num_tokens does, so a non-string model output (for example a number
    or ``None`` produced while parsing JSON) is counted instead of raising
    ``AttributeError``.

    Args:
        text: The model output to measure; normally a string.

    Returns:
        The count of whitespace-delimited chunks in ``str(text)``.
    """
    return len(str(text).split())
28
+
29
+
30
# Tokenizer encoding, created once at module level and reused by
# get_num_tokens for every output.
# NOTE(review): "cl100k_base" is presumably chosen to match the tokenizer of
# the GPT-4 family annotator - confirm.
ENCODING = tiktoken.get_encoding("cl100k_base")
31
+
32
+
33
def get_num_tokens(text):
    """Return the number of tokens in ``text`` according to ``ENCODING``.

    The value is coerced with ``str()`` first, so non-string outputs are
    measured rather than rejected.

    Args:
        text: The model output to measure; normally a string.

    Returns:
        The length of the token list produced by ``ENCODING.encode``.
    """
    # By default tiktoken raises ValueError when the text contains a
    # special-token string such as "<|endoftext|>", which model outputs can
    # legitimately include. Passing disallowed_special=() encodes such strings
    # as ordinary text, so no debugger trap / silent None return is needed.
    return len(ENCODING.encode(str(text), disallowed_special=()))
39
+
40
+
41
# Per-model length statistics: model name -> {"mean": ..., "std": ...},
# measured in words and in tokens respectively.
model_name_to_num_words = {}
model_name_to_num_tokens = {}

for model_name, model_dataframe in model_dataframes_outputs.items():
    print(f"model_name_to_num_words for {model_name}")

    # Annotate every output row with its word and token counts (the columns
    # are stored on the dataframe itself).
    outputs = model_dataframe["output"]
    model_dataframe["output_num_words"] = outputs.apply(get_num_words)
    model_dataframe["output_num_tokens"] = outputs.apply(get_num_tokens)

    # Summarise each count column; mean and std are truncated to integers.
    word_counts = model_dataframe["output_num_words"]
    token_counts = model_dataframe["output_num_tokens"]
    model_name_to_num_words[model_name] = dict(
        mean=int(word_counts.mean()),
        std=int(word_counts.std()),
    )
    model_name_to_num_tokens[model_name] = dict(
        mean=int(token_counts.mean()),
        std=int(token_counts.std()),
    )

# Transpose so that each model is a row and "mean"/"std" are the columns.
num_words_df = pd.DataFrame(model_name_to_num_words).T
num_tokens_df = pd.DataFrame(model_name_to_num_tokens).T
60
+
61
# Model name -> result of get_length_controlled_winrate on that model's
# weighted_alpaca_eval_gpt4_turbo annotations.
# NOTE(review): the result is presumably a dict of metrics, since it is fed
# to pd.DataFrame and transposed below - confirm against alpaca_eval.
model_name_to_win_rate = {}

for model_name in os.listdir(TOP_LEVEL_DIRECTORY):
    # The progress line is printed for every directory entry, including ones
    # that are skipped below.
    print(f"model_name_to_win_rate for {model_name}")
    model_dir = os.path.join(TOP_LEVEL_DIRECTORY, model_name)
    if not os.path.isdir(model_dir):
        continue
    annotations_file = os.path.join(
        model_dir, "weighted_alpaca_eval_gpt4_turbo", "annotations.json"
    )
    if not os.path.exists(annotations_file):
        continue
    annotations = pd.read_json(annotations_file)
    model_name_to_win_rate[model_name] = get_length_controlled_winrate(annotations)

# Transpose so that each model is a row.
win_rate_df = pd.DataFrame(model_name_to_win_rate).T
76
+
77
# Build the final table in one chain. Each length table exposes generic
# "mean"/"std" columns, so they are renamed right after the join that brings
# them in, before the next table with the same column names is joined.
# Inner joins keep only models present in every table.
df = (
    num_words_df.join(win_rate_df, how="inner")
    .rename(columns={"mean": "num_words_mean", "std": "num_words_std"})
    .join(num_tokens_df, how="inner")
    .rename(columns={"mean": "num_tokens_mean", "std": "num_tokens_std"})
)

# Persist the combined table to the JSON file under data/.
df.to_json("data/model_win_rates.json")
requirements.txt CHANGED
@@ -1,3 +1,8 @@
1
  streamlit
2
  plotly
3
  pandas
 
 
 
 
 
 
1
  streamlit
2
  plotly
3
  pandas
4
+ tiktoken
5
+ alpaca-eval
6
+ seaborn
7
+ setuptools
8
+ statsmodels