justinxzhao committed on
Commit
707a231
1 Parent(s): ed3afcd

Track large files with Git LFS, and expand app to include a data explorer and more length-based visualizations.

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/df_response_judging.jsonl filter=lfs diff=lfs merge=lfs -text
37
+ data/df_responses.jsonl filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,2 +1,3 @@
1
  env/
2
- submodules/
 
 
1
  env/
2
+ submodules/
3
+ .DS_Store
app.py CHANGED
@@ -3,6 +3,7 @@ import pandas as pd
3
  import plotly.express as px
4
  import plotly.graph_objects as go
5
  import statsmodels.api as sm
 
6
 
7
  # Set the layout to wide
8
  st.set_page_config(layout="wide")
@@ -36,206 +37,615 @@ def prep_rankings_table(df, y_column):
36
  return df_copy
37
 
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  def app():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  st.title("AlpacaEval Visualizations")
41
 
42
- st.markdown("## Win rate vs. overall mean length")
43
 
44
  # Load the data
45
- df = pd.read_json("data/model_win_rates.json")
46
-
47
- # Add a model name column for hover labels
48
- df["model_name"] = df.index.astype(str)
49
-
50
- # Define the preset groups
51
- presets = {
52
- "gpt": df[df["model_name"].str.contains("openai|gpt", case=False)][
53
- "model_name"
54
- ].tolist(),
55
- "claude": df[df["model_name"].str.contains("claude", case=False)][
56
- "model_name"
57
- ].tolist(),
58
- "moa": df[df["model_name"].str.contains("moa", case=False)][
59
- "model_name"
60
- ].tolist(),
61
- "llama": df[df["model_name"].str.contains("llama", case=False)][
62
- "model_name"
63
- ].tolist(),
64
- "custom": [],
65
- }
66
-
67
- # Add radio button for preset groups
68
- preset_selection = st.radio(
69
- "Select a preset group of models or choose 'custom' to select manually",
70
- options=["custom", "gpt", "claude", "moa", "llama"],
71
  )
72
 
73
- # Add multiselect for custom model selection
74
- if preset_selection == "custom":
75
- selected_models = st.multiselect(
76
- "Select models to highlight", options=df["model_name"].unique()
77
- )
78
- else:
79
- selected_models = presets[preset_selection]
80
-
81
- def create_scatter_plot(df, y_column, selected_models, title):
82
- fig = go.Figure()
83
-
84
- # Add scatter plots for num_words_mean and num_tokens_mean
85
- fig.add_trace(
86
- go.Scatter(
87
- x=df["num_words_mean"],
88
- y=df[y_column],
89
- mode="markers",
90
- name="words",
91
- text=df["model_name"],
92
- marker=dict(size=5, color="skyblue"),
93
- showlegend=True,
94
- )
95
- )
96
- fig.add_trace(
97
- go.Scatter(
98
- x=df["num_tokens_mean"],
99
- y=df[y_column],
100
- mode="markers",
101
- name="tokens",
102
- text=df["model_name"],
103
- marker=dict(size=5, color="orange"),
104
- showlegend=True,
105
- visible="legendonly", # Make 'words' trace initially visible only in legend
106
- )
107
- )
108
-
109
- # Highlight selected models
110
- if selected_models:
111
- selected_data = df[df["model_name"].isin(selected_models)]
 
 
112
  fig.add_trace(
113
  go.Scatter(
114
- x=selected_data["num_words_mean"],
115
- y=selected_data[y_column],
116
  mode="markers",
117
- name="selected words",
118
- text=selected_data["model_name"],
119
- marker=dict(size=10, color="blue"),
120
  showlegend=True,
121
  )
122
  )
123
  fig.add_trace(
124
  go.Scatter(
125
- x=selected_data["num_tokens_mean"],
126
- y=selected_data[y_column],
127
  mode="markers",
128
- name="selected tokens",
129
- text=selected_data["model_name"],
130
- marker=dict(size=10, color="orangered"),
131
  showlegend=True,
132
- visible="legendonly", # Make 'selected words' trace initially visible only in legend
133
  )
134
  )
135
 
136
- # Add trendlines
137
- def add_trendline(fig, x, y, name, color, visibility="legendonly"):
138
- X = sm.add_constant(df[x])
139
- model = sm.OLS(df[y], X).fit()
140
- trendline = model.predict(X)
141
- fig.add_trace(
142
- go.Scatter(
143
- x=df[x],
144
- y=trendline,
145
- mode="lines",
146
- name=f"{name} trendline",
147
- line=dict(color=color, width=2),
148
- visible=visibility, # Control the initial visibility
149
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  )
151
- return model.rsquared
152
 
153
- r_squared_words = add_trendline(
154
- fig, "num_words_mean", y_column, "words", "blue", visibility=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  )
156
- r_squared_tokens = add_trendline(
157
- fig, "num_tokens_mean", y_column, "tokens", "orangered"
158
  )
 
 
 
 
 
 
159
 
160
- # Update layout with titles and labels
161
- fig.update_layout(
162
- xaxis_title="Mean length",
163
- yaxis_title=(
164
- "Win rate"
165
- if y_column == "win_rate"
166
- else (
167
- "LC Win Rate"
168
- if y_column == "length_controlled_winrate"
169
- else "Discrete Win Rate"
 
 
170
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  ),
172
- title=title,
173
- legend_title="Legend",
174
  )
175
 
176
- return fig, r_squared_words, r_squared_tokens
 
 
 
 
 
 
 
177
 
178
- y_column1 = "length_controlled_winrate"
179
- y_column2 = "win_rate"
180
- y_column3 = "discrete_win_rate"
181
 
182
- fig1, r_squared_words_1, r_squared_tokens_1 = create_scatter_plot(
183
- df, y_column1, selected_models, "Length-Controlled Win Rate"
184
- )
185
- fig2, r_squared_words_2, r_squared_tokens_2 = create_scatter_plot(
186
- df, y_column2, selected_models, "Win Rate"
187
- )
188
- fig3, r_squared_words_3, r_squared_tokens_3 = create_scatter_plot(
189
- df, y_column3, selected_models, "Discrete Win Rate"
190
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
- # Create tabs for each chart
193
- tab1, tab2, tab3 = st.tabs(["LC Win Rate", "Win Rate", "Discrete Win Rate"])
194
-
195
- with tab1:
196
- col1, col2 = st.columns([3, 2])
197
- col1.plotly_chart(fig1)
198
- col2.markdown("#### Rankings")
199
- prepped_df = prep_rankings_table(df, "length_controlled_winrate")
200
- col2.dataframe(
201
- prepped_df,
202
- hide_index=True,
203
- )
204
- with st.expander("Trendline R²"):
205
- st.markdown(
206
- f"- R² (Words vs {y_column1}): {r_squared_words_1:.2f} \n- R² (Tokens vs {y_column1}): {r_squared_tokens_1:.2f}"
207
- )
208
-
209
- with tab2:
210
- col1, col2 = st.columns([3, 2])
211
- col1.plotly_chart(fig2)
212
- col2.markdown("#### Rankings")
213
- prepped_df = prep_rankings_table(df, "win_rate")
214
- col2.dataframe(
215
- prepped_df,
216
- hide_index=True,
217
- )
218
- with st.expander("Trendline R²"):
219
- st.markdown(
220
- f"- R² (Words vs {y_column2}): {r_squared_words_2:.2f} \n- R² (Tokens vs {y_column2}): {r_squared_tokens_2:.2f}"
221
- )
222
-
223
- with tab3:
224
- col1, col2 = st.columns([3, 2])
225
- col1.plotly_chart(fig3)
226
- col2.markdown("#### Rankings")
227
- prepped_df = prep_rankings_table(df, "discrete_win_rate")
228
- col2.dataframe(
229
- prepped_df,
230
- hide_index=True,
231
- )
232
- with st.expander("Trendline R²"):
233
- st.markdown(
234
- f"- R² (Words vs {y_column3}): {r_squared_words_3:.2f}\n- R² (Tokens vs {y_column3}): {r_squared_tokens_3:.2f}"
235
- )
236
-
237
- with st.expander("Raw data"):
238
- st.dataframe(df)
239
 
240
 
241
  if __name__ == "__main__":
 
3
  import plotly.express as px
4
  import plotly.graph_objects as go
5
  import statsmodels.api as sm
6
+ import random
7
 
8
  # Set the layout to wide
9
  st.set_page_config(layout="wide")
 
37
  return df_copy
38
 
39
 
40
def get_preference(preference_score):
    """Return the preference label for the first score in a pandas Series.

    Args:
        preference_score: Series of numeric preference scores. Only the first
            element (by position) is used.

    Returns:
        "[1>2]" when the first score rounds to 1, "[2>1]" when it rounds to 2,
        and "[1=2]" otherwise. An empty Series or a missing (NaN) first score
        also yields "[1=2]", which is the label the element-wise path
        (``series.round(0).apply(get_preference_from_rounded_score)``) gives a
        NaN, so both paths agree instead of this one raising.
    """
    first_score = preference_score.iloc[:1]
    # `.all()` on an empty Series is True, so this single check covers both
    # "no rows" and "first row is NaN" -- the two inputs on which
    # `int(series.round(0).iloc[0])` would raise.
    if first_score.isna().all():
        return get_preference_from_rounded_score(None)
    return get_preference_from_rounded_score(int(first_score.round(0).iloc[0]))


def get_preference_from_rounded_score(score):
    """Map an already-rounded preference score to its display label.

    Args:
        score: Rounded score; ints and floats compare equal here (2 == 2.0).

    Returns:
        "[1>2]" for 1, "[2>1]" for 2, and "[1=2]" for every other value
        (including None and NaN).
    """
    if score == 1:
        return "[1>2]"
    if score == 2:
        return "[2>1]"
    return "[1=2]"
56
+
57
+
58
# Colours shared by every chart that is split by assigned preference.
_PREFERENCE_COLORS = {
    "[1>2]": "blue",
    "[2>1]": "orangered",
    "[1=2]": "green",
}


@st.cache_data
def _load_jsonl(path):
    """Read a JSON-lines file into a DataFrame.

    Cached with ``st.cache_data`` because Streamlit re-executes the script on
    every widget interaction; without it the battle file (about 215 MB
    according to its Git LFS pointer) is parsed again on each rerun.
    """
    return pd.read_json(path, lines=True, orient="records")


def _with_length_columns(frame):
    """Return a copy of *frame* with word-count and preference-label columns.

    Adds ``output_1_num_words``, ``output_2_num_words``,
    ``output_num_words_diff`` (output 1 minus output 2) and
    ``assigned_preference``. Working on a copy keeps the assignments off
    filtered views of the loaded frame.
    """
    enriched = frame.copy()
    for side in ("output_1", "output_2"):
        enriched[f"{side}_num_words"] = enriched[side].apply(
            lambda text: len(text.split())
        )
    enriched["output_num_words_diff"] = (
        enriched["output_1_num_words"] - enriched["output_2_num_words"]
    )
    enriched["assigned_preference"] = (
        enriched["preference"].round(0).apply(get_preference_from_rounded_score)
    )
    return enriched


def _create_scatter_plot(df, y_column, selected_models, title):
    """Build the mean-length vs. win-rate scatter plot with OLS trendlines.

    Args:
        df: Frame with ``model_name``, ``num_words_mean``, ``num_tokens_mean``
            and *y_column*.
        y_column: Name of the win-rate column plotted on the y axis.
        selected_models: Model names to highlight with larger markers; may be
            empty.
        title: Figure title.

    Returns:
        ``(fig, r_squared_words, r_squared_tokens)``: the figure and the R² of
        the word-length and token-length trendlines.
    """
    fig = go.Figure()

    def add_markers(frame, x_column, name, size, color, **trace_options):
        fig.add_trace(
            go.Scatter(
                x=frame[x_column],
                y=frame[y_column],
                mode="markers",
                name=name,
                text=frame["model_name"],
                marker=dict(size=size, color=color),
                showlegend=True,
                **trace_options,
            )
        )

    def add_trendline(x_column, name, color, visibility="legendonly"):
        design = sm.add_constant(df[x_column])
        model = sm.OLS(df[y_column], design).fit()
        fig.add_trace(
            go.Scatter(
                x=df[x_column],
                y=model.predict(design),
                mode="lines",
                name=f"{name} trendline",
                line=dict(color=color, width=2),
                visible=visibility,
            )
        )
        return model.rsquared

    # Word-based traces are shown by default; token-based traces start hidden
    # and can be switched on from the legend.
    add_markers(df, "num_words_mean", "words", 5, "skyblue")
    add_markers(df, "num_tokens_mean", "tokens", 5, "orange", visible="legendonly")

    if selected_models:
        selected_data = df[df["model_name"].isin(selected_models)]
        add_markers(selected_data, "num_words_mean", "selected words", 10, "blue")
        add_markers(
            selected_data,
            "num_tokens_mean",
            "selected tokens",
            10,
            "orangered",
            visible="legendonly",
        )

    r_squared_words = add_trendline("num_words_mean", "words", "blue", visibility=True)
    r_squared_tokens = add_trendline("num_tokens_mean", "tokens", "orangered")

    y_axis_titles = {
        "win_rate": "Win rate",
        "length_controlled_winrate": "LC Win Rate",
    }
    fig.update_layout(
        xaxis_title="Mean length",
        yaxis_title=y_axis_titles.get(y_column, "Discrete Win Rate"),
        title=title,
        legend_title="Legend",
    )
    return fig, r_squared_words, r_squared_tokens


def _render_win_rate_tab(df, fig, y_column, r_squared_markdown):
    """Render one win-rate tab: chart, rankings table and the R² expander."""
    col1, col2 = st.columns([3, 2])
    col1.plotly_chart(fig)
    col2.markdown("#### Rankings")
    col2.dataframe(prep_rankings_table(df, y_column), hide_index=True)
    with st.expander("Trendline R²"):
        st.markdown(r_squared_markdown)


def _render_length_bias_tab(df, df_response_judging, fixed_model):
    """Render the "Length bias in overall win rate" tab."""
    name_patterns = {
        "gpt": "openai|gpt",
        "claude": "claude",
        "moa": "moa",
        "llama": "llama",
    }
    presets = {
        group: df[df["model_name"].str.contains(pattern, case=False)][
            "model_name"
        ].tolist()
        for group, pattern in name_patterns.items()
    }
    presets["custom"] = []

    preset_selection = st.radio(
        "Select a preset group of models or choose 'custom' to select manually.",
        options=["custom", "gpt", "claude", "moa", "llama"],
    )

    st.divider()

    if preset_selection == "custom":
        selected_models = st.multiselect(
            "Select models to highlight", options=df["model_name"].unique()
        )
    else:
        selected_models = presets[preset_selection]

    st.markdown("## Overall win rate")
    y_column1 = "length_controlled_winrate"
    y_column2 = "win_rate"
    y_column3 = "discrete_win_rate"

    fig1, r_squared_words_1, r_squared_tokens_1 = _create_scatter_plot(
        df, y_column1, selected_models, "Length-Controlled Win Rate"
    )
    fig2, r_squared_words_2, r_squared_tokens_2 = _create_scatter_plot(
        df, y_column2, selected_models, "Win Rate"
    )
    fig3, r_squared_words_3, r_squared_tokens_3 = _create_scatter_plot(
        df, y_column3, selected_models, "Discrete Win Rate"
    )

    tab1, tab2, tab3 = st.tabs(["LC Win Rate", "Win Rate", "Discrete Win Rate"])
    with tab1:
        _render_win_rate_tab(
            df,
            fig1,
            y_column1,
            f"- R² (Words vs {y_column1}): {r_squared_words_1:.2f} \n- R² (Tokens vs {y_column1}): {r_squared_tokens_1:.2f}",
        )
    with tab2:
        _render_win_rate_tab(
            df,
            fig2,
            y_column2,
            f"- R² (Words vs {y_column2}): {r_squared_words_2:.2f} \n- R² (Tokens vs {y_column2}): {r_squared_tokens_2:.2f}",
        )
    with tab3:
        _render_win_rate_tab(
            df,
            fig3,
            y_column3,
            f"- R² (Words vs {y_column3}): {r_squared_words_3:.2f}\n- R² (Tokens vs {y_column3}): {r_squared_tokens_3:.2f}",
        )

    st.markdown("## Length bias in battles")

    # With no highlighted models every battle is plotted; otherwise only the
    # battles whose second generator is one of the highlighted models.
    battles = df_response_judging
    if selected_models:
        battles = battles[battles["generator_2"].isin(selected_models)]
    battles = _with_length_columns(battles)

    col1, col2 = st.columns(2)
    fig = px.scatter(
        battles,
        x="output_1_num_words",
        y="output_2_num_words",
        color="assigned_preference",
        title="Pairwise preference based on response length",
        labels={
            "output_1_num_words": f"{fixed_model} (1) number of words",
            "output_2_num_words": "Target model (2) number of words",
        },
        color_discrete_map=_PREFERENCE_COLORS,
    )
    col1.plotly_chart(fig)

    fig = px.histogram(
        battles,
        x="output_num_words_diff",
        color="assigned_preference",
        title="Pairwise preference counts based on difference in response length",
        color_discrete_map=_PREFERENCE_COLORS,
        range_x=[-500, 500],
        labels={
            "output_num_words_diff": f"Length difference in words between {fixed_model} and target model"
        },
    )
    col2.plotly_chart(fig)

    # NOTE(review): assumed to sit inside this tab -- confirm against the
    # original indentation.
    with st.expander("Raw data"):
        st.dataframe(df)


def _write_model_list(container, heading, rows, fixed_model):
    """Write *rows* as a bulleted list of model names and word counts.

    Writes "None" (without the heading) when *rows* is empty. Rows whose
    second generator is the reference model itself are left out of the list.
    """
    if rows.empty:
        container.write("None")
        return
    bullets = "".join(
        f"- {row['generator_2']} ({row['output_2_num_words']})\n"
        for _, row in rows.iterrows()
        if row["generator_2"] != fixed_model
    )
    container.markdown(heading)
    container.markdown(bullets)


def _render_data_explorer(df_response_judging, model_options, fixed_model):
    """Render the "Data explorer" tab for browsing individual battles."""
    dataset_choices = ["all"] + df_response_judging["dataset"].dropna().unique().tolist()

    # The stored model (initially "gpt4") may not occur in the data; fall back
    # to the first available model so the battle lookup and the selector agree.
    if model_options and st.session_state.selected_model not in model_options:
        st.session_state.selected_model = model_options[0]

    def update_instruction_options():
        """Refresh the instruction list for the currently selected dataset."""
        selected_dataset = st.session_state.dataset_selector
        if selected_dataset == "all" or selected_dataset == "NEW":
            rows = df_response_judging
        elif (
            selected_dataset == "None"
            or selected_dataset is None
            or str(selected_dataset) == ""
        ):
            rows = df_response_judging[pd.isna(df_response_judging["dataset"])]
        else:
            rows = df_response_judging[
                df_response_judging["dataset"] == selected_dataset
            ]
        st.session_state.instruction_options = rows["instruction"].unique().tolist()

    def update_instruction():
        st.session_state.selected_instruction = st.session_state.instruction_selector

    def update_model():
        st.session_state.selected_model = st.session_state.model_selector

    def randomize_selection():
        """Pick a random dataset, model and instruction."""
        st.session_state.dataset_selector = random.choice(dataset_choices)
        if model_options:
            st.session_state.selected_model = random.choice(model_options)
        update_instruction_options()
        options = st.session_state.instruction_options
        # random.choice raises IndexError on an empty sequence.
        st.session_state.selected_instruction = (
            random.choice(options) if options else None
        )

    st.markdown("## Choose example")
    st.button(
        ":game_die: Randomize!",
        on_click=randomize_selection,
        type="primary",
    )

    left_col, right_col = st.columns([1, 3])

    st.session_state.selected_dataset = left_col.selectbox(
        "Select Dataset",
        dataset_choices,
        key="dataset_selector",
        on_change=update_instruction_options,
    )
    update_instruction_options()
    instruction_options = st.session_state.instruction_options
    st.session_state.selected_instruction = right_col.selectbox(
        f"Select Instruction ({len(instruction_options)} unique instructions)",
        instruction_options,
        key="instruction_selector",
        on_change=update_instruction,
        index=(
            instruction_options.index(st.session_state.selected_instruction)
            if st.session_state.selected_instruction in instruction_options
            else 0
        ),
    )

    is_fixed_model = df_response_judging["generator_1"] == fixed_model
    is_selected_instruction = (
        df_response_judging["instruction"] == st.session_state.selected_instruction
    )

    st.divider()

    st.markdown("## Selected instruction")
    st.info(st.session_state.selected_instruction)

    st.divider()

    st.markdown("## Overall Battles")
    all_models_judgings_details = _with_length_columns(
        df_response_judging[is_fixed_model & is_selected_instruction]
    )
    if all_models_judgings_details.empty:
        # Everything below indexes the first matching row, which would raise
        # IndexError on an empty selection.
        st.warning("No battles found for the selected instruction.")
        return

    col1, col2, col3 = st.columns(3)

    fig = px.histogram(
        all_models_judgings_details,
        x="output_num_words_diff",
        color="assigned_preference",
        title="Pairwise preference counts based on difference in response length",
        color_discrete_map=_PREFERENCE_COLORS,
        range_x=[-500, 500],
        labels={
            "output_num_words_diff": "Difference in number of words between response 1 and 2.",
            "assigned_preference": "Assigned Preference",
        },
    )
    col1.plotly_chart(fig)

    fig = px.histogram(
        all_models_judgings_details,
        x="assigned_preference",
        title=f"Assigned preferences for {fixed_model} vs. all models",
    )
    col2.plotly_chart(fig)

    # Models preferred over the reference model, split by whether their
    # response is longer than the reference response.
    num_words_for_fixed_model = len(
        all_models_judgings_details.iloc[0]["output_1"].split()
    )
    better_models = all_models_judgings_details[
        all_models_judgings_details["assigned_preference"] == "[2>1]"
    ]
    is_longer = better_models["output_2_num_words"] > num_words_for_fixed_model
    col3.markdown(
        f"### Models that are better than {fixed_model} ({num_words_for_fixed_model})"
    )
    _write_model_list(
        col3,
        "**With shorter or equal length responses:**",
        better_models[~is_longer],
        fixed_model,
    )
    _write_model_list(
        col3, "**With longer responses:**", better_models[is_longer], fixed_model
    )

    st.markdown("## Individual Battle Details")
    judging_details = df_response_judging[
        is_fixed_model
        & (df_response_judging["generator_2"] == st.session_state.selected_model)
        & is_selected_instruction
    ]

    # Stays None when the selected model has no battle for this instruction;
    # the response boxes below then fall through to their "not preferred" style.
    preference = None
    if not judging_details["preference"].empty:
        preference = get_preference(judging_details["preference"])
        if preference == "[1>2]":
            st.write(
                f"**{fixed_model}** is better than **{st.session_state.selected_model}**"
            )
        else:
            st.write(
                f"**{st.session_state.selected_model}** is better than **{fixed_model}**"
            )
        # First row, matching get_preference; `.item()` would raise when more
        # than one row matches.
        score = judging_details["preference"].round(2).iloc[0]
        st.write(f"- **Score:** {score}\n- **Assigned preference:** {preference}")

        # NOTE(review): assumed to belong to the branch above -- confirm
        # against the original indentation.
        with st.expander("Additional information"):
            st.write(
                judging_details[
                    [
                        "instruction",
                        "time_per_example",
                        "price_per_example",
                        "raw_completion",
                    ]
                ]
            )

    st.markdown("## Responses")
    col1, col2 = st.columns(2)

    with col1:
        st.selectbox(
            "Reference model",
            [fixed_model],
            key="fixed_model",
        )

        if st.session_state.selected_instruction:
            # Non-empty: guarded by the early return above.
            response_details_fixed = all_models_judgings_details.iloc[0]
            st.write(
                f'Number of words: {len(response_details_fixed["output_1"].split())}'
            )
            if preference == "[1>2]":
                st.success(response_details_fixed["output_1"])
            else:
                st.error(response_details_fixed["output_1"])

    with col2:
        st.session_state.selected_model = st.selectbox(
            "Select Model",
            model_options,
            key="model_selector",
            on_change=update_model,
            index=(
                model_options.index(st.session_state.selected_model)
                if st.session_state.selected_model in model_options
                else 0
            ),
        )

        if st.session_state.selected_model and st.session_state.selected_instruction:
            responses = df_response_judging[
                is_selected_instruction
                & (
                    df_response_judging["generator_2"]
                    == st.session_state.selected_model
                )
            ]
            if responses.empty:
                st.warning("No response from this model for the selected instruction.")
            else:
                response_details_dynamic = responses.iloc[0]
                st.write(
                    f'Number of words: {len(response_details_dynamic["output_2"].split())}'
                )
                if preference == "[2>1]":
                    st.success(response_details_dynamic["output_2"])
                else:
                    st.error(response_details_dynamic["output_2"])


def app():
    """Render the AlpacaEval visualisation app.

    Initialises the session-state defaults, loads the win-rate table and the
    battle judgings, and renders two tabs: length bias in overall win rate,
    and a data explorer for individual battles against the reference model.
    """
    fixed_model = "gpt4_1106_preview"

    session_defaults = {
        "selected_instruction": None,
        "selected_model": "gpt4",
        "selected_judge": None,
        "selected_dataset": "NEW",
        "instruction_options": [],
    }
    for key, default in session_defaults.items():
        if key not in st.session_state:
            st.session_state[key] = default

    st.title("AlpacaEval Visualizations")

    outer_tabs = st.tabs(["Length bias in overall win rate", "Data explorer"])

    df = _load_jsonl("data/model_win_rates.jsonl")
    df_response_judging = _load_jsonl("data/df_response_judging.jsonl")

    model_options = df_response_judging["generator_2"].unique().tolist()

    with outer_tabs[0]:
        _render_length_bias_tab(df, df_response_judging, fixed_model)

    with outer_tabs[1]:
        _render_data_explorer(df_response_judging, model_options, fixed_model)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
649
 
650
 
651
  if __name__ == "__main__":
data/df_response_judging.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5514fedae0d0375f5711fad6fdbf37c8e5e09178d16c29052db0256d09cf2240
3
+ size 214865144
data/df_responses.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbdf08ff250eb5104ee69244672a4baa184b8ae2f928b5c56344102d20926c0d
3
+ size 89555570
data/model_win_rates.json DELETED
@@ -1 +0,0 @@
1
- {"num_words_mean":{"gpt-3.5-turbo-0301":136,"gpt-3.5-turbo-1106_verbose":166,"vicuna-13b-v1.5-togetherai":177,"Qwen1.5-1.8B-Chat":426,"recycled-wizardlm-7b-v1.0":235,"aligner-2b_claude-3-opus-20240229":257,"Qwen1.5-110B-Chat":253,"claude-3-opus-20240229":216,"llama-2-7b-chat-hf":241,"mistral-medium":241,"vicuna-33b-v1.3":237,"cohere":315,"claude-2":174,"guanaco-65b":203,"Mixtral-8x7B-Instruct-v0.1":238,"openchat-v2-w-13b":249,"falcon-7b-instruct":74,"wizardlm-13b-v1.1":244,"Meta-Llama-3-8B-Instruct":301,"FsfairX-Zephyr-Chat-v0.1":342,"Infinity-Instruct-3M-0613-Mistral-7B":79,"Qwen1.5-72B-Chat":243,"xwinlm-7b-v0.1":304,"Mixtral-8x22B-Instruct-v0.1":229,"vicuna-13b-v1.5":177,"dbrx-instruct":235,"zephyr-7b-alpha":208,"tulu-2-dpo-13b":257,"Qwen1.5-7B-Chat":254,"Together-MoA-Lite":297,"cut-13b":269,"Meta-Llama-3-70B-Instruct":299,"vicuna-13b-v1.3":189,"claude-instant-1.2":179,"airoboros-65b":236,"openbuddy-llama2-13b-v11.1":177,"phi-2":102,"Together-MoA":272,"mistral-large-2402":218,"openbuddy-llama-30b-v7.1":162,"TempNet-LLaMA2-Chat-70B-v0.1":296,"pairrm-tulu-2-13b":236,"recycled-wizardlm-7b-v2.0":251,"Storm-7B-best-of-64":336,"vicuna-7b":175,"claude-3-sonnet-20240229":221,"Mistral-7B-Instruct-v0.2":261,"Samba-CoE-v0.1":190,"claude":176,"Nanbeige2-8B-Chat":415,"REBEL-Llama-3-8B-Instruct":341,"chatglm2-6b":175,"gpt-4o-2024-05-13":286,"gpt4_1106_preview_verbose":378,"TempNet-LLaMA2-Chat-13B-v0.1":253,"text_davinci_001":50,"Mixtral-8x7B-Instruct-v0.1_verbose":336,"baize-v2-7b":189,"phi-2-dpo":270,"alpaca-farm-ppo-human":135,"Nanbeige2-16B-Chat":284,"gpt4_0613":183,"pythia-12b-mix-sft":147,"alpaca-7b-neft":170,"Qwen1.5-14B-Chat":252,"gpt-4-0125-preview":313,"guanaco-33b":220,"oasst-sft-llama-33b":125,"gpt4_0613_verbose":237,"llama-2-chat-7b-evol70k-neft":260,"gpt35_turbo_instruct":166,"platolm-7b":209,"llama-2-13b-chat-hf":249,"Nanbeige-Plus-Chat-v0.1":391,"openchat-v2-13b":249,"mistral-orpo-beta":263,"Snorkel-Mistral-PairRM-DPO-best-of-16":393,"tulu-2-dpo-7b":278,"alpaca
-7b_verbose":90,"OpenHermes-2.5-Mistral-7B":176,"claude-2.1_verbose":228,"ultralm-13b-v2.0":231,"deita-7b-v1.0":228,"minichat-1.5-3b":242,"Qwen-14B-Chat":167,"airoboros-33b":238,"alpaca-farm-ppo-sim-gpt4-20k":82,"ultralm-13b":181,"openbuddy-falcon-40b-v9":182,"openchat8192-13b":268,"wizardlm-13b":163,"vicuna-13b":175,"merlinite-7B-AOT":281,"gpt4_0314":215,"gpt4_0613_concise":99,"jina-chat":106,"Contextual-KTO-Mistral-PairRM":381,"xwinlm-13b-v0.1":300,"LMCocktail-10.7B-v1":182,"SPPO-Mistral7B-PairRM-ExPO":344,"Mixtral-8x7B-Instruct-v0.1_concise":144,"gpt4_1106_preview_concise":177,"Mistral-7B-ReMax-v0.1":234,"Llama-3-Instruct-8B-SimPO-ExPO":261,"dolphin-2.2.1-mistral-7b":182,"humpback-llama2-70b":178,"openpipe-moa-gpt-4-turbo-v1":272,"vicuna-7b-v1.5":181,"Starling-LM-7B-alpha":301,"falcon-40b-instruct":109,"Samba-CoE-v0.2-best-of-16":225,"opencoderplus-15b":262,"xwinlm-70b-v0.1":282,"wizardlm-13b-v1.2":264,"aligner-2b_qwen1.5-72b-chat":280,"internlm2-chat-7b-ExPO":364,"claude-2.1":177,"vicuna-7b-v1.3":184,"oasst-rlhf-llama-33b":181,"zephyr-7b-alpha-ExPO":201,"openchat-v3.1-13b":235,"SPPO-Llama-3-Instruct-8B-PairRM":317,"minotaur-13b":138,"tulu-2-dpo-13b-ExPO":264,"zephyr-7b-beta-ExPO":224,"tulu-2-dpo-7b-ExPO":277,"Llama-3-Instruct-8B-SimPO":272,"baize-v2-13b":155,"guanaco-7b":233,"ultralm-13b-v2.0-best-of-16":273,"claude-2.1_concise":91,"openchat-13b":260,"tulu-2-dpo-70b":231,"deepseek-llm-67b-chat":189,"humpback-llama-65b":196,"tulu-2-dpo-70b-ExPO":276,"TempNet-LLaMA2-Chat-7B-v0.1":246,"nous-hermes-13b":139,"gpt-3.5-turbo-0613":207,"alpaca-7b_concise":58,"baichuan-13b-chat":225,"claude-3-5-sonnet-20240620":228,"gpt-3.5-turbo-1106":126,"minichat-3b":145,"Storm-7B":300,"oasst-sft-pythia-12b":118,"Conifer-7B-DPO":205,"Snorkel-Mistral-PairRM-DPO":414,"internlm2-chat-20b-ExPO":507,"Samba-CoE-v0.2":210,"gemini-pro":228,"pairrm-tulu-2-70b":264,"text_davinci_003":52,"gpt4":215,"Yi-34B-Chat":339,"Starling-LM-7B-beta-ExPO":336,"pairrm-Yi-34B-Chat":349,"gpt4_1106_preview":323,
"evo-7b":280,"zephyr-7b-beta":229,"guanaco-13b":308,"alpaca-7b":66,"internlm2-chat-20b-ppo":371,"gemma-2b-it":165,"pairrm-zephyr-7b-beta":236,"evo-v2-7b":274,"causallm-14b":228,"SPPO-Mistral7B-PairRM":322,"gpt-3.5-turbo-1106_concise":68,"openbuddy-llama-65b-v8":194,"claude2-alpaca-13b":186,"Starling-LM-7B-alpha-ExPO":288,"openbuddy-falcon-7b-v6":192,"gemma-7b-it":176,"phi-2-sft":175,"gpt4_gamed":11,"llama-2-70b-chat-hf":292,"openbuddy-llama2-70b-v10.1":178,"wizardlm-70b":249,"ultralm-13b-best-of-16":311},"num_words_std":{"gpt-3.5-turbo-0301":109,"gpt-3.5-turbo-1106_verbose":110,"vicuna-13b-v1.5-togetherai":119,"Qwen1.5-1.8B-Chat":358,"recycled-wizardlm-7b-v1.0":129,"aligner-2b_claude-3-opus-20240229":145,"Qwen1.5-110B-Chat":153,"claude-3-opus-20240229":113,"llama-2-7b-chat-hf":134,"mistral-medium":166,"vicuna-33b-v1.3":149,"cohere":179,"claude-2":90,"guanaco-65b":157,"Mixtral-8x7B-Instruct-v0.1":163,"openchat-v2-w-13b":151,"falcon-7b-instruct":92,"wizardlm-13b-v1.1":156,"Meta-Llama-3-8B-Instruct":167,"FsfairX-Zephyr-Chat-v0.1":187,"Infinity-Instruct-3M-0613-Mistral-7B":78,"Qwen1.5-72B-Chat":144,"xwinlm-7b-v0.1":207,"Mixtral-8x22B-Instruct-v0.1":168,"vicuna-13b-v1.5":122,"dbrx-instruct":143,"zephyr-7b-alpha":189,"tulu-2-dpo-13b":309,"Qwen1.5-7B-Chat":143,"Together-MoA-Lite":161,"cut-13b":159,"Meta-Llama-3-70B-Instruct":171,"vicuna-13b-v1.3":119,"claude-instant-1.2":104,"airoboros-65b":288,"openbuddy-llama2-13b-v11.1":122,"phi-2":209,"Together-MoA":151,"mistral-large-2402":148,"openbuddy-llama-30b-v7.1":115,"TempNet-LLaMA2-Chat-70B-v0.1":196,"pairrm-tulu-2-13b":135,"recycled-wizardlm-7b-v2.0":134,"Storm-7B-best-of-64":195,"vicuna-7b":124,"claude-3-sonnet-20240229":129,"Mistral-7B-Instruct-v0.2":201,"Samba-CoE-v0.1":138,"claude":97,"Nanbeige2-8B-Chat":173,"REBEL-Llama-3-8B-Instruct":273,"chatglm2-6b":225,"gpt-4o-2024-05-13":190,"gpt4_1106_preview_verbose":190,"TempNet-LLaMA2-Chat-13B-v0.1":136,"text_davinci_001":51,"Mixtral-8x7B-Instruct-v0.1_verbose":169,"baize-v2-7b"
:133,"phi-2-dpo":137,"alpaca-farm-ppo-human":125,"Nanbeige2-16B-Chat":151,"gpt4_0613":124,"pythia-12b-mix-sft":122,"alpaca-7b-neft":82,"Qwen1.5-14B-Chat":145,"gpt-4-0125-preview":180,"guanaco-33b":170,"oasst-sft-llama-33b":115,"gpt4_0613_verbose":127,"llama-2-chat-7b-evol70k-neft":113,"gpt35_turbo_instruct":157,"platolm-7b":145,"llama-2-13b-chat-hf":135,"Nanbeige-Plus-Chat-v0.1":159,"openchat-v2-13b":151,"mistral-orpo-beta":161,"Snorkel-Mistral-PairRM-DPO-best-of-16":185,"tulu-2-dpo-7b":415,"alpaca-7b_verbose":68,"OpenHermes-2.5-Mistral-7B":154,"claude-2.1_verbose":94,"ultralm-13b-v2.0":137,"deita-7b-v1.0":160,"minichat-1.5-3b":157,"Qwen-14B-Chat":124,"airoboros-33b":246,"alpaca-farm-ppo-sim-gpt4-20k":53,"ultralm-13b":108,"openbuddy-falcon-40b-v9":124,"openchat8192-13b":182,"wizardlm-13b":117,"vicuna-13b":115,"merlinite-7B-AOT":128,"gpt4_0314":160,"gpt4_0613_concise":79,"jina-chat":76,"Contextual-KTO-Mistral-PairRM":205,"xwinlm-13b-v0.1":188,"LMCocktail-10.7B-v1":118,"SPPO-Mistral7B-PairRM-ExPO":144,"Mixtral-8x7B-Instruct-v0.1_concise":131,"gpt4_1106_preview_concise":130,"Mistral-7B-ReMax-v0.1":132,"Llama-3-Instruct-8B-SimPO-ExPO":134,"dolphin-2.2.1-mistral-7b":167,"humpback-llama2-70b":133,"openpipe-moa-gpt-4-turbo-v1":137,"vicuna-7b-v1.5":117,"Starling-LM-7B-alpha":194,"falcon-40b-instruct":111,"Samba-CoE-v0.2-best-of-16":140,"opencoderplus-15b":194,"xwinlm-70b-v0.1":169,"wizardlm-13b-v1.2":188,"aligner-2b_qwen1.5-72b-chat":151,"internlm2-chat-7b-ExPO":172,"claude-2.1":92,"vicuna-7b-v1.3":117,"oasst-rlhf-llama-33b":174,"zephyr-7b-alpha-ExPO":149,"openchat-v3.1-13b":156,"SPPO-Llama-3-Instruct-8B-PairRM":148,"minotaur-13b":109,"tulu-2-dpo-13b-ExPO":139,"zephyr-7b-beta-ExPO":157,"tulu-2-dpo-7b-ExPO":144,"Llama-3-Instruct-8B-SimPO":141,"baize-v2-13b":110,"guanaco-7b":200,"ultralm-13b-v2.0-best-of-16":131,"claude-2.1_concise":68,"openchat-13b":168,"tulu-2-dpo-70b":151,"deepseek-llm-67b-chat":122,"humpback-llama-65b":122,"tulu-2-dpo-70b-ExPO":140,"TempNet-LLaMA2-Chat-7B
-v0.1":140,"nous-hermes-13b":130,"gpt-3.5-turbo-0613":149,"alpaca-7b_concise":45,"baichuan-13b-chat":301,"claude-3-5-sonnet-20240620":142,"gpt-3.5-turbo-1106":103,"minichat-3b":119,"Storm-7B":135,"oasst-sft-pythia-12b":120,"Conifer-7B-DPO":126,"Snorkel-Mistral-PairRM-DPO":241,"internlm2-chat-20b-ExPO":206,"Samba-CoE-v0.2":138,"gemini-pro":164,"pairrm-tulu-2-70b":212,"text_davinci_003":66,"gpt4":156,"Yi-34B-Chat":178,"Starling-LM-7B-beta-ExPO":106,"pairrm-Yi-34B-Chat":189,"gpt4_1106_preview":181,"evo-7b":176,"zephyr-7b-beta":160,"guanaco-13b":288,"alpaca-7b":52,"internlm2-chat-20b-ppo":268,"gemma-2b-it":128,"pairrm-zephyr-7b-beta":178,"evo-v2-7b":170,"causallm-14b":166,"SPPO-Mistral7B-PairRM":148,"gpt-3.5-turbo-1106_concise":57,"openbuddy-llama-65b-v8":113,"claude2-alpaca-13b":127,"Starling-LM-7B-alpha-ExPO":174,"openbuddy-falcon-7b-v6":126,"gemma-7b-it":118,"phi-2-sft":123,"gpt4_gamed":60,"llama-2-70b-chat-hf":174,"openbuddy-llama2-70b-v10.1":118,"wizardlm-70b":144,"ultralm-13b-best-of-16":132},"win_rate":{"gpt-3.5-turbo-0301":9.6224532951,"gpt-3.5-turbo-1106_verbose":12.7631698103,"vicuna-13b-v1.5-togetherai":6.9582753694,"Qwen1.5-1.8B-Chat":3.7055568157,"recycled-wizardlm-7b-v1.0":6.6327499605,"aligner-2b_claude-3-opus-20240229":34.4633736232,"Qwen1.5-110B-Chat":33.7770952757,"claude-3-opus-20240229":29.1052695333,"llama-2-7b-chat-hf":4.9613395472,"mistral-medium":21.8557725437,"vicuna-33b-v1.3":12.7059479215,"cohere":12.9014552097,"claude-2":17.1882403567,"guanaco-65b":6.8584945134,"Mixtral-8x7B-Instruct-v0.1":18.2553176264,"openchat-v2-w-13b":9.6153441584,"falcon-7b-instruct":2.1466175532,"wizardlm-13b-v1.1":11.2339095729,"Meta-Llama-3-8B-Instruct":22.5699026093,"FsfairX-Zephyr-Chat-v0.1":35.9464864409,"Infinity-Instruct-3M-0613-Mistral-7B":15.7478281307,"Qwen1.5-72B-Chat":26.4982833956,"xwinlm-7b-v0.1":11.2456517378,"Mixtral-8x22B-Instruct-v0.1":22.2101705475,"vicuna-13b-v1.5":6.7221220149,"dbrx-instruct":19.7553327319,"zephyr-7b-alpha":8.3526639682,"tulu-2-dpo
-13b":10.1197883883,"Qwen1.5-7B-Chat":11.7709270696,"Together-MoA-Lite":56.5930456223,"cut-13b":10.7790892025,"Meta-Llama-3-70B-Instruct":33.1778569588,"vicuna-13b-v1.3":7.1372403865,"claude-instant-1.2":16.1273996216,"airoboros-65b":9.3889501497,"openbuddy-llama2-13b-v11.1":6.1747164895,"phi-2":2.350209543,"Together-MoA":59.8688062333,"mistral-large-2402":21.4387759814,"openbuddy-llama-30b-v7.1":6.130014614,"TempNet-LLaMA2-Chat-70B-v0.1":15.0518944202,"pairrm-tulu-2-13b":13.8319010168,"recycled-wizardlm-7b-v2.0":7.3371293705,"Storm-7B-best-of-64":63.0409907519,"vicuna-7b":4.1626111623,"claude-3-sonnet-20240229":25.5563252923,"Mistral-7B-Instruct-v0.2":14.7227726577,"Samba-CoE-v0.1":16.8355018701,"claude":16.9853436124,"Nanbeige2-8B-Chat":39.354502072,"REBEL-Llama-3-8B-Instruct":34.3064238313,"chatglm2-6b":2.7621847965,"gpt-4o-2024-05-13":51.3275757825,"gpt4_1106_preview_verbose":64.303601471,"TempNet-LLaMA2-Chat-13B-v0.1":7.7284050659,"text_davinci_001":2.7640052311,"Mixtral-8x7B-Instruct-v0.1_verbose":24.6140630502,"baize-v2-7b":3.4048149775,"phi-2-dpo":7.7570957018,"alpaca-farm-ppo-human":4.100426815,"Nanbeige2-16B-Chat":37.0360860499,"gpt4_0613":15.7550380876,"pythia-12b-mix-sft":2.578090281,"alpaca-7b-neft":3.1321786695,"Qwen1.5-14B-Chat":18.6458143619,"gpt-4-0125-preview":54.9665397329,"guanaco-33b":5.002493725,"oasst-sft-llama-33b":4.7703909916,"gpt4_0613_verbose":23.2373600435,"llama-2-chat-7b-evol70k-neft":7.6023835122,"gpt35_turbo_instruct":8.4624465044,"platolm-7b":6.3208280585,"llama-2-13b-chat-hf":7.7023099579,"Nanbeige-Plus-Chat-v0.1":56.7030097302,"openchat-v2-13b":8.4350756447,"mistral-orpo-beta":12.5654087946,"Snorkel-Mistral-PairRM-DPO-best-of-16":34.8601328913,"tulu-2-dpo-7b":8.1975153845,"alpaca-7b_verbose":2.9331016025,"OpenHermes-2.5-Mistral-7B":10.3404157058,"claude-2.1_verbose":24.3540710901,"ultralm-13b-v2.0":7.5046229557,"deita-7b-v1.0":12.6466394724,"minichat-1.5-3b":6.5534430528,"Qwen-14B-Chat":7.5023334847,"airoboros-33b":9.0531603961,"a
lpaca-farm-ppo-sim-gpt4-20k":3.4503419871,"ultralm-13b":5.0745903805,"openbuddy-falcon-40b-v9":5.9557428463,"openchat8192-13b":7.472766808,"wizardlm-13b":5.8781525894,"vicuna-13b":5.8311031845,"merlinite-7B-AOT":29.8963508407,"gpt4_0314":22.0732589287,"gpt4_0613_concise":9.4003205746,"jina-chat":7.7861303934,"Contextual-KTO-Mistral-PairRM":33.2273552,"xwinlm-13b-v0.1":17.4279347502,"LMCocktail-10.7B-v1":13.1534309174,"SPPO-Mistral7B-PairRM-ExPO":35.4431306717,"Mixtral-8x7B-Instruct-v0.1_concise":13.7440401548,"gpt4_1106_preview_concise":22.9201944405,"Mistral-7B-ReMax-v0.1":15.999331369,"Llama-3-Instruct-8B-SimPO-ExPO":40.6328540086,"dolphin-2.2.1-mistral-7b":9.0397997282,"humpback-llama2-70b":10.1217715026,"openpipe-moa-gpt-4-turbo-v1":63.1549345123,"vicuna-7b-v1.5":4.7974939392,"Starling-LM-7B-alpha":14.2459235216,"falcon-40b-instruct":3.3429188225,"Samba-CoE-v0.2-best-of-16":26.9882543183,"opencoderplus-15b":7.406222451,"xwinlm-70b-v0.1":21.8129570739,"wizardlm-13b-v1.2":12.0274803428,"aligner-2b_qwen1.5-72b-chat":31.773037737,"internlm2-chat-7b-ExPO":28.0678174371,"claude-2.1":15.7335067364,"vicuna-7b-v1.3":4.6425118575,"oasst-rlhf-llama-33b":6.2964347858,"zephyr-7b-alpha-ExPO":10.5593543457,"openchat-v3.1-13b":11.0822304894,"SPPO-Llama-3-Instruct-8B-PairRM":39.6728609061,"minotaur-13b":5.7389636691,"tulu-2-dpo-13b-ExPO":15.5514054294,"zephyr-7b-beta-ExPO":11.0611168323,"tulu-2-dpo-7b-ExPO":11.529221039,"Llama-3-Instruct-8B-SimPO":40.5297749846,"baize-v2-13b":4.5905453306,"guanaco-7b":2.8800022662,"ultralm-13b-v2.0-best-of-16":13.8533734712,"claude-2.1_concise":9.2271252406,"openchat-13b":8.0223860109,"tulu-2-dpo-70b":15.9828543741,"deepseek-llm-67b-chat":12.0934222649,"humpback-llama-65b":9.4251390478,"tulu-2-dpo-70b-ExPO":22.9806197059,"TempNet-LLaMA2-Chat-7B-v0.1":5.4301432647,"nous-hermes-13b":5.4118789332,"gpt-3.5-turbo-0613":14.0957985739,"alpaca-7b_concise":1.9911763835,"baichuan-13b-chat":1.9921455615,"claude-3-5-sonnet-20240620":40.5602140968,"gpt-3.5-t
urbo-1106":9.177964562,"minichat-3b":3.0071507064,"Storm-7B":50.2688690553,"oasst-sft-pythia-12b":1.7901140832,"Conifer-7B-DPO":11.3135856492,"Snorkel-Mistral-PairRM-DPO":30.2200527007,"internlm2-chat-20b-ExPO":46.1853674689,"Samba-CoE-v0.2":21.8473786693,"gemini-pro":18.1776445406,"pairrm-tulu-2-70b":18.6389629674,"text_davinci_003":1.9621476654,"gpt4":23.5767893148,"Yi-34B-Chat":29.6599467188,"Starling-LM-7B-beta-ExPO":29.6008518479,"pairrm-Yi-34B-Chat":31.2412829468,"gpt4_1106_preview":50.0,"evo-7b":15.5774373995,"zephyr-7b-beta":10.9928857554,"guanaco-13b":3.4695968597,"alpaca-7b":2.5914505402,"internlm2-chat-20b-ppo":21.7491545005,"gemma-2b-it":3.4019714381,"pairrm-zephyr-7b-beta":12.8412782556,"evo-v2-7b":20.8341130226,"causallm-14b":11.14616087,"SPPO-Mistral7B-PairRM":32.2453123638,"gpt-3.5-turbo-1106_concise":7.4158649776,"openbuddy-llama-65b-v8":8.7706501509,"claude2-alpaca-13b":7.4373513248,"Starling-LM-7B-alpha-ExPO":18.1797559203,"openbuddy-falcon-7b-v6":3.521174372,"gemma-7b-it":6.9372943797,"phi-2-sft":3.9775677752,"gpt4_gamed":3.7383373714,"llama-2-70b-chat-hf":13.8882583437,"openbuddy-llama2-70b-v10.1":8.0964220963,"wizardlm-70b":14.3838960868,"ultralm-13b-best-of-16":11.3073149478},"standard_error":{"gpt-3.5-turbo-0301":0.9129656687,"gpt-3.5-turbo-1106_verbose":1.0442468192,"vicuna-13b-v1.5-togetherai":0.7825381738,"Qwen1.5-1.8B-Chat":0.5811750995,"recycled-wizardlm-7b-v1.0":0.7713329914,"aligner-2b_claude-3-opus-20240229":1.3146665263,"Qwen1.5-110B-Chat":1.3776163154,"claude-3-opus-20240229":1.3941539442,"llama-2-7b-chat-hf":0.6691754517,"mistral-medium":1.2682402187,"vicuna-33b-v1.3":0.9992557843,"cohere":1.0141034031,"claude-2":1.1748282562,"guanaco-65b":0.8048449272,"Mixtral-8x7B-Instruct-v0.1":1.1885585969,"openchat-v2-w-13b":0.8908241711,"falcon-7b-instruct":0.4542257929,"wizardlm-13b-v1.1":0.9502711246,"Meta-Llama-3-8B-Instruct":1.2575802331,"FsfairX-Zephyr-Chat-v0.1":1.4410058098,"Infinity-Instruct-3M-0613-Mistral-7B":1.1194852006,"Qwen1.5-7
2B-Chat":1.3042361649,"xwinlm-7b-v0.1":0.9455447881,"Mixtral-8x22B-Instruct-v0.1":1.2780740057,"vicuna-13b-v1.5":0.7674173991,"dbrx-instruct":1.2063251121,"zephyr-7b-alpha":0.8664491645,"tulu-2-dpo-13b":0.929813366,"Qwen1.5-7B-Chat":0.9544463489,"Together-MoA-Lite":1.4464848562,"cut-13b":0.9428953579,"Meta-Llama-3-70B-Instruct":1.3886514096,"vicuna-13b-v1.3":0.7846846272,"claude-instant-1.2":1.1341036838,"airoboros-65b":0.8816208133,"openbuddy-llama2-13b-v11.1":0.753544387,"phi-2":0.4496590406,"Together-MoA":1.4343056045,"mistral-large-2402":1.2485232545,"openbuddy-llama-30b-v7.1":0.7645283386,"TempNet-LLaMA2-Chat-70B-v0.1":1.0801507581,"pairrm-tulu-2-13b":1.0835284665,"recycled-wizardlm-7b-v2.0":0.8012012288,"Storm-7B-best-of-64":1.4253258915,"vicuna-7b":0.6135107768,"claude-3-sonnet-20240229":1.3419811052,"Mistral-7B-Instruct-v0.2":1.0785266447,"Samba-CoE-v0.1":1.1180386125,"claude":1.1687959793,"Nanbeige2-8B-Chat":1.4524224246,"REBEL-Llama-3-8B-Instruct":1.3914900256,"chatglm2-6b":0.5020758951,"gpt-4o-2024-05-13":1.470009459,"gpt4_1106_preview_verbose":1.3348590089,"TempNet-LLaMA2-Chat-13B-v0.1":0.8268032188,"text_davinci_001":0.5177668864,"Mixtral-8x7B-Instruct-v0.1_verbose":1.2975757386,"baize-v2-7b":0.5826293992,"phi-2-dpo":0.8357079426,"alpaca-farm-ppo-human":0.6304721407,"Nanbeige2-16B-Chat":1.4340261273,"gpt4_0613":1.0754642482,"pythia-12b-mix-sft":0.5127326717,"alpaca-7b-neft":0.5522241753,"Qwen1.5-14B-Chat":1.1351340211,"gpt-4-0125-preview":1.4286740089,"guanaco-33b":0.6697115752,"oasst-sft-llama-33b":0.6385940189,"gpt4_0613_verbose":1.2835395056,"llama-2-chat-7b-evol70k-neft":0.8110538776,"gpt35_turbo_instruct":0.8724086934,"platolm-7b":0.7405704765,"llama-2-13b-chat-hf":0.8286143394,"Nanbeige-Plus-Chat-v0.1":1.482841875,"openchat-v2-13b":0.8235980231,"mistral-orpo-beta":0.9929774686,"Snorkel-Mistral-PairRM-DPO-best-of-16":1.3599450437,"tulu-2-dpo-7b":0.8749615125,"alpaca-7b_verbose":0.5302092824,"OpenHermes-2.5-Mistral-7B":0.9356553899,"claude-2.1_verbo
se":1.29358621,"ultralm-13b-v2.0":0.8150376948,"deita-7b-v1.0":1.0352555321,"minichat-1.5-3b":0.7674159339,"Qwen-14B-Chat":0.8147265702,"airoboros-33b":0.8607792116,"alpaca-farm-ppo-sim-gpt4-20k":0.5834901038,"ultralm-13b":0.6707048924,"openbuddy-falcon-40b-v9":0.7388621614,"openchat8192-13b":0.8038094305,"wizardlm-13b":0.704420227,"vicuna-13b":0.7422829864,"merlinite-7B-AOT":1.3666520485,"gpt4_0314":1.2466725495,"gpt4_0613_concise":0.9010212759,"jina-chat":0.8398450576,"Contextual-KTO-Mistral-PairRM":1.3779687478,"xwinlm-13b-v0.1":1.1450161467,"LMCocktail-10.7B-v1":1.0457195357,"SPPO-Mistral7B-PairRM-ExPO":1.3981308966,"Mixtral-8x7B-Instruct-v0.1_concise":1.0718682992,"gpt4_1106_preview_concise":1.2325177143,"Mistral-7B-ReMax-v0.1":1.1288683901,"Llama-3-Instruct-8B-SimPO-ExPO":1.4439449942,"dolphin-2.2.1-mistral-7b":0.8892901247,"humpback-llama2-70b":0.9401806122,"openpipe-moa-gpt-4-turbo-v1":1.4229800988,"vicuna-7b-v1.5":0.6655960677,"Starling-LM-7B-alpha":1.0685460609,"falcon-40b-instruct":0.5541127159,"Samba-CoE-v0.2-best-of-16":1.318903,"opencoderplus-15b":0.8024858021,"xwinlm-70b-v0.1":1.2303274476,"wizardlm-13b-v1.2":0.9717618177,"aligner-2b_qwen1.5-72b-chat":1.2392772646,"internlm2-chat-7b-ExPO":1.3159792318,"claude-2.1":1.1203158654,"vicuna-7b-v1.3":0.6420919828,"oasst-rlhf-llama-33b":0.7417944201,"zephyr-7b-alpha-ExPO":0.9774634449,"openchat-v3.1-13b":0.9501308701,"SPPO-Llama-3-Instruct-8B-PairRM":1.4247223562,"minotaur-13b":0.7271241247,"tulu-2-dpo-13b-ExPO":1.1714853384,"zephyr-7b-beta-ExPO":1.0204784889,"tulu-2-dpo-7b-ExPO":1.0497814893,"Llama-3-Instruct-8B-SimPO":1.4225744647,"baize-v2-13b":0.6497033227,"guanaco-7b":0.5202924149,"ultralm-13b-v2.0-best-of-16":1.049344706,"claude-2.1_concise":0.8921752289,"openchat-13b":0.8368334957,"tulu-2-dpo-70b":1.1457861368,"deepseek-llm-67b-chat":1.0173843633,"humpback-llama-65b":0.9300866723,"tulu-2-dpo-70b-ExPO":1.3591734083,"TempNet-LLaMA2-Chat-7B-v0.1":0.7210775889,"nous-hermes-13b":0.7081240036,"gpt-3.5-turbo-
0613":1.0371186215,"alpaca-7b_concise":0.4437510224,"baichuan-13b-chat":0.4176985079,"claude-3-5-sonnet-20240620":1.4679655404,"gpt-3.5-turbo-1106":0.8904117512,"minichat-3b":0.5041245962,"Storm-7B":1.4728176781,"oasst-sft-pythia-12b":0.3985580883,"Conifer-7B-DPO":0.9870897936,"Snorkel-Mistral-PairRM-DPO":1.3328273013,"internlm2-chat-20b-ExPO":1.4638315246,"Samba-CoE-v0.2":1.2171089783,"gemini-pro":1.1588503791,"pairrm-tulu-2-70b":1.19249667,"text_davinci_003":0.4346747594,"gpt4":1.2757042012,"Yi-34B-Chat":1.3225712598,"Starling-LM-7B-beta-ExPO":1.3252049543,"pairrm-Yi-34B-Chat":1.3482437399,"gpt4_1106_preview":0.0,"evo-7b":1.0835570389,"zephyr-7b-beta":0.9617876718,"guanaco-13b":0.5518606726,"alpaca-7b":0.4870855383,"internlm2-chat-20b-ppo":1.244366241,"gemma-2b-it":0.538998125,"pairrm-zephyr-7b-beta":1.0535874942,"evo-v2-7b":1.2159901798,"causallm-14b":0.9544127301,"SPPO-Mistral7B-PairRM":1.390800011,"gpt-3.5-turbo-1106_concise":0.8374438114,"openbuddy-llama-65b-v8":0.8871992619,"claude2-alpaca-13b":0.8249428868,"Starling-LM-7B-alpha-ExPO":1.2498324796,"openbuddy-falcon-7b-v6":0.5655836443,"gemma-7b-it":0.7869665732,"phi-2-sft":0.6098271417,"gpt4_gamed":0.6278799634,"llama-2-70b-chat-hf":1.0799847727,"openbuddy-llama2-70b-v10.1":0.8498371494,"wizardlm-70b":1.0395048913,"ultralm-13b-best-of-16":0.9418434059},"n_wins":{"gpt-3.5-turbo-0301":71.0,"gpt-3.5-turbo-1106_verbose":94.0,"vicuna-13b-v1.5-togetherai":53.0,"Qwen1.5-1.8B-Chat":27.0,"recycled-wizardlm-7b-v1.0":53.0,"aligner-2b_claude-3-opus-20240229":225.0,"Qwen1.5-110B-Chat":255.0,"claude-3-opus-20240229":223.0,"llama-2-7b-chat-hf":38.0,"mistral-medium":164.0,"vicuna-33b-v1.3":90.0,"cohere":96.0,"claude-2":131.0,"guanaco-65b":54.0,"Mixtral-8x7B-Instruct-v0.1":135.0,"openchat-v2-w-13b":67.0,"falcon-7b-instruct":16.0,"wizardlm-13b-v1.1":79.0,"Meta-Llama-3-8B-Instruct":176.0,"FsfairX-Zephyr-Chat-v0.1":285.0,"Infinity-Instruct-3M-0613-Mistral-7B":118.0,"Qwen1.5-72B-Chat":201.0,"xwinlm-7b-v0.1":77.0,"Mixtral-8x22B-In
struct-v0.1":174.0,"vicuna-13b-v1.5":48.0,"dbrx-instruct":147.0,"zephyr-7b-alpha":59.0,"tulu-2-dpo-13b":75.0,"Qwen1.5-7B-Chat":80.0,"Together-MoA-Lite":456.0,"cut-13b":83.0,"Meta-Llama-3-70B-Instruct":266.0,"vicuna-13b-v1.3":50.0,"claude-instant-1.2":120.0,"airoboros-65b":67.0,"openbuddy-llama2-13b-v11.1":42.0,"phi-2":15.0,"Together-MoA":490.0,"mistral-large-2402":166.0,"openbuddy-llama-30b-v7.1":47.0,"TempNet-LLaMA2-Chat-70B-v0.1":111.0,"pairrm-tulu-2-13b":110.0,"recycled-wizardlm-7b-v2.0":50.0,"Storm-7B-best-of-64":519.0,"vicuna-7b":28.0,"claude-3-sonnet-20240229":193.0,"Mistral-7B-Instruct-v0.2":113.0,"Samba-CoE-v0.1":124.0,"claude":129.0,"Nanbeige2-8B-Chat":323.0,"REBEL-Llama-3-8B-Instruct":268.0,"chatglm2-6b":19.0,"gpt-4o-2024-05-13":429.0,"gpt4_1106_preview_verbose":525.0,"TempNet-LLaMA2-Chat-13B-v0.1":56.0,"text_davinci_001":23.0,"Mixtral-8x7B-Instruct-v0.1_verbose":194.0,"baize-v2-7b":26.0,"phi-2-dpo":57.0,"alpaca-farm-ppo-human":32.0,"Nanbeige2-16B-Chat":288.0,"gpt4_0613":117.0,"pythia-12b-mix-sft":19.0,"alpaca-7b-neft":22.0,"Qwen1.5-14B-Chat":137.0,"gpt-4-0125-preview":446.0,"guanaco-33b":37.0,"oasst-sft-llama-33b":36.0,"gpt4_0613_verbose":171.0,"llama-2-chat-7b-evol70k-neft":57.0,"gpt35_turbo_instruct":66.0,"platolm-7b":42.0,"llama-2-13b-chat-hf":60.0,"Nanbeige-Plus-Chat-v0.1":456.0,"openchat-v2-13b":56.0,"mistral-orpo-beta":95.0,"Snorkel-Mistral-PairRM-DPO-best-of-16":270.0,"tulu-2-dpo-7b":64.0,"alpaca-7b_verbose":22.0,"OpenHermes-2.5-Mistral-7B":75.0,"claude-2.1_verbose":191.0,"ultralm-13b-v2.0":51.0,"deita-7b-v1.0":96.0,"minichat-1.5-3b":48.0,"Qwen-14B-Chat":57.0,"airoboros-33b":64.0,"alpaca-farm-ppo-sim-gpt4-20k":26.0,"ultralm-13b":38.0,"openbuddy-falcon-40b-v9":45.0,"openchat8192-13b":51.0,"wizardlm-13b":42.0,"vicuna-13b":44.0,"merlinite-7B-AOT":234.0,"gpt4_0314":172.0,"gpt4_0613_concise":71.0,"jina-chat":59.0,"Contextual-KTO-Mistral-PairRM":260.0,"xwinlm-13b-v0.1":129.0,"LMCocktail-10.7B-v1":104.0,"SPPO-Mistral7B-PairRM-ExPO":274.0,"Mixtral-8x7B-Ins
truct-v0.1_concise":105.0,"gpt4_1106_preview_concise":172.0,"Mistral-7B-ReMax-v0.1":120.0,"Llama-3-Instruct-8B-SimPO-ExPO":325.0,"dolphin-2.2.1-mistral-7b":68.0,"humpback-llama2-70b":77.0,"openpipe-moa-gpt-4-turbo-v1":515.0,"vicuna-7b-v1.5":35.0,"Starling-LM-7B-alpha":102.0,"falcon-40b-instruct":27.0,"Samba-CoE-v0.2-best-of-16":201.0,"opencoderplus-15b":52.0,"xwinlm-70b-v0.1":166.0,"wizardlm-13b-v1.2":82.0,"aligner-2b_qwen1.5-72b-chat":180.0,"internlm2-chat-7b-ExPO":209.0,"claude-2.1":115.0,"vicuna-7b-v1.3":31.0,"oasst-rlhf-llama-33b":44.0,"zephyr-7b-alpha-ExPO":79.0,"openchat-v3.1-13b":80.0,"SPPO-Llama-3-Instruct-8B-PairRM":310.0,"minotaur-13b":42.0,"tulu-2-dpo-13b-ExPO":121.0,"zephyr-7b-beta-ExPO":89.0,"tulu-2-dpo-7b-ExPO":91.0,"Llama-3-Instruct-8B-SimPO":319.0,"baize-v2-13b":32.0,"guanaco-7b":21.0,"ultralm-13b-v2.0-best-of-16":98.0,"claude-2.1_concise":72.0,"openchat-13b":58.0,"tulu-2-dpo-70b":119.0,"deepseek-llm-67b-chat":90.0,"humpback-llama-65b":70.0,"tulu-2-dpo-70b-ExPO":184.0,"TempNet-LLaMA2-Chat-7B-v0.1":39.0,"nous-hermes-13b":43.0,"gpt-3.5-turbo-0613":99.0,"alpaca-7b_concise":15.0,"baichuan-13b-chat":14.0,"claude-3-5-sonnet-20240620":312.0,"gpt-3.5-turbo-1106":64.0,"minichat-3b":22.0,"Storm-7B":397.0,"oasst-sft-pythia-12b":13.0,"Conifer-7B-DPO":87.0,"Snorkel-Mistral-PairRM-DPO":231.0,"internlm2-chat-20b-ExPO":375.0,"Samba-CoE-v0.2":159.0,"gemini-pro":135.0,"pairrm-tulu-2-70b":140.0,"text_davinci_003":14.0,"gpt4":179.0,"Yi-34B-Chat":219.0,"Starling-LM-7B-beta-ExPO":225.0,"pairrm-Yi-34B-Chat":239.0,"gpt4_1106_preview":0.0,"evo-7b":112.0,"zephyr-7b-beta":78.0,"guanaco-13b":22.0,"alpaca-7b":17.0,"internlm2-chat-20b-ppo":170.0,"gemma-2b-it":23.0,"pairrm-zephyr-7b-beta":98.0,"evo-v2-7b":158.0,"causallm-14b":81.0,"SPPO-Mistral7B-PairRM":249.0,"gpt-3.5-turbo-1106_concise":57.0,"openbuddy-llama-65b-v8":64.0,"claude2-alpaca-13b":59.0,"Starling-LM-7B-alpha-ExPO":148.0,"openbuddy-falcon-7b-v6":27.0,"gemma-7b-it":50.0,"phi-2-sft":28.0,"gpt4_gamed":32.0,"llama-2-70b-cha
t-hf":104.0,"openbuddy-llama2-70b-v10.1":57.0,"wizardlm-70b":106.0,"ultralm-13b-best-of-16":80.0},"n_wins_base":{"gpt-3.5-turbo-0301":733.0,"gpt-3.5-turbo-1106_verbose":709.0,"vicuna-13b-v1.5-togetherai":747.0,"Qwen1.5-1.8B-Chat":774.0,"recycled-wizardlm-7b-v1.0":752.0,"aligner-2b_claude-3-opus-20240229":475.0,"Qwen1.5-110B-Chat":545.0,"claude-3-opus-20240229":579.0,"llama-2-7b-chat-hf":766.0,"mistral-medium":639.0,"vicuna-33b-v1.3":711.0,"cohere":709.0,"claude-2":673.0,"guanaco-65b":751.0,"Mixtral-8x7B-Instruct-v0.1":668.0,"openchat-v2-w-13b":736.0,"falcon-7b-instruct":787.0,"wizardlm-13b-v1.1":723.0,"Meta-Llama-3-8B-Instruct":626.0,"FsfairX-Zephyr-Chat-v0.1":517.0,"Infinity-Instruct-3M-0613-Mistral-7B":687.0,"Qwen1.5-72B-Chat":600.0,"xwinlm-7b-v0.1":727.0,"Mixtral-8x22B-Instruct-v0.1":628.0,"vicuna-13b-v1.5":753.0,"dbrx-instruct":657.0,"zephyr-7b-alpha":745.0,"tulu-2-dpo-13b":728.0,"Qwen1.5-7B-Chat":721.0,"Together-MoA-Lite":347.0,"cut-13b":721.0,"Meta-Llama-3-70B-Instruct":537.0,"vicuna-13b-v1.3":751.0,"claude-instant-1.2":682.0,"airoboros-65b":735.0,"openbuddy-llama2-13b-v11.1":761.0,"phi-2":785.0,"Together-MoA":314.0,"mistral-large-2402":638.0,"openbuddy-llama-30b-v7.1":755.0,"TempNet-LLaMA2-Chat-70B-v0.1":691.0,"pairrm-tulu-2-13b":694.0,"recycled-wizardlm-7b-v2.0":755.0,"Storm-7B-best-of-64":286.0,"vicuna-7b":775.0,"claude-3-sonnet-20240229":608.0,"Mistral-7B-Instruct-v0.2":691.0,"Samba-CoE-v0.1":680.0,"claude":676.0,"Nanbeige2-8B-Chat":480.0,"REBEL-Llama-3-8B-Instruct":537.0,"chatglm2-6b":781.0,"gpt-4o-2024-05-13":369.0,"gpt4_1106_preview_verbose":268.0,"TempNet-LLaMA2-Chat-13B-v0.1":749.0,"text_davinci_001":777.0,"Mixtral-8x7B-Instruct-v0.1_verbose":609.0,"baize-v2-7b":779.0,"phi-2-dpo":748.0,"alpaca-farm-ppo-human":770.0,"Nanbeige2-16B-Chat":514.0,"gpt4_0613":684.0,"pythia-12b-mix-sft":786.0,"alpaca-7b-neft":783.0,"Qwen1.5-14B-Chat":664.0,"gpt-4-0125-preview":347.0,"guanaco-33b":768.0,"oasst-sft-llama-33b":764.0,"gpt4_0613_verbose":630.0,"llama-2-chat-7b-ev
ol70k-neft":748.0,"gpt35_turbo_instruct":735.0,"platolm-7b":759.0,"llama-2-13b-chat-hf":744.0,"Nanbeige-Plus-Chat-v0.1":347.0,"openchat-v2-13b":746.0,"mistral-orpo-beta":707.0,"Snorkel-Mistral-PairRM-DPO-best-of-16":533.0,"tulu-2-dpo-7b":740.0,"alpaca-7b_verbose":778.0,"OpenHermes-2.5-Mistral-7B":727.0,"claude-2.1_verbose":613.0,"ultralm-13b-v2.0":754.0,"deita-7b-v1.0":708.0,"minichat-1.5-3b":757.0,"Qwen-14B-Chat":742.0,"airoboros-33b":740.0,"alpaca-farm-ppo-sim-gpt4-20k":776.0,"ultralm-13b":765.0,"openbuddy-falcon-40b-v9":758.0,"openchat8192-13b":754.0,"wizardlm-13b":759.0,"vicuna-13b":759.0,"merlinite-7B-AOT":571.0,"gpt4_0314":627.0,"gpt4_0613_concise":729.0,"jina-chat":743.0,"Contextual-KTO-Mistral-PairRM":544.0,"xwinlm-13b-v0.1":672.0,"LMCocktail-10.7B-v1":700.0,"SPPO-Mistral7B-PairRM-ExPO":531.0,"Mixtral-8x7B-Instruct-v0.1_concise":700.0,"gpt4_1106_preview_concise":622.0,"Mistral-7B-ReMax-v0.1":683.0,"Llama-3-Instruct-8B-SimPO-ExPO":479.0,"dolphin-2.2.1-mistral-7b":734.0,"humpback-llama2-70b":727.0,"openpipe-moa-gpt-4-turbo-v1":283.0,"vicuna-7b-v1.5":767.0,"Starling-LM-7B-alpha":702.0,"falcon-40b-instruct":777.0,"Samba-CoE-v0.2-best-of-16":601.0,"opencoderplus-15b":750.0,"xwinlm-70b-v0.1":635.0,"wizardlm-13b-v1.2":720.0,"aligner-2b_qwen1.5-72b-chat":473.0,"internlm2-chat-7b-ExPO":595.0,"claude-2.1":688.0,"vicuna-7b-v1.3":771.0,"oasst-rlhf-llama-33b":759.0,"zephyr-7b-alpha-ExPO":725.0,"openchat-v3.1-13b":720.0,"SPPO-Llama-3-Instruct-8B-PairRM":494.0,"minotaur-13b":758.0,"tulu-2-dpo-13b-ExPO":679.0,"zephyr-7b-beta-ExPO":716.0,"tulu-2-dpo-7b-ExPO":714.0,"Llama-3-Instruct-8B-SimPO":485.0,"baize-v2-13b":770.0,"guanaco-7b":783.0,"ultralm-13b-v2.0-best-of-16":705.0,"claude-2.1_concise":730.0,"openchat-13b":746.0,"tulu-2-dpo-70b":683.0,"deepseek-llm-67b-chat":713.0,"humpback-llama-65b":734.0,"tulu-2-dpo-70b-ExPO":620.0,"TempNet-LLaMA2-Chat-7B-v0.1":765.0,"nous-hermes-13b":761.0,"gpt-3.5-turbo-0613":700.0,"alpaca-7b_concise":787.0,"baichuan-13b-chat":790.0,"claude-3-5-s
onnet-20240620":493.0,"gpt-3.5-turbo-1106":737.0,"minichat-3b":779.0,"Storm-7B":408.0,"oasst-sft-pythia-12b":790.0,"Conifer-7B-DPO":717.0,"Snorkel-Mistral-PairRM-DPO":572.0,"internlm2-chat-20b-ExPO":430.0,"Samba-CoE-v0.2":645.0,"gemini-pro":665.0,"pairrm-tulu-2-70b":665.0,"text_davinci_003":787.0,"gpt4":618.0,"Yi-34B-Chat":582.0,"Starling-LM-7B-beta-ExPO":580.0,"pairrm-Yi-34B-Chat":563.0,"gpt4_1106_preview":0.0,"evo-7b":689.0,"zephyr-7b-beta":725.0,"guanaco-13b":780.0,"alpaca-7b":785.0,"internlm2-chat-20b-ppo":632.0,"gemma-2b-it":782.0,"pairrm-zephyr-7b-beta":706.0,"evo-v2-7b":644.0,"causallm-14b":720.0,"SPPO-Mistral7B-PairRM":556.0,"gpt-3.5-turbo-1106_concise":744.0,"openbuddy-llama-65b-v8":738.0,"claude2-alpaca-13b":746.0,"Starling-LM-7B-alpha-ExPO":657.0,"openbuddy-falcon-7b-v6":778.0,"gemma-7b-it":754.0,"phi-2-sft":777.0,"gpt4_gamed":771.0,"llama-2-70b-chat-hf":700.0,"openbuddy-llama2-70b-v10.1":744.0,"wizardlm-70b":697.0,"ultralm-13b-best-of-16":723.0},"n_draws":{"gpt-3.5-turbo-0301":1.0,"gpt-3.5-turbo-1106_verbose":2.0,"vicuna-13b-v1.5-togetherai":5.0,"Qwen1.5-1.8B-Chat":3.0,"recycled-wizardlm-7b-v1.0":0.0,"aligner-2b_claude-3-opus-20240229":105.0,"Qwen1.5-110B-Chat":5.0,"claude-3-opus-20240229":3.0,"llama-2-7b-chat-hf":1.0,"mistral-medium":2.0,"vicuna-33b-v1.3":4.0,"cohere":0.0,"claude-2":1.0,"guanaco-65b":0.0,"Mixtral-8x7B-Instruct-v0.1":2.0,"openchat-v2-w-13b":2.0,"falcon-7b-instruct":2.0,"wizardlm-13b-v1.1":3.0,"Meta-Llama-3-8B-Instruct":3.0,"FsfairX-Zephyr-Chat-v0.1":3.0,"Infinity-Instruct-3M-0613-Mistral-7B":0.0,"Qwen1.5-72B-Chat":4.0,"xwinlm-7b-v0.1":1.0,"Mixtral-8x22B-Instruct-v0.1":3.0,"vicuna-13b-v1.5":4.0,"dbrx-instruct":1.0,"zephyr-7b-alpha":1.0,"tulu-2-dpo-13b":2.0,"Qwen1.5-7B-Chat":4.0,"Together-MoA-Lite":2.0,"cut-13b":1.0,"Meta-Llama-3-70B-Instruct":2.0,"vicuna-13b-v1.3":4.0,"claude-instant-1.2":3.0,"airoboros-65b":3.0,"openbuddy-llama2-13b-v11.1":2.0,"phi-2":3.0,"Together-MoA":1.0,"mistral-large-2402":1.0,"openbuddy-llama-30b-v7.1":3.0,"TempNet
-LLaMA2-Chat-70B-v0.1":2.0,"pairrm-tulu-2-13b":1.0,"recycled-wizardlm-7b-v2.0":0.0,"Storm-7B-best-of-64":0.0,"vicuna-7b":2.0,"claude-3-sonnet-20240229":4.0,"Mistral-7B-Instruct-v0.2":1.0,"Samba-CoE-v0.1":1.0,"claude":0.0,"Nanbeige2-8B-Chat":2.0,"REBEL-Llama-3-8B-Instruct":0.0,"chatglm2-6b":5.0,"gpt-4o-2024-05-13":7.0,"gpt4_1106_preview_verbose":12.0,"TempNet-LLaMA2-Chat-13B-v0.1":0.0,"text_davinci_001":3.0,"Mixtral-8x7B-Instruct-v0.1_verbose":2.0,"baize-v2-7b":0.0,"phi-2-dpo":0.0,"alpaca-farm-ppo-human":3.0,"Nanbeige2-16B-Chat":3.0,"gpt4_0613":4.0,"pythia-12b-mix-sft":0.0,"alpaca-7b-neft":0.0,"Qwen1.5-14B-Chat":4.0,"gpt-4-0125-preview":12.0,"guanaco-33b":0.0,"oasst-sft-llama-33b":5.0,"gpt4_0613_verbose":4.0,"llama-2-chat-7b-evol70k-neft":0.0,"gpt35_turbo_instruct":3.0,"platolm-7b":2.0,"llama-2-13b-chat-hf":1.0,"Nanbeige-Plus-Chat-v0.1":2.0,"openchat-v2-13b":3.0,"mistral-orpo-beta":3.0,"Snorkel-Mistral-PairRM-DPO-best-of-16":2.0,"tulu-2-dpo-7b":1.0,"alpaca-7b_verbose":2.0,"OpenHermes-2.5-Mistral-7B":3.0,"claude-2.1_verbose":1.0,"ultralm-13b-v2.0":0.0,"deita-7b-v1.0":1.0,"minichat-1.5-3b":0.0,"Qwen-14B-Chat":6.0,"airoboros-33b":1.0,"alpaca-farm-ppo-sim-gpt4-20k":3.0,"ultralm-13b":2.0,"openbuddy-falcon-40b-v9":2.0,"openchat8192-13b":0.0,"wizardlm-13b":4.0,"vicuna-13b":2.0,"merlinite-7B-AOT":0.0,"gpt4_0314":6.0,"gpt4_0613_concise":5.0,"jina-chat":3.0,"Contextual-KTO-Mistral-PairRM":1.0,"xwinlm-13b-v0.1":4.0,"LMCocktail-10.7B-v1":1.0,"SPPO-Mistral7B-PairRM-ExPO":0.0,"Mixtral-8x7B-Instruct-v0.1_concise":0.0,"gpt4_1106_preview_concise":11.0,"Mistral-7B-ReMax-v0.1":2.0,"Llama-3-Instruct-8B-SimPO-ExPO":1.0,"dolphin-2.2.1-mistral-7b":3.0,"humpback-llama2-70b":1.0,"openpipe-moa-gpt-4-turbo-v1":7.0,"vicuna-7b-v1.5":3.0,"Starling-LM-7B-alpha":1.0,"falcon-40b-instruct":1.0,"Samba-CoE-v0.2-best-of-16":3.0,"opencoderplus-15b":3.0,"xwinlm-70b-v0.1":4.0,"wizardlm-13b-v1.2":3.0,"aligner-2b_qwen1.5-72b-chat":152.0,"internlm2-chat-7b-ExPO":1.0,"claude-2.1":2.0,"vicuna-7b-v1.3":3.0,"oass
t-rlhf-llama-33b":2.0,"zephyr-7b-alpha-ExPO":1.0,"openchat-v3.1-13b":5.0,"SPPO-Llama-3-Instruct-8B-PairRM":1.0,"minotaur-13b":4.0,"tulu-2-dpo-13b-ExPO":5.0,"zephyr-7b-beta-ExPO":0.0,"tulu-2-dpo-7b-ExPO":0.0,"Llama-3-Instruct-8B-SimPO":1.0,"baize-v2-13b":3.0,"guanaco-7b":1.0,"ultralm-13b-v2.0-best-of-16":2.0,"claude-2.1_concise":3.0,"openchat-13b":1.0,"tulu-2-dpo-70b":3.0,"deepseek-llm-67b-chat":2.0,"humpback-llama-65b":1.0,"tulu-2-dpo-70b-ExPO":1.0,"TempNet-LLaMA2-Chat-7B-v0.1":1.0,"nous-hermes-13b":1.0,"gpt-3.5-turbo-0613":6.0,"alpaca-7b_concise":2.0,"baichuan-13b-chat":1.0,"claude-3-5-sonnet-20240620":0.0,"gpt-3.5-turbo-1106":4.0,"minichat-3b":4.0,"Storm-7B":0.0,"oasst-sft-pythia-12b":2.0,"Conifer-7B-DPO":1.0,"Snorkel-Mistral-PairRM-DPO":1.0,"internlm2-chat-20b-ExPO":0.0,"Samba-CoE-v0.2":1.0,"gemini-pro":5.0,"pairrm-tulu-2-70b":0.0,"text_davinci_003":4.0,"gpt4":8.0,"Yi-34B-Chat":4.0,"Starling-LM-7B-beta-ExPO":0.0,"pairrm-Yi-34B-Chat":3.0,"gpt4_1106_preview":805.0,"evo-7b":4.0,"zephyr-7b-beta":2.0,"guanaco-13b":3.0,"alpaca-7b":3.0,"internlm2-chat-20b-ppo":3.0,"gemma-2b-it":0.0,"pairrm-zephyr-7b-beta":1.0,"evo-v2-7b":3.0,"causallm-14b":4.0,"SPPO-Mistral7B-PairRM":0.0,"gpt-3.5-turbo-1106_concise":4.0,"openbuddy-llama-65b-v8":3.0,"claude2-alpaca-13b":0.0,"Starling-LM-7B-alpha-ExPO":0.0,"openbuddy-falcon-7b-v6":0.0,"gemma-7b-it":1.0,"phi-2-sft":0.0,"gpt4_gamed":2.0,"llama-2-70b-chat-hf":0.0,"openbuddy-llama2-70b-v10.1":4.0,"wizardlm-70b":2.0,"ultralm-13b-best-of-16":2.0},"n_total":{"gpt-3.5-turbo-0301":805.0,"gpt-3.5-turbo-1106_verbose":805.0,"vicuna-13b-v1.5-togetherai":805.0,"Qwen1.5-1.8B-Chat":804.0,"recycled-wizardlm-7b-v1.0":805.0,"aligner-2b_claude-3-opus-20240229":805.0,"Qwen1.5-110B-Chat":805.0,"claude-3-opus-20240229":805.0,"llama-2-7b-chat-hf":805.0,"mistral-medium":805.0,"vicuna-33b-v1.3":805.0,"cohere":805.0,"claude-2":805.0,"guanaco-65b":805.0,"Mixtral-8x7B-Instruct-v0.1":805.0,"openchat-v2-w-13b":805.0,"falcon-7b-instruct":805.0,"wizardlm-13b-v1.1":805.0,
"Meta-Llama-3-8B-Instruct":805.0,"FsfairX-Zephyr-Chat-v0.1":805.0,"Infinity-Instruct-3M-0613-Mistral-7B":805.0,"Qwen1.5-72B-Chat":805.0,"xwinlm-7b-v0.1":805.0,"Mixtral-8x22B-Instruct-v0.1":805.0,"vicuna-13b-v1.5":805.0,"dbrx-instruct":805.0,"zephyr-7b-alpha":805.0,"tulu-2-dpo-13b":805.0,"Qwen1.5-7B-Chat":805.0,"Together-MoA-Lite":805.0,"cut-13b":805.0,"Meta-Llama-3-70B-Instruct":805.0,"vicuna-13b-v1.3":805.0,"claude-instant-1.2":805.0,"airoboros-65b":805.0,"openbuddy-llama2-13b-v11.1":805.0,"phi-2":803.0,"Together-MoA":805.0,"mistral-large-2402":805.0,"openbuddy-llama-30b-v7.1":805.0,"TempNet-LLaMA2-Chat-70B-v0.1":804.0,"pairrm-tulu-2-13b":805.0,"recycled-wizardlm-7b-v2.0":805.0,"Storm-7B-best-of-64":805.0,"vicuna-7b":805.0,"claude-3-sonnet-20240229":805.0,"Mistral-7B-Instruct-v0.2":805.0,"Samba-CoE-v0.1":805.0,"claude":805.0,"Nanbeige2-8B-Chat":805.0,"REBEL-Llama-3-8B-Instruct":805.0,"chatglm2-6b":805.0,"gpt-4o-2024-05-13":805.0,"gpt4_1106_preview_verbose":805.0,"TempNet-LLaMA2-Chat-13B-v0.1":805.0,"text_davinci_001":803.0,"Mixtral-8x7B-Instruct-v0.1_verbose":805.0,"baize-v2-7b":805.0,"phi-2-dpo":805.0,"alpaca-farm-ppo-human":805.0,"Nanbeige2-16B-Chat":805.0,"gpt4_0613":805.0,"pythia-12b-mix-sft":805.0,"alpaca-7b-neft":805.0,"Qwen1.5-14B-Chat":805.0,"gpt-4-0125-preview":805.0,"guanaco-33b":805.0,"oasst-sft-llama-33b":805.0,"gpt4_0613_verbose":805.0,"llama-2-chat-7b-evol70k-neft":805.0,"gpt35_turbo_instruct":804.0,"platolm-7b":803.0,"llama-2-13b-chat-hf":805.0,"Nanbeige-Plus-Chat-v0.1":805.0,"openchat-v2-13b":805.0,"mistral-orpo-beta":805.0,"Snorkel-Mistral-PairRM-DPO-best-of-16":805.0,"tulu-2-dpo-7b":805.0,"alpaca-7b_verbose":802.0,"OpenHermes-2.5-Mistral-7B":805.0,"claude-2.1_verbose":805.0,"ultralm-13b-v2.0":805.0,"deita-7b-v1.0":805.0,"minichat-1.5-3b":805.0,"Qwen-14B-Chat":805.0,"airoboros-33b":805.0,"alpaca-farm-ppo-sim-gpt4-20k":805.0,"ultralm-13b":805.0,"openbuddy-falcon-40b-v9":805.0,"openchat8192-13b":805.0,"wizardlm-13b":805.0,"vicuna-13b":805.0,"merlinit
e-7B-AOT":805.0,"gpt4_0314":805.0,"gpt4_0613_concise":805.0,"jina-chat":805.0,"Contextual-KTO-Mistral-PairRM":805.0,"xwinlm-13b-v0.1":805.0,"LMCocktail-10.7B-v1":805.0,"SPPO-Mistral7B-PairRM-ExPO":805.0,"Mixtral-8x7B-Instruct-v0.1_concise":805.0,"gpt4_1106_preview_concise":805.0,"Mistral-7B-ReMax-v0.1":805.0,"Llama-3-Instruct-8B-SimPO-ExPO":805.0,"dolphin-2.2.1-mistral-7b":805.0,"humpback-llama2-70b":805.0,"openpipe-moa-gpt-4-turbo-v1":805.0,"vicuna-7b-v1.5":805.0,"Starling-LM-7B-alpha":805.0,"falcon-40b-instruct":805.0,"Samba-CoE-v0.2-best-of-16":805.0,"opencoderplus-15b":805.0,"xwinlm-70b-v0.1":805.0,"wizardlm-13b-v1.2":805.0,"aligner-2b_qwen1.5-72b-chat":805.0,"internlm2-chat-7b-ExPO":805.0,"claude-2.1":805.0,"vicuna-7b-v1.3":805.0,"oasst-rlhf-llama-33b":805.0,"zephyr-7b-alpha-ExPO":805.0,"openchat-v3.1-13b":805.0,"SPPO-Llama-3-Instruct-8B-PairRM":805.0,"minotaur-13b":804.0,"tulu-2-dpo-13b-ExPO":805.0,"zephyr-7b-beta-ExPO":805.0,"tulu-2-dpo-7b-ExPO":805.0,"Llama-3-Instruct-8B-SimPO":805.0,"baize-v2-13b":805.0,"guanaco-7b":805.0,"ultralm-13b-v2.0-best-of-16":805.0,"claude-2.1_concise":805.0,"openchat-13b":805.0,"tulu-2-dpo-70b":805.0,"deepseek-llm-67b-chat":805.0,"humpback-llama-65b":805.0,"tulu-2-dpo-70b-ExPO":805.0,"TempNet-LLaMA2-Chat-7B-v0.1":805.0,"nous-hermes-13b":805.0,"gpt-3.5-turbo-0613":805.0,"alpaca-7b_concise":804.0,"baichuan-13b-chat":805.0,"claude-3-5-sonnet-20240620":805.0,"gpt-3.5-turbo-1106":805.0,"minichat-3b":805.0,"Storm-7B":805.0,"oasst-sft-pythia-12b":805.0,"Conifer-7B-DPO":805.0,"Snorkel-Mistral-PairRM-DPO":804.0,"internlm2-chat-20b-ExPO":805.0,"Samba-CoE-v0.2":805.0,"gemini-pro":805.0,"pairrm-tulu-2-70b":805.0,"text_davinci_003":805.0,"gpt4":805.0,"Yi-34B-Chat":805.0,"Starling-LM-7B-beta-ExPO":805.0,"pairrm-Yi-34B-Chat":805.0,"gpt4_1106_preview":805.0,"evo-7b":805.0,"zephyr-7b-beta":805.0,"guanaco-13b":805.0,"alpaca-7b":805.0,"internlm2-chat-20b-ppo":805.0,"gemma-2b-it":805.0,"pairrm-zephyr-7b-beta":805.0,"evo-v2-7b":805.0,"causallm-14b":80
5.0,"SPPO-Mistral7B-PairRM":805.0,"gpt-3.5-turbo-1106_concise":805.0,"openbuddy-llama-65b-v8":805.0,"claude2-alpaca-13b":805.0,"Starling-LM-7B-alpha-ExPO":805.0,"openbuddy-falcon-7b-v6":805.0,"gemma-7b-it":805.0,"phi-2-sft":805.0,"gpt4_gamed":805.0,"llama-2-70b-chat-hf":804.0,"openbuddy-llama2-70b-v10.1":805.0,"wizardlm-70b":805.0,"ultralm-13b-best-of-16":805.0},"discrete_win_rate":{"gpt-3.5-turbo-0301":8.8819875776,"gpt-3.5-turbo-1106_verbose":11.801242236,"vicuna-13b-v1.5-togetherai":6.8944099379,"Qwen1.5-1.8B-Chat":3.5447761194,"recycled-wizardlm-7b-v1.0":6.5838509317,"aligner-2b_claude-3-opus-20240229":34.4720496894,"Qwen1.5-110B-Chat":31.9875776398,"claude-3-opus-20240229":27.8881987578,"llama-2-7b-chat-hf":4.7826086957,"mistral-medium":20.4968944099,"vicuna-33b-v1.3":11.4285714286,"cohere":11.9254658385,"claude-2":16.3354037267,"guanaco-65b":6.7080745342,"Mixtral-8x7B-Instruct-v0.1":16.8944099379,"openchat-v2-w-13b":8.4472049689,"falcon-7b-instruct":2.1118012422,"wizardlm-13b-v1.1":10.0,"Meta-Llama-3-8B-Instruct":22.049689441,"FsfairX-Zephyr-Chat-v0.1":35.5900621118,"Infinity-Instruct-3M-0613-Mistral-7B":14.6583850932,"Qwen1.5-72B-Chat":25.2173913043,"xwinlm-7b-v0.1":9.6273291925,"Mixtral-8x22B-Instruct-v0.1":21.801242236,"vicuna-13b-v1.5":6.2111801242,"dbrx-instruct":18.3229813665,"zephyr-7b-alpha":7.3913043478,"tulu-2-dpo-13b":9.4409937888,"Qwen1.5-7B-Chat":10.1863354037,"Together-MoA-Lite":56.7701863354,"cut-13b":10.3726708075,"Meta-Llama-3-70B-Instruct":33.1677018634,"vicuna-13b-v1.3":6.4596273292,"claude-instant-1.2":15.0931677019,"airoboros-65b":8.5093167702,"openbuddy-llama2-13b-v11.1":5.3416149068,"phi-2":2.0547945205,"Together-MoA":60.9316770186,"mistral-large-2402":20.6832298137,"openbuddy-llama-30b-v7.1":6.0248447205,"TempNet-LLaMA2-Chat-70B-v0.1":13.9303482587,"pairrm-tulu-2-13b":13.7267080745,"recycled-wizardlm-7b-v2.0":6.2111801242,"Storm-7B-best-of-64":64.4720496894,"vicuna-7b":3.602484472,"claude-3-sonnet-20240229":24.2236024845,"Mistral-7B-Ins
truct-v0.2":14.099378882,"Samba-CoE-v0.1":15.4658385093,"claude":16.0248447205,"Nanbeige2-8B-Chat":40.248447205,"REBEL-Llama-3-8B-Instruct":33.2919254658,"chatglm2-6b":2.6708074534,"gpt-4o-2024-05-13":53.7267080745,"gpt4_1106_preview_verbose":65.9627329193,"TempNet-LLaMA2-Chat-13B-v0.1":6.9565217391,"text_davinci_001":3.0510585305,"Mixtral-8x7B-Instruct-v0.1_verbose":24.2236024845,"baize-v2-7b":3.2298136646,"phi-2-dpo":7.0807453416,"alpaca-farm-ppo-human":4.1614906832,"Nanbeige2-16B-Chat":35.9627329193,"gpt4_0613":14.7826086957,"pythia-12b-mix-sft":2.3602484472,"alpaca-7b-neft":2.7329192547,"Qwen1.5-14B-Chat":17.2670807453,"gpt-4-0125-preview":56.149068323,"guanaco-33b":4.5962732919,"oasst-sft-llama-33b":4.7826086957,"gpt4_0613_verbose":21.4906832298,"llama-2-chat-7b-evol70k-neft":7.0807453416,"gpt35_turbo_instruct":8.3955223881,"platolm-7b":5.3549190535,"llama-2-13b-chat-hf":7.5155279503,"Nanbeige-Plus-Chat-v0.1":56.7701863354,"openchat-v2-13b":7.1428571429,"mistral-orpo-beta":11.9875776398,"Snorkel-Mistral-PairRM-DPO-best-of-16":33.6645962733,"tulu-2-dpo-7b":8.0124223602,"alpaca-7b_verbose":2.8678304239,"OpenHermes-2.5-Mistral-7B":9.5031055901,"claude-2.1_verbose":23.7888198758,"ultralm-13b-v2.0":6.3354037267,"deita-7b-v1.0":11.9875776398,"minichat-1.5-3b":5.9627329193,"Qwen-14B-Chat":7.4534161491,"airoboros-33b":8.0124223602,"alpaca-farm-ppo-sim-gpt4-20k":3.4161490683,"ultralm-13b":4.8447204969,"openbuddy-falcon-40b-v9":5.7142857143,"openchat8192-13b":6.3354037267,"wizardlm-13b":5.4658385093,"vicuna-13b":5.5900621118,"merlinite-7B-AOT":29.0683229814,"gpt4_0314":21.7391304348,"gpt4_0613_concise":9.1304347826,"jina-chat":7.5155279503,"Contextual-KTO-Mistral-PairRM":32.3602484472,"xwinlm-13b-v0.1":16.2732919255,"LMCocktail-10.7B-v1":12.9813664596,"SPPO-Mistral7B-PairRM-ExPO":34.0372670807,"Mixtral-8x7B-Instruct-v0.1_concise":13.0434782609,"gpt4_1106_preview_concise":22.049689441,"Mistral-7B-ReMax-v0.1":15.0310559006,"Llama-3-Instruct-8B-SimPO-ExPO":40.4347826087,"do
lphin-2.2.1-mistral-7b":8.6335403727,"humpback-llama2-70b":9.6273291925,"openpipe-moa-gpt-4-turbo-v1":64.4099378882,"vicuna-7b-v1.5":4.5341614907,"Starling-LM-7B-alpha":12.7329192547,"falcon-40b-instruct":3.4161490683,"Samba-CoE-v0.2-best-of-16":25.1552795031,"opencoderplus-15b":6.6459627329,"xwinlm-70b-v0.1":20.8695652174,"wizardlm-13b-v1.2":10.3726708075,"aligner-2b_qwen1.5-72b-chat":31.801242236,"internlm2-chat-7b-ExPO":26.0248447205,"claude-2.1":14.4099378882,"vicuna-7b-v1.3":4.0372670807,"oasst-rlhf-llama-33b":5.5900621118,"zephyr-7b-alpha-ExPO":9.8757763975,"openchat-v3.1-13b":10.248447205,"SPPO-Llama-3-Instruct-8B-PairRM":38.5714285714,"minotaur-13b":5.4726368159,"tulu-2-dpo-13b-ExPO":15.3416149068,"zephyr-7b-beta-ExPO":11.0559006211,"tulu-2-dpo-7b-ExPO":11.3043478261,"Llama-3-Instruct-8B-SimPO":39.6894409938,"baize-v2-13b":4.1614906832,"guanaco-7b":2.6708074534,"ultralm-13b-v2.0-best-of-16":12.298136646,"claude-2.1_concise":9.1304347826,"openchat-13b":7.2670807453,"tulu-2-dpo-70b":14.9689440994,"deepseek-llm-67b-chat":11.3043478261,"humpback-llama-65b":8.7577639752,"tulu-2-dpo-70b-ExPO":22.9192546584,"TempNet-LLaMA2-Chat-7B-v0.1":4.9068322981,"nous-hermes-13b":5.4037267081,"gpt-3.5-turbo-0613":12.6708074534,"alpaca-7b_concise":1.9900497512,"baichuan-13b-chat":1.801242236,"claude-3-5-sonnet-20240620":38.7577639752,"gpt-3.5-turbo-1106":8.198757764,"minichat-3b":2.9813664596,"Storm-7B":49.3167701863,"oasst-sft-pythia-12b":1.7391304348,"Conifer-7B-DPO":10.8695652174,"Snorkel-Mistral-PairRM-DPO":28.7935323383,"internlm2-chat-20b-ExPO":46.5838509317,"Samba-CoE-v0.2":19.8136645963,"gemini-pro":17.0807453416,"pairrm-tulu-2-70b":17.3913043478,"text_davinci_003":1.9875776398,"gpt4":22.7329192547,"Yi-34B-Chat":27.4534161491,"Starling-LM-7B-beta-ExPO":27.950310559,"pairrm-Yi-34B-Chat":29.8757763975,"gpt4_1106_preview":50.0,"evo-7b":14.1614906832,"zephyr-7b-beta":9.8136645963,"guanaco-13b":2.9192546584,"alpaca-7b":2.298136646,"internlm2-chat-20b-ppo":21.3043478261,"gemma
-2b-it":2.8571428571,"pairrm-zephyr-7b-beta":12.2360248447,"evo-v2-7b":19.8136645963,"causallm-14b":10.3105590062,"SPPO-Mistral7B-PairRM":30.9316770186,"gpt-3.5-turbo-1106_concise":7.3291925466,"openbuddy-llama-65b-v8":8.1366459627,"claude2-alpaca-13b":7.3291925466,"Starling-LM-7B-alpha-ExPO":18.3850931677,"openbuddy-falcon-7b-v6":3.3540372671,"gemma-7b-it":6.2732919255,"phi-2-sft":3.4782608696,"gpt4_gamed":4.099378882,"llama-2-70b-chat-hf":12.9353233831,"openbuddy-llama2-70b-v10.1":7.3291925466,"wizardlm-70b":13.2919254658,"ultralm-13b-best-of-16":10.0621118012},"length_controlled_winrate":{"gpt-3.5-turbo-0301":18.093241552,"gpt-3.5-turbo-1106_verbose":22.0009370217,"vicuna-13b-v1.5-togetherai":11.6853569655,"Qwen1.5-1.8B-Chat":2.5884988492,"recycled-wizardlm-7b-v1.0":6.901477322,"aligner-2b_claude-3-opus-20240229":41.8230717152,"Qwen1.5-110B-Chat":43.905552211,"claude-3-opus-20240229":40.5095080124,"llama-2-7b-chat-hf":5.3548212795,"mistral-medium":28.6143374017,"vicuna-33b-v1.3":17.5745753109,"cohere":10.8930208866,"claude-2":28.1551961416,"guanaco-65b":8.2529169916,"Mixtral-8x7B-Instruct-v0.1":23.6884826013,"openchat-v2-w-13b":12.030427771,"falcon-7b-instruct":4.0369375668,"wizardlm-13b-v1.1":13.9157205928,"Meta-Llama-3-8B-Instruct":22.9187846731,"FsfairX-Zephyr-Chat-v0.1":34.787447623,"Infinity-Instruct-3M-0613-Mistral-7B":25.5015577947,"Qwen1.5-72B-Chat":36.571754112,"xwinlm-7b-v0.1":10.8122056273,"Mixtral-8x22B-Instruct-v0.1":30.8788102941,"vicuna-13b-v1.5":10.4844382985,"dbrx-instruct":25.1853410397,"zephyr-7b-alpha":10.2897608887,"tulu-2-dpo-13b":11.5544794281,"Qwen1.5-7B-Chat":14.7484310443,"Together-MoA-Lite":59.1415240989,"cut-13b":12.1547817539,"Meta-Llama-3-70B-Instruct":34.4245971745,"vicuna-13b-v1.3":10.8431649437,"claude-instant-1.2":25.6122590254,"airoboros-65b":11.0076424064,"openbuddy-llama2-13b-v11.1":9.159089775,"phi-2":4.3986822709,"Together-MoA":65.3799697685,"mistral-large-2402":32.6520799853,"openbuddy-llama-30b-v7.1":10.2144949912,"TempNet
-LLaMA2-Chat-70B-v0.1":15.8311627784,"pairrm-tulu-2-13b":17.405203698,"recycled-wizardlm-7b-v2.0":7.5216099553,"Storm-7B-best-of-64":61.637895572,"vicuna-7b":6.2772177385,"claude-3-sonnet-20240229":34.8724743624,"Mistral-7B-Instruct-v0.2":17.111251846,"Samba-CoE-v0.1":22.8658373348,"claude":27.2895044437,"Nanbeige2-8B-Chat":25.2417704867,"REBEL-Llama-3-8B-Instruct":31.4699427971,"chatglm2-6b":4.3592829268,"gpt-4o-2024-05-13":57.4568288333,"gpt4_1106_preview_verbose":51.5750079797,"TempNet-LLaMA2-Chat-13B-v0.1":8.5783553109,"text_davinci_001":9.0257288521,"Mixtral-8x7B-Instruct-v0.1_verbose":23.2231207809,"baize-v2-7b":4.382564905,"phi-2-dpo":7.7708946203,"alpaca-farm-ppo-human":6.4186032949,"Nanbeige2-16B-Chat":40.5912863493,"gpt4_0613":30.1833223167,"pythia-12b-mix-sft":4.2213618614,"alpaca-7b-neft":3.5091458375,"Qwen1.5-14B-Chat":23.8966467702,"gpt-4-0125-preview":56.3562938462,"guanaco-33b":5.6900190909,"oasst-sft-llama-33b":9.8664121438,"gpt4_0613_verbose":33.8212668866,"llama-2-chat-7b-evol70k-neft":7.5330526555,"gpt35_turbo_instruct":17.7278010829,"platolm-7b":10.5434020728,"llama-2-13b-chat-hf":8.4360145489,"Nanbeige-Plus-Chat-v0.1":44.4596624034,"openchat-v2-13b":10.3996073385,"mistral-orpo-beta":14.7167494307,"Snorkel-Mistral-PairRM-DPO-best-of-16":29.9743216131,"tulu-2-dpo-7b":9.2002656115,"alpaca-7b_verbose":6.8163068164,"OpenHermes-2.5-Mistral-7B":16.2485776967,"claude-2.1_verbose":30.2911791666,"ultralm-13b-v2.0":9.1290184442,"deita-7b-v1.0":16.0590135397,"minichat-1.5-3b":7.7016328215,"Qwen-14B-Chat":12.3787417907,"airoboros-33b":10.7190026781,"alpaca-farm-ppo-sim-gpt4-20k":7.1218081016,"ultralm-13b":7.1081913613,"openbuddy-falcon-40b-v9":8.9889364779,"openchat8192-13b":7.8970617346,"wizardlm-13b":9.8281507688,"vicuna-13b":9.2220600237,"merlinite-7B-AOT":31.721885287,"gpt4_0314":35.3070612164,"gpt4_0613_concise":21.5779909145,"jina-chat":15.8660040495,"Contextual-KTO-Mistral-PairRM":29.7058089397,"xwinlm-13b-v0.1":17.9189378982,"LMCocktail-10.7B-v1":18
.9507103867,"SPPO-Mistral7B-PairRM-ExPO":31.9003876312,"Mixtral-8x7B-Instruct-v0.1_concise":22.9626094728,"gpt4_1106_preview_concise":41.8966015912,"Mistral-7B-ReMax-v0.1":20.5513677023,"Llama-3-Instruct-8B-SimPO-ExPO":45.807978034,"dolphin-2.2.1-mistral-7b":13.1214776504,"humpback-llama2-70b":16.2491642314,"openpipe-moa-gpt-4-turbo-v1":68.3786625033,"vicuna-7b-v1.5":7.6168927319,"Starling-LM-7B-alpha":14.6904710794,"falcon-40b-instruct":5.6075325447,"Samba-CoE-v0.2-best-of-16":31.5065442681,"opencoderplus-15b":8.1524101557,"xwinlm-70b-v0.1":24.6496860571,"wizardlm-13b-v1.2":14.4625906943,"aligner-2b_qwen1.5-72b-chat":36.7258688784,"internlm2-chat-7b-ExPO":22.6674802488,"claude-2.1":25.2519438861,"vicuna-7b-v1.3":7.1564609564,"oasst-rlhf-llama-33b":7.9709218373,"zephyr-7b-alpha-ExPO":13.6232522647,"openchat-v3.1-13b":14.5033879568,"SPPO-Llama-3-Instruct-8B-PairRM":38.5628066368,"minotaur-13b":11.4652513168,"tulu-2-dpo-13b-ExPO":17.6509979624,"zephyr-7b-beta-ExPO":14.0012119801,"tulu-2-dpo-7b-ExPO":11.6088057579,"Llama-3-Instruct-8B-SimPO":44.6804680926,"baize-v2-13b":7.012247205,"guanaco-7b":2.8711168131,"ultralm-13b-v2.0-best-of-16":14.1989875666,"claude-2.1_concise":18.2084579084,"openchat-13b":8.8060534912,"tulu-2-dpo-70b":21.2386100384,"deepseek-llm-67b-chat":17.8433840899,"humpback-llama-65b":12.7998599959,"tulu-2-dpo-70b-ExPO":25.7233081711,"TempNet-LLaMA2-Chat-7B-v0.1":5.7396138367,"nous-hermes-13b":9.7178634178,"gpt-3.5-turbo-0613":22.3525129805,"alpaca-7b_concise":4.4672516799,"baichuan-13b-chat":2.0621702536,"claude-3-5-sonnet-20240620":52.3667542714,"gpt-3.5-turbo-1106":19.300589035,"minichat-3b":5.7293328759,"Storm-7B":50.4080792281,"oasst-sft-pythia-12b":3.2701021145,"Conifer-7B-DPO":17.1124958828,"Snorkel-Mistral-PairRM-DPO":26.3914464573,"internlm2-chat-20b-ExPO":27.2257594808,"Samba-CoE-v0.2":27.6242673501,"gemini-pro":24.381776108,"pairrm-tulu-2-70b":21.4284039755,"text_davinci_003":4.5664105675,"gpt4":38.1280897444,"Yi-34B-Chat":27.1905478776,"Star
ling-LM-7B-beta-ExPO":26.4869564984,"pairrm-Yi-34B-Chat":28.8148408668,"gpt4_1106_preview":50.0,"evo-7b":16.4893860042,"zephyr-7b-beta":13.2031984931,"guanaco-13b":3.0037873296,"alpaca-7b":5.8754871633,"internlm2-chat-20b-ppo":18.7487394854,"gemma-2b-it":5.4374536204,"pairrm-zephyr-7b-beta":15.529867295,"evo-v2-7b":23.357705702,"causallm-14b":15.720325189,"SPPO-Mistral7B-PairRM":30.4941379652,"gpt-3.5-turbo-1106_concise":15.7695209839,"openbuddy-llama-65b-v8":12.4693562891,"claude2-alpaca-13b":11.4988982132,"Starling-LM-7B-alpha-ExPO":19.4741654606,"openbuddy-falcon-7b-v6":4.8261244822,"gemma-7b-it":10.4257604037,"phi-2-sft":5.8537876906,"gpt4_gamed":12.1887640576,"llama-2-70b-chat-hf":14.6896485884,"openbuddy-llama2-70b-v10.1":12.5721732723,"wizardlm-70b":17.5750607375,"ultralm-13b-best-of-16":9.8760888169},"lc_standard_error":{"gpt-3.5-turbo-0301":0.7864976807,"gpt-3.5-turbo-1106_verbose":0.8544953416,"vicuna-13b-v1.5-togetherai":0.6243797898,"Qwen1.5-1.8B-Chat":0.2021610274,"recycled-wizardlm-7b-v1.0":0.4105893841,"aligner-2b_claude-3-opus-20240229":0.7776876699,"Qwen1.5-110B-Chat":0.8945807936,"claude-3-opus-20240229":0.8837504763,"llama-2-7b-chat-hf":0.3326400931,"mistral-medium":0.9075464438,"vicuna-33b-v1.3":0.7099362877,"cohere":0.5206791146,"claude-2":0.8779084794,"guanaco-65b":0.46281361,"Mixtral-8x7B-Instruct-v0.1":0.9011105015,"openchat-v2-w-13b":0.5657607148,"falcon-7b-instruct":0.268726544,"wizardlm-13b-v1.1":0.6712555976,"Meta-Llama-3-8B-Instruct":0.849800882,"FsfairX-Zephyr-Chat-v0.1":0.7594505141,"Infinity-Instruct-3M-0613-Mistral-7B":0.7760697229,"Qwen1.5-72B-Chat":0.9357421321,"xwinlm-7b-v0.1":0.5519849159,"Mixtral-8x22B-Instruct-v0.1":0.9518125819,"vicuna-13b-v1.5":0.5980193852,"dbrx-instruct":0.8999456518,"zephyr-7b-alpha":0.5879820221,"tulu-2-dpo-13b":0.6494943093,"Qwen1.5-7B-Chat":0.6490365375,"Together-MoA-Lite":0.7580510219,"cut-13b":0.6383138465,"Meta-Llama-3-70B-Instruct":0.8691832384,"vicuna-13b-v1.3":0.6100962742,"claude-instant-1.2":0.8
7464248,"airoboros-65b":0.6004520879,"openbuddy-llama2-13b-v11.1":0.5636847159,"phi-2":0.1293627793,"Together-MoA":0.7392392837,"mistral-large-2402":0.9044632955,"openbuddy-llama-30b-v7.1":0.6099418552,"TempNet-LLaMA2-Chat-70B-v0.1":0.7195404924,"pairrm-tulu-2-13b":0.7958946232,"recycled-wizardlm-7b-v2.0":0.4355543699,"Storm-7B-best-of-64":0.6799412402,"vicuna-7b":0.3964740967,"claude-3-sonnet-20240229":0.949844689,"Mistral-7B-Instruct-v0.2":0.7875592102,"Samba-CoE-v0.1":0.7405123259,"claude":0.858614564,"Nanbeige2-8B-Chat":0.5909370499,"REBEL-Llama-3-8B-Instruct":0.8138922262,"chatglm2-6b":0.2913010016,"gpt-4o-2024-05-13":0.7774399385,"gpt4_1106_preview_verbose":0.8313707608,"TempNet-LLaMA2-Chat-13B-v0.1":0.4783538284,"text_davinci_001":0.2169278281,"Mixtral-8x7B-Instruct-v0.1_verbose":0.7975932103,"baize-v2-7b":0.3307775329,"phi-2-dpo":0.420015191,"alpaca-farm-ppo-human":0.4202234849,"Nanbeige2-16B-Chat":0.8504106275,"gpt4_0613":0.7874508454,"pythia-12b-mix-sft":0.2932467883,"alpaca-7b-neft":0.2516233369,"Qwen1.5-14B-Chat":0.7729838609,"gpt-4-0125-preview":0.7731843456,"guanaco-33b":0.3195322556,"oasst-sft-llama-33b":0.5204539206,"gpt4_0613_verbose":0.8842151461,"llama-2-chat-7b-evol70k-neft":0.4277221418,"gpt35_turbo_instruct":0.3748783811,"platolm-7b":0.3937696385,"llama-2-13b-chat-hf":0.5161956367,"Nanbeige-Plus-Chat-v0.1":0.7209678864,"openchat-v2-13b":0.5398936504,"mistral-orpo-beta":0.6895695724,"Snorkel-Mistral-PairRM-DPO-best-of-16":0.7464891533,"tulu-2-dpo-7b":0.5465634637,"alpaca-7b_verbose":0.2437107339,"OpenHermes-2.5-Mistral-7B":0.7206735233,"claude-2.1_verbose":0.6612722747,"ultralm-13b-v2.0":0.5248779977,"deita-7b-v1.0":0.7398615266,"minichat-1.5-3b":0.4364271175,"Qwen-14B-Chat":0.6714412819,"airoboros-33b":0.5566576337,"alpaca-farm-ppo-sim-gpt4-20k":0.456168214,"ultralm-13b":0.4337345632,"openbuddy-falcon-40b-v9":0.545771106,"openchat8192-13b":0.4356316711,"wizardlm-13b":0.5385026234,"vicuna-13b":0.5388256266,"merlinite-7B-AOT":0.8150560619,"gpt4_0
314":0.8997916758,"gpt4_0613_concise":0.7524372534,"jina-chat":0.6805565304,"Contextual-KTO-Mistral-PairRM":0.7122554396,"xwinlm-13b-v0.1":0.7513299972,"LMCocktail-10.7B-v1":0.8369176162,"SPPO-Mistral7B-PairRM-ExPO":0.7655500294,"Mixtral-8x7B-Instruct-v0.1_concise":0.8710401023,"gpt4_1106_preview_concise":0.7406558917,"Mistral-7B-ReMax-v0.1":0.807838924,"Llama-3-Instruct-8B-SimPO-ExPO":0.8703329817,"dolphin-2.2.1-mistral-7b":0.6251596825,"humpback-llama2-70b":0.6984941388,"openpipe-moa-gpt-4-turbo-v1":0.7309418615,"vicuna-7b-v1.5":0.4868743581,"Starling-LM-7B-alpha":0.658381614,"falcon-40b-instruct":0.3565968022,"Samba-CoE-v0.2-best-of-16":0.7338723477,"opencoderplus-15b":0.4567320517,"xwinlm-70b-v0.1":0.9059240217,"wizardlm-13b-v1.2":0.6741078562,"aligner-2b_qwen1.5-72b-chat":0.6787999003,"internlm2-chat-7b-ExPO":0.629923982,"claude-2.1":0.7515108894,"vicuna-7b-v1.3":0.4355620786,"oasst-rlhf-llama-33b":0.4061516205,"zephyr-7b-alpha-ExPO":0.7160268998,"openchat-v3.1-13b":0.6974328561,"SPPO-Llama-3-Instruct-8B-PairRM":0.8694594533,"minotaur-13b":0.368757115,"tulu-2-dpo-13b-ExPO":0.5166082438,"zephyr-7b-beta-ExPO":0.5303710259,"tulu-2-dpo-7b-ExPO":0.4355576278,"Llama-3-Instruct-8B-SimPO":0.8789917177,"baize-v2-13b":0.4685705196,"guanaco-7b":0.2018869696,"ultralm-13b-v2.0-best-of-16":0.6555243163,"claude-2.1_concise":0.6338526283,"openchat-13b":0.4470052867,"tulu-2-dpo-70b":0.8610574163,"deepseek-llm-67b-chat":0.7439504148,"humpback-llama-65b":0.6567402094,"tulu-2-dpo-70b-ExPO":0.4593179402,"TempNet-LLaMA2-Chat-7B-v0.1":0.3407340673,"nous-hermes-13b":0.5572824918,"gpt-3.5-turbo-0613":0.8045156377,"alpaca-7b_concise":0.2820018938,"baichuan-13b-chat":0.1525670221,"claude-3-5-sonnet-20240620":0.7976856335,"gpt-3.5-turbo-1106":0.7682908268,"minichat-3b":0.3565910812,"Storm-7B":0.7188927916,"oasst-sft-pythia-12b":0.2064079261,"Conifer-7B-DPO":0.7602280224,"Snorkel-Mistral-PairRM-DPO":0.6739888325,"internlm2-chat-20b-ExPO":0.5877331102,"Samba-CoE-v0.2":0.6875926799,"gemini-p
ro":0.8158961767,"pairrm-tulu-2-70b":0.8359305763,"text_davinci_003":0.3109387936,"gpt4":0.9069675584,"Yi-34B-Chat":0.7470363322,"Starling-LM-7B-beta-ExPO":0.7549415682,"pairrm-Yi-34B-Chat":0.8310750322,"gpt4_1106_preview":0.0,"evo-7b":0.502828858,"zephyr-7b-beta":0.6521227924,"guanaco-13b":0.2069624951,"alpaca-7b":0.3755224975,"internlm2-chat-20b-ppo":0.7522583795,"gemma-2b-it":0.3236386036,"pairrm-zephyr-7b-beta":0.7455357676,"evo-v2-7b":0.6353106561,"causallm-14b":0.7103430968,"SPPO-Mistral7B-PairRM":0.8458266977,"gpt-3.5-turbo-1106_concise":0.7318554971,"openbuddy-llama-65b-v8":0.6457736922,"claude2-alpaca-13b":0.6646440129,"Starling-LM-7B-alpha-ExPO":0.4701002864,"openbuddy-falcon-7b-v6":0.3350353845,"gemma-7b-it":0.4807679381,"phi-2-sft":0.3931141644,"gpt4_gamed":0.3987510662,"llama-2-70b-chat-hf":0.6625475757,"openbuddy-llama2-70b-v10.1":0.6740810303,"wizardlm-70b":0.7233004015,"ultralm-13b-best-of-16":0.4814959281},"num_tokens_mean":{"gpt-3.5-turbo-0301":179,"gpt-3.5-turbo-1106_verbose":214,"vicuna-13b-v1.5-togetherai":231,"Qwen1.5-1.8B-Chat":586,"recycled-wizardlm-7b-v1.0":299,"aligner-2b_claude-3-opus-20240229":354,"Qwen1.5-110B-Chat":346,"claude-3-opus-20240229":292,"llama-2-7b-chat-hf":302,"mistral-medium":327,"vicuna-33b-v1.3":315,"cohere":396,"claude-2":227,"guanaco-65b":262,"Mixtral-8x7B-Instruct-v0.1":311,"openchat-v2-w-13b":335,"falcon-7b-instruct":106,"wizardlm-13b-v1.1":325,"Meta-Llama-3-8B-Instruct":412,"FsfairX-Zephyr-Chat-v0.1":505,"Infinity-Instruct-3M-0613-Mistral-7B":106,"Qwen1.5-72B-Chat":342,"xwinlm-7b-v0.1":407,"Mixtral-8x22B-Instruct-v0.1":307,"vicuna-13b-v1.5":229,"dbrx-instruct":310,"zephyr-7b-alpha":283,"tulu-2-dpo-13b":370,"Qwen1.5-7B-Chat":342,"Together-MoA-Lite":420,"cut-13b":348,"Meta-Llama-3-70B-Instruct":416,"vicuna-13b-v1.3":244,"claude-instant-1.2":233,"airoboros-65b":312,"openbuddy-llama2-13b-v11.1":228,"phi-2":142,"Together-MoA":386,"mistral-large-2402":290,"openbuddy-llama-30b-v7.1":208,"TempNet-LLaMA2-Chat-70B-v0.1":384,"p
airrm-tulu-2-13b":313,"recycled-wizardlm-7b-v2.0":317,"Storm-7B-best-of-64":486,"vicuna-7b":224,"claude-3-sonnet-20240229":297,"Mistral-7B-Instruct-v0.2":362,"Samba-CoE-v0.1":258,"claude":229,"Nanbeige2-8B-Chat":561,"REBEL-Llama-3-8B-Instruct":509,"chatglm2-6b":234,"gpt-4o-2024-05-13":406,"gpt4_1106_preview_verbose":505,"TempNet-LLaMA2-Chat-13B-v0.1":320,"text_davinci_001":67,"Mixtral-8x7B-Instruct-v0.1_verbose":438,"baize-v2-7b":247,"phi-2-dpo":356,"alpaca-farm-ppo-human":169,"Nanbeige2-16B-Chat":394,"gpt4_0613":245,"pythia-12b-mix-sft":197,"alpaca-7b-neft":216,"Qwen1.5-14B-Chat":348,"gpt-4-0125-preview":417,"guanaco-33b":283,"oasst-sft-llama-33b":161,"gpt4_0613_verbose":313,"llama-2-chat-7b-evol70k-neft":330,"gpt35_turbo_instruct":219,"platolm-7b":272,"llama-2-13b-chat-hf":316,"Nanbeige-Plus-Chat-v0.1":529,"openchat-v2-13b":330,"mistral-orpo-beta":344,"Snorkel-Mistral-PairRM-DPO-best-of-16":526,"tulu-2-dpo-7b":399,"alpaca-7b_verbose":119,"OpenHermes-2.5-Mistral-7B":237,"claude-2.1_verbose":292,"ultralm-13b-v2.0":299,"deita-7b-v1.0":303,"minichat-1.5-3b":322,"Qwen-14B-Chat":223,"airoboros-33b":315,"alpaca-farm-ppo-sim-gpt4-20k":104,"ultralm-13b":233,"openbuddy-falcon-40b-v9":238,"openchat8192-13b":359,"wizardlm-13b":211,"vicuna-13b":224,"merlinite-7B-AOT":385,"gpt4_0314":289,"gpt4_0613_concise":138,"jina-chat":138,"Contextual-KTO-Mistral-PairRM":506,"xwinlm-13b-v0.1":400,"LMCocktail-10.7B-v1":244,"SPPO-Mistral7B-PairRM-ExPO":459,"Mixtral-8x7B-Instruct-v0.1_concise":195,"gpt4_1106_preview_concise":244,"Mistral-7B-ReMax-v0.1":314,"Llama-3-Instruct-8B-SimPO-ExPO":370,"dolphin-2.2.1-mistral-7b":243,"humpback-llama2-70b":234,"openpipe-moa-gpt-4-turbo-v1":377,"vicuna-7b-v1.5":234,"Starling-LM-7B-alpha":392,"falcon-40b-instruct":148,"Samba-CoE-v0.2-best-of-16":312,"opencoderplus-15b":354,"xwinlm-70b-v0.1":381,"wizardlm-13b-v1.2":354,"aligner-2b_qwen1.5-72b-chat":383,"internlm2-chat-7b-ExPO":494,"claude-2.1":228,"vicuna-7b-v1.3":238,"oasst-rlhf-llama-33b":243,"zephyr-7b-al
pha-ExPO":267,"openchat-v3.1-13b":318,"SPPO-Llama-3-Instruct-8B-PairRM":443,"minotaur-13b":186,"tulu-2-dpo-13b-ExPO":350,"zephyr-7b-beta-ExPO":297,"tulu-2-dpo-7b-ExPO":373,"Llama-3-Instruct-8B-SimPO":385,"baize-v2-13b":201,"guanaco-7b":311,"ultralm-13b-v2.0-best-of-16":354,"claude-2.1_concise":119,"openchat-13b":349,"tulu-2-dpo-70b":313,"deepseek-llm-67b-chat":250,"humpback-llama-65b":254,"tulu-2-dpo-70b-ExPO":368,"TempNet-LLaMA2-Chat-7B-v0.1":308,"nous-hermes-13b":187,"gpt-3.5-turbo-0613":270,"alpaca-7b_concise":77,"baichuan-13b-chat":310,"claude-3-5-sonnet-20240620":315,"gpt-3.5-turbo-1106":164,"minichat-3b":186,"Storm-7B":429,"oasst-sft-pythia-12b":158,"Conifer-7B-DPO":266,"Snorkel-Mistral-PairRM-DPO":566,"internlm2-chat-20b-ExPO":693,"Samba-CoE-v0.2":292,"gemini-pro":313,"pairrm-tulu-2-70b":405,"text_davinci_003":69,"gpt4":287,"Yi-34B-Chat":447,"Starling-LM-7B-beta-ExPO":447,"pairrm-Yi-34B-Chat":464,"gpt4_1106_preview":431,"evo-7b":382,"zephyr-7b-beta":311,"guanaco-13b":416,"alpaca-7b":85,"internlm2-chat-20b-ppo":513,"gemma-2b-it":222,"pairrm-zephyr-7b-beta":315,"evo-v2-7b":375,"causallm-14b":302,"SPPO-Mistral7B-PairRM":431,"gpt-3.5-turbo-1106_concise":90,"openbuddy-llama-65b-v8":249,"claude2-alpaca-13b":243,"Starling-LM-7B-alpha-ExPO":376,"openbuddy-falcon-7b-v6":248,"gemma-7b-it":241,"phi-2-sft":225,"gpt4_gamed":15,"llama-2-70b-chat-hf":377,"openbuddy-llama2-70b-v10.1":230,"wizardlm-70b":332,"ultralm-13b-best-of-16":400},"num_tokens_std":{"gpt-3.5-turbo-0301":150,"gpt-3.5-turbo-1106_verbose":144,"vicuna-13b-v1.5-togetherai":163,"Qwen1.5-1.8B-Chat":611,"recycled-wizardlm-7b-v1.0":186,"aligner-2b_claude-3-opus-20240229":206,"Qwen1.5-110B-Chat":211,"claude-3-opus-20240229":156,"llama-2-7b-chat-hf":170,"mistral-medium":305,"vicuna-33b-v1.3":198,"cohere":225,"claude-2":114,"guanaco-65b":212,"Mixtral-8x7B-Instruct-v0.1":219,"openchat-v2-w-13b":210,"falcon-7b-instruct":177,"wizardlm-13b-v1.1":213,"Meta-Llama-3-8B-Instruct":231,"FsfairX-Zephyr-Chat-v0.1":311,"Infinity
-Instruct-3M-0613-Mistral-7B":102,"Qwen1.5-72B-Chat":346,"xwinlm-7b-v0.1":308,"Mixtral-8x22B-Instruct-v0.1":253,"vicuna-13b-v1.5":164,"dbrx-instruct":193,"zephyr-7b-alpha":272,"tulu-2-dpo-13b":592,"Qwen1.5-7B-Chat":199,"Together-MoA-Lite":233,"cut-13b":208,"Meta-Llama-3-70B-Instruct":238,"vicuna-13b-v1.3":167,"claude-instant-1.2":132,"airoboros-65b":372,"openbuddy-llama2-13b-v11.1":175,"phi-2":312,"Together-MoA":217,"mistral-large-2402":191,"openbuddy-llama-30b-v7.1":162,"TempNet-LLaMA2-Chat-70B-v0.1":258,"pairrm-tulu-2-13b":194,"recycled-wizardlm-7b-v2.0":173,"Storm-7B-best-of-64":287,"vicuna-7b":161,"claude-3-sonnet-20240229":178,"Mistral-7B-Instruct-v0.2":360,"Samba-CoE-v0.1":194,"claude":123,"Nanbeige2-8B-Chat":239,"REBEL-Llama-3-8B-Instruct":463,"chatglm2-6b":352,"gpt-4o-2024-05-13":273,"gpt4_1106_preview_verbose":261,"TempNet-LLaMA2-Chat-13B-v0.1":178,"text_davinci_001":78,"Mixtral-8x7B-Instruct-v0.1_verbose":225,"baize-v2-7b":183,"phi-2-dpo":188,"alpaca-farm-ppo-human":171,"Nanbeige2-16B-Chat":214,"gpt4_0613":164,"pythia-12b-mix-sft":181,"alpaca-7b-neft":97,"Qwen1.5-14B-Chat":224,"gpt-4-0125-preview":242,"guanaco-33b":236,"oasst-sft-llama-33b":145,"gpt4_0613_verbose":166,"llama-2-chat-7b-evol70k-neft":143,"gpt35_turbo_instruct":241,"platolm-7b":187,"llama-2-13b-chat-hf":175,"Nanbeige-Plus-Chat-v0.1":216,"openchat-v2-13b":205,"mistral-orpo-beta":205,"Snorkel-Mistral-PairRM-DPO-best-of-16":255,"tulu-2-dpo-7b":738,"alpaca-7b_verbose":111,"OpenHermes-2.5-Mistral-7B":225,"claude-2.1_verbose":120,"ultralm-13b-v2.0":188,"deita-7b-v1.0":221,"minichat-1.5-3b":215,"Qwen-14B-Chat":227,"airoboros-33b":324,"alpaca-farm-ppo-sim-gpt4-20k":68,"ultralm-13b":158,"openbuddy-falcon-40b-v9":198,"openchat8192-13b":257,"wizardlm-13b":155,"vicuna-13b":153,"merlinite-7B-AOT":180,"gpt4_0314":212,"gpt4_0613_concise":110,"jina-chat":101,"Contextual-KTO-Mistral-PairRM":272,"xwinlm-13b-v0.1":268,"LMCocktail-10.7B-v1":168,"SPPO-Mistral7B-PairRM-ExPO":193,"Mixtral-8x7B-Instruct-v0.1_concise
":185,"gpt4_1106_preview_concise":184,"Mistral-7B-ReMax-v0.1":188,"Llama-3-Instruct-8B-SimPO-ExPO":191,"dolphin-2.2.1-mistral-7b":218,"humpback-llama2-70b":214,"openpipe-moa-gpt-4-turbo-v1":193,"vicuna-7b-v1.5":165,"Starling-LM-7B-alpha":250,"falcon-40b-instruct":182,"Samba-CoE-v0.2-best-of-16":199,"opencoderplus-15b":272,"xwinlm-70b-v0.1":267,"wizardlm-13b-v1.2":308,"aligner-2b_qwen1.5-72b-chat":210,"internlm2-chat-7b-ExPO":232,"claude-2.1":114,"vicuna-7b-v1.3":159,"oasst-rlhf-llama-33b":257,"zephyr-7b-alpha-ExPO":204,"openchat-v3.1-13b":224,"SPPO-Llama-3-Instruct-8B-PairRM":208,"minotaur-13b":164,"tulu-2-dpo-13b-ExPO":203,"zephyr-7b-beta-ExPO":217,"tulu-2-dpo-7b-ExPO":217,"Llama-3-Instruct-8B-SimPO":200,"baize-v2-13b":149,"guanaco-7b":292,"ultralm-13b-v2.0-best-of-16":169,"claude-2.1_concise":88,"openchat-13b":241,"tulu-2-dpo-70b":281,"deepseek-llm-67b-chat":206,"humpback-llama-65b":156,"tulu-2-dpo-70b-ExPO":203,"TempNet-LLaMA2-Chat-7B-v0.1":178,"nous-hermes-13b":201,"gpt-3.5-turbo-0613":194,"alpaca-7b_concise":70,"baichuan-13b-chat":400,"claude-3-5-sonnet-20240620":200,"gpt-3.5-turbo-1106":134,"minichat-3b":156,"Storm-7B":188,"oasst-sft-pythia-12b":186,"Conifer-7B-DPO":163,"Snorkel-Mistral-PairRM-DPO":342,"internlm2-chat-20b-ExPO":283,"Samba-CoE-v0.2":196,"gemini-pro":227,"pairrm-tulu-2-70b":438,"text_davinci_003":89,"gpt4":203,"Yi-34B-Chat":235,"Starling-LM-7B-beta-ExPO":143,"pairrm-Yi-34B-Chat":276,"gpt4_1106_preview":249,"evo-7b":241,"zephyr-7b-beta":238,"guanaco-13b":402,"alpaca-7b":69,"internlm2-chat-20b-ppo":403,"gemma-2b-it":168,"pairrm-zephyr-7b-beta":250,"evo-v2-7b":235,"causallm-14b":234,"SPPO-Mistral7B-PairRM":198,"gpt-3.5-turbo-1106_concise":80,"openbuddy-llama-65b-v8":152,"claude2-alpaca-13b":176,"Starling-LM-7B-alpha-ExPO":220,"openbuddy-falcon-7b-v6":189,"gemma-7b-it":160,"phi-2-sft":164,"gpt4_gamed":81,"llama-2-70b-chat-hf":222,"openbuddy-llama2-70b-v10.1":161,"wizardlm-70b":231,"ultralm-13b-best-of-16":164}}
 
 
data/model_win_rates.jsonl ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"num_words_mean":257,"num_words_std":145,"win_rate":34.4633736232,"standard_error":1.3146665263,"n_wins":225.0,"n_wins_base":475.0,"n_draws":105.0,"n_total":805.0,"discrete_win_rate":34.4720496894,"length_controlled_winrate":41.8230717152,"lc_standard_error":0.7776876699,"num_tokens_mean":354,"num_tokens_std":206,"model_name":"aligner-2b_claude-3-opus-20240229"}
2
+ {"num_words_mean":253,"num_words_std":153,"win_rate":33.7770952757,"standard_error":1.3776163154,"n_wins":255.0,"n_wins_base":545.0,"n_draws":5.0,"n_total":805.0,"discrete_win_rate":31.9875776398,"length_controlled_winrate":43.905552211,"lc_standard_error":0.8945807936,"num_tokens_mean":346,"num_tokens_std":211,"model_name":"Qwen1.5-110B-Chat"}
3
+ {"num_words_mean":216,"num_words_std":113,"win_rate":29.1052695333,"standard_error":1.3941539442,"n_wins":223.0,"n_wins_base":579.0,"n_draws":3.0,"n_total":805.0,"discrete_win_rate":27.8881987578,"length_controlled_winrate":40.5095080124,"lc_standard_error":0.8837504763,"num_tokens_mean":292,"num_tokens_std":156,"model_name":"claude-3-opus-20240229"}
4
+ {"num_words_mean":241,"num_words_std":166,"win_rate":21.8557725437,"standard_error":1.2682402187,"n_wins":164.0,"n_wins_base":639.0,"n_draws":2.0,"n_total":805.0,"discrete_win_rate":20.4968944099,"length_controlled_winrate":28.6143374017,"lc_standard_error":0.9075464438,"num_tokens_mean":327,"num_tokens_std":305,"model_name":"mistral-medium"}
5
+ {"num_words_mean":174,"num_words_std":90,"win_rate":17.1882403567,"standard_error":1.1748282562,"n_wins":131.0,"n_wins_base":673.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":16.3354037267,"length_controlled_winrate":28.1551961416,"lc_standard_error":0.8779084794,"num_tokens_mean":227,"num_tokens_std":114,"model_name":"claude-2"}
6
+ {"num_words_mean":342,"num_words_std":187,"win_rate":35.9464864409,"standard_error":1.4410058098,"n_wins":285.0,"n_wins_base":517.0,"n_draws":3.0,"n_total":805.0,"discrete_win_rate":35.5900621118,"length_controlled_winrate":34.787447623,"lc_standard_error":0.7594505141,"num_tokens_mean":505,"num_tokens_std":311,"model_name":"FsfairX-Zephyr-Chat-v0.1"}
7
+ {"num_words_mean":79,"num_words_std":78,"win_rate":15.7478281307,"standard_error":1.1194852006,"n_wins":118.0,"n_wins_base":687.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":14.6583850932,"length_controlled_winrate":25.5015577947,"lc_standard_error":0.7760697229,"num_tokens_mean":106,"num_tokens_std":102,"model_name":"Infinity-Instruct-3M-0613-Mistral-7B"}
8
+ {"num_words_mean":243,"num_words_std":144,"win_rate":26.4982833956,"standard_error":1.3042361649,"n_wins":201.0,"n_wins_base":600.0,"n_draws":4.0,"n_total":805.0,"discrete_win_rate":25.2173913043,"length_controlled_winrate":36.571754112,"lc_standard_error":0.9357421321,"num_tokens_mean":342,"num_tokens_std":346,"model_name":"Qwen1.5-72B-Chat"}
9
+ {"num_words_mean":229,"num_words_std":168,"win_rate":22.2101705475,"standard_error":1.2780740057,"n_wins":174.0,"n_wins_base":628.0,"n_draws":3.0,"n_total":805.0,"discrete_win_rate":21.801242236,"length_controlled_winrate":30.8788102941,"lc_standard_error":0.9518125819,"num_tokens_mean":307,"num_tokens_std":253,"model_name":"Mixtral-8x22B-Instruct-v0.1"}
10
+ {"num_words_mean":235,"num_words_std":143,"win_rate":19.7553327319,"standard_error":1.2063251121,"n_wins":147.0,"n_wins_base":657.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":18.3229813665,"length_controlled_winrate":25.1853410397,"lc_standard_error":0.8999456518,"num_tokens_mean":310,"num_tokens_std":193,"model_name":"dbrx-instruct"}
11
+ {"num_words_mean":297,"num_words_std":161,"win_rate":56.5930456223,"standard_error":1.4464848562,"n_wins":456.0,"n_wins_base":347.0,"n_draws":2.0,"n_total":805.0,"discrete_win_rate":56.7701863354,"length_controlled_winrate":59.1415240989,"lc_standard_error":0.7580510219,"num_tokens_mean":420,"num_tokens_std":233,"model_name":"Together-MoA-Lite"}
12
+ {"num_words_mean":299,"num_words_std":171,"win_rate":33.1778569588,"standard_error":1.3886514096,"n_wins":266.0,"n_wins_base":537.0,"n_draws":2.0,"n_total":805.0,"discrete_win_rate":33.1677018634,"length_controlled_winrate":34.4245971745,"lc_standard_error":0.8691832384,"num_tokens_mean":416,"num_tokens_std":238,"model_name":"Meta-Llama-3-70B-Instruct"}
13
+ {"num_words_mean":179,"num_words_std":104,"win_rate":16.1273996216,"standard_error":1.1341036838,"n_wins":120.0,"n_wins_base":682.0,"n_draws":3.0,"n_total":805.0,"discrete_win_rate":15.0931677019,"length_controlled_winrate":25.6122590254,"lc_standard_error":0.87464248,"num_tokens_mean":233,"num_tokens_std":132,"model_name":"claude-instant-1.2"}
14
+ {"num_words_mean":272,"num_words_std":151,"win_rate":59.8688062333,"standard_error":1.4343056045,"n_wins":490.0,"n_wins_base":314.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":60.9316770186,"length_controlled_winrate":65.3799697685,"lc_standard_error":0.7392392837,"num_tokens_mean":386,"num_tokens_std":217,"model_name":"Together-MoA"}
15
+ {"num_words_mean":218,"num_words_std":148,"win_rate":21.4387759814,"standard_error":1.2485232545,"n_wins":166.0,"n_wins_base":638.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":20.6832298137,"length_controlled_winrate":32.6520799853,"lc_standard_error":0.9044632955,"num_tokens_mean":290,"num_tokens_std":191,"model_name":"mistral-large-2402"}
16
+ {"num_words_mean":336,"num_words_std":195,"win_rate":63.0409907519,"standard_error":1.4253258915,"n_wins":519.0,"n_wins_base":286.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":64.4720496894,"length_controlled_winrate":61.637895572,"lc_standard_error":0.6799412402,"num_tokens_mean":486,"num_tokens_std":287,"model_name":"Storm-7B-best-of-64"}
17
+ {"num_words_mean":221,"num_words_std":129,"win_rate":25.5563252923,"standard_error":1.3419811052,"n_wins":193.0,"n_wins_base":608.0,"n_draws":4.0,"n_total":805.0,"discrete_win_rate":24.2236024845,"length_controlled_winrate":34.8724743624,"lc_standard_error":0.949844689,"num_tokens_mean":297,"num_tokens_std":178,"model_name":"claude-3-sonnet-20240229"}
18
+ {"num_words_mean":176,"num_words_std":97,"win_rate":16.9853436124,"standard_error":1.1687959793,"n_wins":129.0,"n_wins_base":676.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":16.0248447205,"length_controlled_winrate":27.2895044437,"lc_standard_error":0.858614564,"num_tokens_mean":229,"num_tokens_std":123,"model_name":"claude"}
19
+ {"num_words_mean":415,"num_words_std":173,"win_rate":39.354502072,"standard_error":1.4524224246,"n_wins":323.0,"n_wins_base":480.0,"n_draws":2.0,"n_total":805.0,"discrete_win_rate":40.248447205,"length_controlled_winrate":25.2417704867,"lc_standard_error":0.5909370499,"num_tokens_mean":561,"num_tokens_std":239,"model_name":"Nanbeige2-8B-Chat"}
20
+ {"num_words_mean":341,"num_words_std":273,"win_rate":34.3064238313,"standard_error":1.3914900256,"n_wins":268.0,"n_wins_base":537.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":33.2919254658,"length_controlled_winrate":31.4699427971,"lc_standard_error":0.8138922262,"num_tokens_mean":509,"num_tokens_std":463,"model_name":"REBEL-Llama-3-8B-Instruct"}
21
+ {"num_words_mean":286,"num_words_std":190,"win_rate":51.3275757825,"standard_error":1.470009459,"n_wins":429.0,"n_wins_base":369.0,"n_draws":7.0,"n_total":805.0,"discrete_win_rate":53.7267080745,"length_controlled_winrate":57.4568288333,"lc_standard_error":0.7774399385,"num_tokens_mean":406,"num_tokens_std":273,"model_name":"gpt-4o-2024-05-13"}
22
+ {"num_words_mean":378,"num_words_std":190,"win_rate":64.303601471,"standard_error":1.3348590089,"n_wins":525.0,"n_wins_base":268.0,"n_draws":12.0,"n_total":805.0,"discrete_win_rate":65.9627329193,"length_controlled_winrate":51.5750079797,"lc_standard_error":0.8313707608,"num_tokens_mean":505,"num_tokens_std":261,"model_name":"gpt4_1106_preview_verbose"}
23
+ {"num_words_mean":284,"num_words_std":151,"win_rate":37.0360860499,"standard_error":1.4340261273,"n_wins":288.0,"n_wins_base":514.0,"n_draws":3.0,"n_total":805.0,"discrete_win_rate":35.9627329193,"length_controlled_winrate":40.5912863493,"lc_standard_error":0.8504106275,"num_tokens_mean":394,"num_tokens_std":214,"model_name":"Nanbeige2-16B-Chat"}
24
+ {"num_words_mean":183,"num_words_std":124,"win_rate":15.7550380876,"standard_error":1.0754642482,"n_wins":117.0,"n_wins_base":684.0,"n_draws":4.0,"n_total":805.0,"discrete_win_rate":14.7826086957,"length_controlled_winrate":30.1833223167,"lc_standard_error":0.7874508454,"num_tokens_mean":245,"num_tokens_std":164,"model_name":"gpt4_0613"}
25
+ {"num_words_mean":313,"num_words_std":180,"win_rate":54.9665397329,"standard_error":1.4286740089,"n_wins":446.0,"n_wins_base":347.0,"n_draws":12.0,"n_total":805.0,"discrete_win_rate":56.149068323,"length_controlled_winrate":56.3562938462,"lc_standard_error":0.7731843456,"num_tokens_mean":417,"num_tokens_std":242,"model_name":"gpt-4-0125-preview"}
26
+ {"num_words_mean":237,"num_words_std":127,"win_rate":23.2373600435,"standard_error":1.2835395056,"n_wins":171.0,"n_wins_base":630.0,"n_draws":4.0,"n_total":805.0,"discrete_win_rate":21.4906832298,"length_controlled_winrate":33.8212668866,"lc_standard_error":0.8842151461,"num_tokens_mean":313,"num_tokens_std":166,"model_name":"gpt4_0613_verbose"}
27
+ {"num_words_mean":391,"num_words_std":159,"win_rate":56.7030097302,"standard_error":1.482841875,"n_wins":456.0,"n_wins_base":347.0,"n_draws":2.0,"n_total":805.0,"discrete_win_rate":56.7701863354,"length_controlled_winrate":44.4596624034,"lc_standard_error":0.7209678864,"num_tokens_mean":529,"num_tokens_std":216,"model_name":"Nanbeige-Plus-Chat-v0.1"}
28
+ {"num_words_mean":393,"num_words_std":185,"win_rate":34.8601328913,"standard_error":1.3599450437,"n_wins":270.0,"n_wins_base":533.0,"n_draws":2.0,"n_total":805.0,"discrete_win_rate":33.6645962733,"length_controlled_winrate":29.9743216131,"lc_standard_error":0.7464891533,"num_tokens_mean":526,"num_tokens_std":255,"model_name":"Snorkel-Mistral-PairRM-DPO-best-of-16"}
29
+ {"num_words_mean":228,"num_words_std":94,"win_rate":24.3540710901,"standard_error":1.29358621,"n_wins":191.0,"n_wins_base":613.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":23.7888198758,"length_controlled_winrate":30.2911791666,"lc_standard_error":0.6612722747,"num_tokens_mean":292,"num_tokens_std":120,"model_name":"claude-2.1_verbose"}
30
+ {"num_words_mean":281,"num_words_std":128,"win_rate":29.8963508407,"standard_error":1.3666520485,"n_wins":234.0,"n_wins_base":571.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":29.0683229814,"length_controlled_winrate":31.721885287,"lc_standard_error":0.8150560619,"num_tokens_mean":385,"num_tokens_std":180,"model_name":"merlinite-7B-AOT"}
31
+ {"num_words_mean":215,"num_words_std":160,"win_rate":22.0732589287,"standard_error":1.2466725495,"n_wins":172.0,"n_wins_base":627.0,"n_draws":6.0,"n_total":805.0,"discrete_win_rate":21.7391304348,"length_controlled_winrate":35.3070612164,"lc_standard_error":0.8997916758,"num_tokens_mean":289,"num_tokens_std":212,"model_name":"gpt4_0314"}
32
+ {"num_words_mean":381,"num_words_std":205,"win_rate":33.2273552,"standard_error":1.3779687478,"n_wins":260.0,"n_wins_base":544.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":32.3602484472,"length_controlled_winrate":29.7058089397,"lc_standard_error":0.7122554396,"num_tokens_mean":506,"num_tokens_std":272,"model_name":"Contextual-KTO-Mistral-PairRM"}
33
+ {"num_words_mean":344,"num_words_std":144,"win_rate":35.4431306717,"standard_error":1.3981308966,"n_wins":274.0,"n_wins_base":531.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":34.0372670807,"length_controlled_winrate":31.9003876312,"lc_standard_error":0.7655500294,"num_tokens_mean":459,"num_tokens_std":193,"model_name":"SPPO-Mistral7B-PairRM-ExPO"}
34
+ {"num_words_mean":177,"num_words_std":130,"win_rate":22.9201944405,"standard_error":1.2325177143,"n_wins":172.0,"n_wins_base":622.0,"n_draws":11.0,"n_total":805.0,"discrete_win_rate":22.049689441,"length_controlled_winrate":41.8966015912,"lc_standard_error":0.7406558917,"num_tokens_mean":244,"num_tokens_std":184,"model_name":"gpt4_1106_preview_concise"}
35
+ {"num_words_mean":261,"num_words_std":134,"win_rate":40.6328540086,"standard_error":1.4439449942,"n_wins":325.0,"n_wins_base":479.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":40.4347826087,"length_controlled_winrate":45.807978034,"lc_standard_error":0.8703329817,"num_tokens_mean":370,"num_tokens_std":191,"model_name":"Llama-3-Instruct-8B-SimPO-ExPO"}
36
+ {"num_words_mean":272,"num_words_std":137,"win_rate":63.1549345123,"standard_error":1.4229800988,"n_wins":515.0,"n_wins_base":283.0,"n_draws":7.0,"n_total":805.0,"discrete_win_rate":64.4099378882,"length_controlled_winrate":68.3786625033,"lc_standard_error":0.7309418615,"num_tokens_mean":377,"num_tokens_std":193,"model_name":"openpipe-moa-gpt-4-turbo-v1"}
37
+ {"num_words_mean":225,"num_words_std":140,"win_rate":26.9882543183,"standard_error":1.318903,"n_wins":201.0,"n_wins_base":601.0,"n_draws":3.0,"n_total":805.0,"discrete_win_rate":25.1552795031,"length_controlled_winrate":31.5065442681,"lc_standard_error":0.7338723477,"num_tokens_mean":312,"num_tokens_std":199,"model_name":"Samba-CoE-v0.2-best-of-16"}
38
+ {"num_words_mean":280,"num_words_std":151,"win_rate":31.773037737,"standard_error":1.2392772646,"n_wins":180.0,"n_wins_base":473.0,"n_draws":152.0,"n_total":805.0,"discrete_win_rate":31.801242236,"length_controlled_winrate":36.7258688784,"lc_standard_error":0.6787999003,"num_tokens_mean":383,"num_tokens_std":210,"model_name":"aligner-2b_qwen1.5-72b-chat"}
39
+ {"num_words_mean":177,"num_words_std":92,"win_rate":15.7335067364,"standard_error":1.1203158654,"n_wins":115.0,"n_wins_base":688.0,"n_draws":2.0,"n_total":805.0,"discrete_win_rate":14.4099378882,"length_controlled_winrate":25.2519438861,"lc_standard_error":0.7515108894,"num_tokens_mean":228,"num_tokens_std":114,"model_name":"claude-2.1"}
40
+ {"num_words_mean":317,"num_words_std":148,"win_rate":39.6728609061,"standard_error":1.4247223562,"n_wins":310.0,"n_wins_base":494.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":38.5714285714,"length_controlled_winrate":38.5628066368,"lc_standard_error":0.8694594533,"num_tokens_mean":443,"num_tokens_std":208,"model_name":"SPPO-Llama-3-Instruct-8B-PairRM"}
41
+ {"num_words_mean":272,"num_words_std":141,"win_rate":40.5297749846,"standard_error":1.4225744647,"n_wins":319.0,"n_wins_base":485.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":39.6894409938,"length_controlled_winrate":44.6804680926,"lc_standard_error":0.8789917177,"num_tokens_mean":385,"num_tokens_std":200,"model_name":"Llama-3-Instruct-8B-SimPO"}
42
+ {"num_words_mean":276,"num_words_std":140,"win_rate":22.9806197059,"standard_error":1.3591734083,"n_wins":184.0,"n_wins_base":620.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":22.9192546584,"length_controlled_winrate":25.7233081711,"lc_standard_error":0.4593179402,"num_tokens_mean":368,"num_tokens_std":203,"model_name":"tulu-2-dpo-70b-ExPO"}
43
+ {"num_words_mean":228,"num_words_std":142,"win_rate":40.5602140968,"standard_error":1.4679655404,"n_wins":312.0,"n_wins_base":493.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":38.7577639752,"length_controlled_winrate":52.3667542714,"lc_standard_error":0.7976856335,"num_tokens_mean":315,"num_tokens_std":200,"model_name":"claude-3-5-sonnet-20240620"}
44
+ {"num_words_mean":300,"num_words_std":135,"win_rate":50.2688690553,"standard_error":1.4728176781,"n_wins":397.0,"n_wins_base":408.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":49.3167701863,"length_controlled_winrate":50.4080792281,"lc_standard_error":0.7188927916,"num_tokens_mean":429,"num_tokens_std":188,"model_name":"Storm-7B"}
45
+ {"num_words_mean":414,"num_words_std":241,"win_rate":30.2200527007,"standard_error":1.3328273013,"n_wins":231.0,"n_wins_base":572.0,"n_draws":1.0,"n_total":804.0,"discrete_win_rate":28.7935323383,"length_controlled_winrate":26.3914464573,"lc_standard_error":0.6739888325,"num_tokens_mean":566,"num_tokens_std":342,"model_name":"Snorkel-Mistral-PairRM-DPO"}
46
+ {"num_words_mean":507,"num_words_std":206,"win_rate":46.1853674689,"standard_error":1.4638315246,"n_wins":375.0,"n_wins_base":430.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":46.5838509317,"length_controlled_winrate":27.2257594808,"lc_standard_error":0.5877331102,"num_tokens_mean":693,"num_tokens_std":283,"model_name":"internlm2-chat-20b-ExPO"}
47
+ {"num_words_mean":210,"num_words_std":138,"win_rate":21.8473786693,"standard_error":1.2171089783,"n_wins":159.0,"n_wins_base":645.0,"n_draws":1.0,"n_total":805.0,"discrete_win_rate":19.8136645963,"length_controlled_winrate":27.6242673501,"lc_standard_error":0.6875926799,"num_tokens_mean":292,"num_tokens_std":196,"model_name":"Samba-CoE-v0.2"}
48
+ {"num_words_mean":215,"num_words_std":156,"win_rate":23.5767893148,"standard_error":1.2757042012,"n_wins":179.0,"n_wins_base":618.0,"n_draws":8.0,"n_total":805.0,"discrete_win_rate":22.7329192547,"length_controlled_winrate":38.1280897444,"lc_standard_error":0.9069675584,"num_tokens_mean":287,"num_tokens_std":203,"model_name":"gpt4"}
49
+ {"num_words_mean":339,"num_words_std":178,"win_rate":29.6599467188,"standard_error":1.3225712598,"n_wins":219.0,"n_wins_base":582.0,"n_draws":4.0,"n_total":805.0,"discrete_win_rate":27.4534161491,"length_controlled_winrate":27.1905478776,"lc_standard_error":0.7470363322,"num_tokens_mean":447,"num_tokens_std":235,"model_name":"Yi-34B-Chat"}
50
+ {"num_words_mean":336,"num_words_std":106,"win_rate":29.6008518479,"standard_error":1.3252049543,"n_wins":225.0,"n_wins_base":580.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":27.950310559,"length_controlled_winrate":26.4869564984,"lc_standard_error":0.7549415682,"num_tokens_mean":447,"num_tokens_std":143,"model_name":"Starling-LM-7B-beta-ExPO"}
51
+ {"num_words_mean":349,"num_words_std":189,"win_rate":31.2412829468,"standard_error":1.3482437399,"n_wins":239.0,"n_wins_base":563.0,"n_draws":3.0,"n_total":805.0,"discrete_win_rate":29.8757763975,"length_controlled_winrate":28.8148408668,"lc_standard_error":0.8310750322,"num_tokens_mean":464,"num_tokens_std":276,"model_name":"pairrm-Yi-34B-Chat"}
52
+ {"num_words_mean":323,"num_words_std":181,"win_rate":50.0,"standard_error":0.0,"n_wins":0.0,"n_wins_base":0.0,"n_draws":805.0,"n_total":805.0,"discrete_win_rate":50.0,"length_controlled_winrate":50.0,"lc_standard_error":0.0,"num_tokens_mean":431,"num_tokens_std":249,"model_name":"gpt4_1106_preview"}
53
+ {"num_words_mean":322,"num_words_std":148,"win_rate":32.2453123638,"standard_error":1.390800011,"n_wins":249.0,"n_wins_base":556.0,"n_draws":0.0,"n_total":805.0,"discrete_win_rate":30.9316770186,"length_controlled_winrate":30.4941379652,"lc_standard_error":0.8458266977,"num_tokens_mean":431,"num_tokens_std":198,"model_name":"SPPO-Mistral7B-PairRM"}
prep_data.py CHANGED
@@ -20,7 +20,9 @@ for model_name in os.listdir(TOP_LEVEL_DIRECTORY):
20
  if os.path.isdir(model_dir):
21
  model_output_file = os.path.join(model_dir, "model_outputs.json")
22
  if os.path.exists(model_output_file):
23
- model_dataframes_outputs[model_name] = pd.read_json(model_output_file)
 
 
24
 
25
 
26
  def get_num_words(text):
@@ -42,6 +44,7 @@ model_name_to_num_words = {}
42
  model_name_to_num_tokens = {}
43
  for model_name, model_dataframe in model_dataframes_outputs.items():
44
  print(f"model_name_to_num_words for {model_name}")
 
45
  model_dataframe["output_num_words"] = model_dataframe["output"].apply(get_num_words)
46
  model_dataframe["output_num_tokens"] = model_dataframe["output"].apply(
47
  get_num_tokens
@@ -88,5 +91,8 @@ df = df.rename(
88
  "std": "num_tokens_std",
89
  }
90
  )
 
91
 
92
- df.to_json("data/model_win_rates.json")
 
 
 
20
  if os.path.isdir(model_dir):
21
  model_output_file = os.path.join(model_dir, "model_outputs.json")
22
  if os.path.exists(model_output_file):
23
+ df = pd.read_json(model_output_file)
24
+ df["model_name"] = model_name
25
+ model_dataframes_outputs[model_name] = df
26
 
27
 
28
  def get_num_words(text):
 
44
  model_name_to_num_tokens = {}
45
  for model_name, model_dataframe in model_dataframes_outputs.items():
46
  print(f"model_name_to_num_words for {model_name}")
47
+ model_dataframe["model_name"] = model_name
48
  model_dataframe["output_num_words"] = model_dataframe["output"].apply(get_num_words)
49
  model_dataframe["output_num_tokens"] = model_dataframe["output"].apply(
50
  get_num_tokens
 
91
  "std": "num_tokens_std",
92
  }
93
  )
94
+ df["model_name"] = df.index
95
 
96
+ df = df[df["length_controlled_winrate"] > 25]
97
+
98
+ df.to_json("data/model_win_rates.jsonl", orient="records", lines=True)
prep_data_annotations.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import tiktoken
4
+ from alpaca_eval import utils, metrics, annotators, constants, analyze, plotting, main
5
+ from alpaca_eval.metrics.glm_winrate import get_length_controlled_winrate
6
+ import os
7
+ import pandas as pd
8
+ import json
9
+
10
+
11
+ # Define the path to the top-level directory
12
+ TOP_LEVEL_DIRECTORY = "submodules/alpaca_eval/results"
13
+
14
+ df = pd.read_json("data/model_win_rates.jsonl", lines=True, orient="records")
15
+ relevant_models = df["model_name"].unique().tolist()
16
+
17
+ # Initialize an empty dictionary to hold the model name to dataframe mapping
18
+ model_dataframes_outputs = {}
19
+
20
+ # Iterate through each subdirectory in the top-level directory
21
+ df_response_judging = pd.DataFrame()
22
+ for model_name in os.listdir(TOP_LEVEL_DIRECTORY):
23
+ if model_name not in relevant_models:
24
+ continue
25
+ model_dir = os.path.join(TOP_LEVEL_DIRECTORY, model_name)
26
+ if os.path.isdir(model_dir):
27
+ model_output_file = os.path.join(
28
+ model_dir, "weighted_alpaca_eval_gpt4_turbo/annotations.json"
29
+ )
30
+ if os.path.exists(model_output_file):
31
+ df_response_judging = pd.concat(
32
+ [df_response_judging, pd.read_json(model_output_file)]
33
+ )
34
+
35
+ df_responses = pd.DataFrame()
36
+ for model_name in os.listdir(TOP_LEVEL_DIRECTORY):
37
+ if model_name not in relevant_models:
38
+ continue
39
+ model_dir = os.path.join(TOP_LEVEL_DIRECTORY, model_name)
40
+ if os.path.isdir(model_dir):
41
+ model_output_file = os.path.join(model_dir, "model_outputs.json")
42
+ if os.path.exists(model_output_file):
43
+ df_responses = pd.concat([df_responses, pd.read_json(model_output_file)])
44
+
45
+ df_responses = df_responses.drop("all_generated_texts", axis=1)
46
+ df_responses = df_responses.drop("Unnamed: 0.1", axis=1)
47
+ df_responses = df_responses.drop("index", axis=1)
48
+ df_responses = df_responses.drop("Unnamed: 0", axis=1)
49
+ df_responses = df_responses.drop("scores", axis=1)
50
+ df_responses = df_responses.drop("all_results_idx_best", axis=1)
51
+ df_responses = df_responses.drop("original_output", axis=1)
52
+ df_responses = df_responses.drop("new_prompt", axis=1)
53
+
54
+ breakpoint()
55
+
56
+ # Whitelist.
57
+
58
+
59
+ df_response_judging.to_json(
60
+ "data/df_response_judging.jsonl", lines=True, orient="records"
61
+ )
62
+ df_responses.to_json("data/df_responses.jsonl", lines=True, orient="records")