Shane committed on
Commit
df04a09
1 Parent(s): 9545133

changed website

Files changed (8)
  1. .gitignore +1 -0
  2. app.py +65 -338
  3. app_old.py +464 -0
  4. src/constants.py +1 -1
  5. src/md.py +1 -2
  6. src/plt.py +0 -53
  7. src/utils.py +75 -91
  8. src/utils_old.py +171 -0
.gitignore CHANGED
@@ -1,3 +1,4 @@
 evals/
+results/
 __pycache__/*
 *.pyc
app.py CHANGED
@@ -3,288 +3,87 @@ import os
3
  from huggingface_hub import HfApi, snapshot_download
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
  from datasets import load_dataset
6
- from src.utils import load_all_data
7
  from src.md import ABOUT_TEXT, TOP_TEXT
8
- from src.plt import plot_avg_correlation
9
- from src.constants import subset_mapping, length_categories, example_counts
10
  from src.css import custom_css
11
  import numpy as np
12
 
13
  api = HfApi()
14
 
15
  COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
16
- evals_repo = "allenai/reward-bench-results"
17
 
18
- eval_set_repo = "allenai/reward-bench"
19
- repo_dir_rewardbench = "./evals/rewardbench/"
20
 
21
  def restart_space():
22
- api.restart_space(repo_id="allenai/reward-bench", token=COLLAB_TOKEN)
23
 
24
  print("Pulling evaluation results")
25
  repo = snapshot_download(
26
- local_dir=repo_dir_rewardbench,
27
- ignore_patterns=["pref-sets-scores/*", "eval-set-scores/*"],
28
  repo_id=evals_repo,
29
  use_auth_token=COLLAB_TOKEN,
30
  tqdm_class=None,
31
  etag_timeout=30,
32
  repo_type="dataset",
33
  )
34
-
35
-
36
- def avg_over_rewardbench(dataframe_core, dataframe_prefs):
37
- """
38
- Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
39
-
40
- We average over 4 core sections (per prompt weighting):
41
- 1. Chat: Includes the easy chat subsets (alpacaeval-easy, alpacaeval-length, alpacaeval-hard, mt-bench-easy, mt-bench-medium)
42
- 2. Chat Hard: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
43
- 3. Safety: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
44
- 4. Reasoning: Includes the code and math subsets (math-prm, hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
45
- 5. Prior Sets (0.5 weight): Includes the test sets (anthropic_helpful, mtbench_human, shp, summarize)
46
- """
47
- new_df = dataframe_core.copy()
48
- dataframe_prefs = dataframe_prefs.copy()
49
-
50
- # for main subsets, keys in subset_mapping, take the weighted avg by example_counts and store for the models
51
- for subset, sub_subsets in subset_mapping.items():
52
- subset_cols = [col for col in new_df.columns if col in sub_subsets]
53
- sub_data = new_df[subset_cols].values # take the relevant column values
54
- sub_counts = [example_counts[s] for s in subset_cols] # take the example counts
55
- new_df[subset] = np.average(sub_data, axis=1, weights=sub_counts) # take the weighted average
56
- # new_df[subset] = np.round(np.nanmean(new_df[subset_cols].values, axis=1), 2)
57
-
58
- data_cols = list(subset_mapping.keys())
59
- keep_columns = ["model",] + ["model_type"] + data_cols
60
- # keep_columns = ["model", "average"] + subsets
61
- new_df = new_df[keep_columns]
62
-
63
- # selected average from pref_sets
64
- pref_columns = ["anthropic_helpful", "anthropic_hhh", "shp", "summarize"]
65
- pref_data = dataframe_prefs[pref_columns].values
66
-
67
- # add column test sets knowing the rows are not identical, take superset
68
- dataframe_prefs["Prior Sets (0.5 weight)"] = np.nanmean(pref_data, axis=1)
69
-
70
- # add column Test Sets empty to new_df
71
- new_df["Prior Sets (0.5 weight)"] = np.nan
72
- # per row in new_df if model is in dataframe_prefs, add the value to new_df["Prior Sets (0.5 weight)"]
73
- values = []
74
- for i, row in new_df.iterrows():
75
- model = row["model"]
76
- if model in dataframe_prefs["model"].values:
77
- values.append(dataframe_prefs[dataframe_prefs["model"] == model]["Prior Sets (0.5 weight)"].values[0])
78
- # new_df.at[i, "Prior Sets (0.5 weight)"] = dataframe_prefs[dataframe_prefs["model"] == model]["Prior Sets (0.5 weight)"].values[0]
79
- else:
80
- values.append(np.nan)
81
-
82
- new_df["Prior Sets (0.5 weight)"] = values
83
-
84
- # add total average
85
- data_cols += ["Prior Sets (0.5 weight)"]
86
- final_data = new_df[data_cols].values
87
- masked_data = np.ma.masked_array(final_data, np.isnan(final_data))
88
- weights = [2, 2, 2, 2, 1]
89
- average = np.ma.average(masked_data, axis=1, weights=weights)
90
- new_df["average"] = average.filled(np.nan)
91
- # new_df["average"] = np.nanmean(new_df[data_cols].values, axis=1)
92
-
93
- # make average third column
94
- keep_columns = ["model", "model_type", "average"] + data_cols
95
- new_df = new_df[keep_columns]
96
- return new_df
97
-
98
- def expand_subsets(dataframe):
99
- # TODO need to modify data/ script to do this
100
- pass
101
-
102
-
103
- def length_bias_check(dataframe):
104
- """
105
- Takes the raw rewardbench dataframe and splits the data into new buckets according to length_categories.
106
- Then, take the average of the three buckets as "average"
107
- """
108
- new_df = dataframe.copy()
109
- existing_subsets = new_df.columns[3:] # model, model_type, average
110
- final_subsets = ["Length Bias", "Neutral", "Terse Bias"]
111
- # new data is empty list dict for each final subset
112
- new_data = {s: [] for s in final_subsets}
113
-
114
- # now, subsets correspond to those with True, Nuetral, and False length bias
115
- # check if length_categories[subset] == "True" or "False" or "Neutral"
116
- for subset in existing_subsets:
117
- subset_data = new_df[subset].values
118
- subset_length = length_categories[subset]
119
- # route to the correct bucket
120
- if subset_length == "True":
121
- new_data["Length Bias"].append(subset_data)
122
- elif subset_length == "Neutral":
123
- new_data["Neutral"].append(subset_data)
124
- elif subset_length == "False":
125
- new_data["Terse Bias"].append(subset_data)
126
-
127
- # take average of new_data and add to new_df (removing other columns than model)
128
- for subset in final_subsets:
129
- new_df[subset] = np.nanmean(new_data[subset], axis=0)
130
- keep_columns = ["model"] + final_subsets
131
- new_df = new_df[keep_columns]
132
- # recompute average
133
- # new_df["average"] = np.round(np.nanmean(new_df[final_subsets].values, axis=1), 2)
134
-
135
- return new_df
136
-
137
-
138
 
139
- rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by='average', ascending=False)
140
- rewardbench_data_length = length_bias_check(rewardbench_data).sort_values(by='Terse Bias', ascending=False)
141
- prefs_data = load_all_data(repo_dir_rewardbench, subdir="pref-sets").sort_values(by='average', ascending=False)
142
- # prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
143
-
144
- rewardbench_data_avg = avg_over_rewardbench(rewardbench_data, prefs_data).sort_values(by='average', ascending=False)
145
-
146
- def prep_df(df):
147
- # add column to 0th entry with count (column name itself empty)
148
- df.insert(0, '', range(1, 1 + len(df)))
149
-
150
- # replace "model" with "Model" and "model_type" with "Model Type" and "average" with "Average"
151
- df = df.rename(columns={"model": "Model", "model_type": "Model Type", "average": "Average"})
152
 
153
- # if "Model Type" in columns
154
- if "Model Type" in df.columns:
155
- # get model_types that have generative in them
156
- mask = df["Model Type"].str.contains("generative", case=False, na=False)
157
-
158
- # set these values to "Generative"
159
- df.loc[mask, "Model Type"] = "Generative"
160
 
161
- return df
162
-
163
- # add count column to all dataframes
164
- rewardbench_data = prep_df(rewardbench_data)
165
- rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
166
- # adjust weight of this average to 50% for Prior Sets (0.5 weight), 1 for others
167
-
168
- rewardbench_data_length = prep_df(rewardbench_data_length)
169
- prefs_data = prep_df(prefs_data)
170
-
171
- col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
172
- col_types_rewardbench_avg = ["number"] + ["markdown"]+ ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
173
- cols_rewardbench_data_length = ["markdown"] + ["number"] * (len(rewardbench_data_length.columns) - 1)
174
- col_types_prefs = ["number"] + ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
175
- # col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)
176
 
177
  # for showing random samples
178
- eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")
179
- def random_sample(r: gr.Request, subset):
180
- if subset is None or subset == []:
181
  sample_index = np.random.randint(0, len(eval_set) - 1)
182
  sample = eval_set[sample_index]
183
- else: # filter by subsets (can be list)
184
- if isinstance(subset, str):
185
- subset = [subset]
186
- # filter down dataset to only include the subset(s)
187
- eval_set_filtered = eval_set.filter(lambda x: x["subset"] in subset)
188
  sample_index = np.random.randint(0, len(eval_set_filtered) - 1)
189
  sample = eval_set_filtered[sample_index]
190
 
191
  markdown_text = '\n\n'.join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
192
  return markdown_text
193
 
194
- subsets = eval_set.unique("subset")
195
 
196
- color_map = {
197
- "Generative": "#7497db",
198
- "Custom Classifier": "#E8ECF2",
199
- "Seq. Classifier": "#ffcd75",
200
- "DPO": "#75809c",
201
- }
202
- def color_model_type_column(df, color_map):
203
- """
204
- Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
205
 
206
- Parameters:
207
- df (pd.DataFrame): The DataFrame containing the 'Model Type' column.
208
- color_map (dict): A dictionary mapping model types to colors.
209
-
210
- Returns:
211
- pd.Styler: The styled DataFrame.
212
- """
213
- # Function to apply color based on the model type
214
- def apply_color(val):
215
- color = color_map.get(val, "default") # Default color if not specified in color_map
216
- return f'background-color: {color}'
217
-
218
- # Format for different columns
219
- format_dict = {col: "{:.1f}" for col in df.columns if col not in ['Average', 'Model', 'Model Type']}
220
- format_dict['Average'] = "{:.2f}"
221
- format_dict[''] = "{:d}"
222
-
223
- return df.style.applymap(apply_color, subset=['Model Type']).format(format_dict, na_rep='')
224
-
225
- def regex_table(dataframe, regex, filter_button, style=True):
226
  """
227
  Takes a model name as a regex, then returns only the rows that has that in it.
228
  """
 
 
229
  # Split regex statement by comma and trim whitespace around regexes
230
  regex_list = [x.strip() for x in regex.split(",")]
231
  # Join the list into a single regex pattern with '|' acting as OR
232
  combined_regex = '|'.join(regex_list)
233
 
234
- # remove internal ai2 data
235
- dataframe = dataframe[~dataframe["Model"].str.contains("ai2", case=False, na=False)]
236
-
237
- # if filter_button, remove all rows with "ai2" in the model name
238
- update_scores = False
239
- if isinstance(filter_button, list) or isinstance(filter_button, str):
240
- if "Prior Sets" not in filter_button and 'Prior Sets (0.5 weight)' in dataframe.columns:
241
- update_scores = True
242
- # remove the column "Prior Sets (0.5 weight)" from the outputted table
243
- dataframe = dataframe.drop(columns=['Prior Sets (0.5 weight)'])
244
- if "Seq. Classifiers" not in filter_button:
245
- dataframe = dataframe[~dataframe["Model Type"].str.contains("Seq. Classifier", case=False, na=False)]
246
- if "DPO" not in filter_button:
247
- dataframe = dataframe[~dataframe["Model Type"].str.contains("DPO", case=False, na=False)]
248
- if "Custom Classifiers" not in filter_button:
249
- dataframe = dataframe[~dataframe["Model Type"].str.contains("Custom Classifier", case=False, na=False)]
250
- if "Generative" not in filter_button:
251
- dataframe = dataframe[~dataframe["Model Type"].str.contains("generative", case=False, na=False)]
252
  # Filter the dataframe such that 'model' contains any of the regex patterns
253
  data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
254
 
255
- # if update the score to not use prior sets, do so
256
- if update_scores:
257
- data["Score"] = (data["Chat"] + data["Chat Hard"] + data["Safety"] + data["Reasoning"]) / 4
258
- # if "Prior Sets (0.5 weight)" in data.columns:
259
- # data["Prior Sets (0.5 weight)"] = np.nan
260
- # sort array by Score column
261
- data = data.sort_values(by='Score', ascending=False)
262
-
263
  data.reset_index(drop=True, inplace=True)
264
 
265
- # replace column '' with count/rank
266
- data[''] = np.arange(1, 1 + len(data))
267
-
268
- # if Score exists, round to 2 decimals
269
- if "Score" in data.columns:
270
- data["Score"] = np.round(np.array(data["Score"].values).astype(float), 2)
271
- if "Average" in data.columns:
272
- data["Average"] = np.round(np.array(data["Average"].values).astype(float), 1)
273
- # round all others to 1 decimal
274
- for col in data.columns:
275
- if col not in ["", "Model", "Model Type", "Score", "Average"]:
276
- # replace any data[col].values == '' with np.nan
277
- data[col] = data[col].replace('', np.nan)
278
- data[col] = np.round(np.array(data[col].values).astype(float), 1)
279
  if style:
280
- # apply color
281
- data = color_model_type_column(data, color_map)
282
-
 
283
  return data
284
 
285
- # import ipdb; ipdb.set_trace()
286
 
287
- total_models = len(regex_table(rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"], style=False).values)
288
 
289
  with gr.Blocks(css=custom_css) as app:
290
  # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
@@ -299,104 +98,50 @@ with gr.Blocks(css=custom_css) as app:
299
  ![](file/src/logo.png)
300
  """)
301
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
302
- with gr.TabItem("🏆 RewardBench Leaderboard"):
303
  with gr.Row():
304
  search_1 = gr.Textbox(label="Model Search (delimit with , )",
305
- placeholder="Model Search (delimit with , )",
306
- show_label=False)
307
- model_types_1 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "Prior Sets"],
308
- value=["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
309
- label="Model Types",
310
- show_label=False,
311
- # info="Which model types to include.",
312
- )
313
  with gr.Row():
314
  # reference data
315
  rewardbench_table_hidden = gr.Dataframe(
316
- rewardbench_data_avg.values,
317
- datatype=col_types_rewardbench_avg,
318
- headers=rewardbench_data_avg.columns.tolist(),
319
  visible=False,
320
  )
321
  rewardbench_table = gr.Dataframe(
322
- regex_table(rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"]),
323
- datatype=col_types_rewardbench_avg,
324
- headers=rewardbench_data_avg.columns.tolist(),
325
- elem_id="rewardbench_dataframe_avg",
326
- height=1000,
327
- )
328
-
329
- with gr.TabItem("🔍 RewardBench - Detailed"):
330
- with gr.Row():
331
- search_2 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
332
- model_types_2 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
333
- value=["Seq. Classifiers", "DPO", "Generative", "Custom Classifiers"],
334
- label="Model Types",
335
- show_label=False,
336
- # info="Which model types to include."
337
- )
338
- with gr.Row():
339
- # ref data
340
- rewardbench_table_detailed_hidden = gr.Dataframe(
341
- rewardbench_data.values,
342
- datatype=col_types_rewardbench,
343
- headers=rewardbench_data.columns.tolist(),
344
- visible=False,
345
- )
346
- rewardbench_table_detailed = gr.Dataframe(
347
- regex_table(rewardbench_data.copy(), "", ["Seq. Classifiers", "DPO", "Generative", "Custom Classifiers"]),
348
- datatype=col_types_rewardbench,
349
- headers=rewardbench_data.columns.tolist(),
350
- elem_id="rewardbench_dataframe",
351
- height=1000,
352
  )
353
- # with gr.TabItem("rewardbench Eval Set - Length Bias"):
354
- # with gr.Row():
355
- # # backup
356
- # rewardbench_table_len_hidden = gr.Dataframe(
357
- # rewardbench_data_length.values,
358
- # datatype=cols_rewardbench_data_length,
359
- # headers=rewardbench_data_length.columns.tolist(),
360
- # visible=False,
361
- # )
362
- # rewardbench_table_len = gr.Dataframe(
363
- # regex_table(rewardbench_data_length.copy(), "", False).values,
364
- # datatype=cols_rewardbench_data_length,
365
- # headers=rewardbench_data_length.columns.tolist(),
366
- # elem_id="rewardbench_dataframe_length",
367
- # height=1000,
368
- # )
369
- with gr.TabItem("Prior Test Sets"):
370
  with gr.Row():
371
- search_3 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
372
- model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
373
- value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
374
- label="Model Types",
375
- show_label=False,
376
- # info="Which model types to include.",
377
- )
378
  with gr.Row():
379
- PREF_SET_TEXT = """
380
- For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets). Only the subsets Anthropic Helpful, Anthropic HHH, Stanford SHP, and OpenAI's Summarize data are used in the leaderboard ranking.
381
- """
382
- gr.Markdown(PREF_SET_TEXT)
383
- with gr.Row():
384
- # backup
385
- pref_sets_table_hidden = gr.Dataframe(
386
- prefs_data.values,
387
- datatype=col_types_prefs,
388
- headers=prefs_data.columns.tolist(),
389
  visible=False,
390
  )
391
- pref_sets_table = gr.Dataframe(
392
- regex_table(prefs_data.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]),
393
- datatype=col_types_prefs,
394
- headers=prefs_data.columns.tolist(),
395
- elem_id="prefs_dataframe",
396
- height=1000,
 
397
  )
398
-
399
-
400
  with gr.TabItem("About"):
401
  with gr.Row():
402
  gr.Markdown(ABOUT_TEXT)
@@ -406,27 +151,18 @@ with gr.Blocks(css=custom_css) as app:
406
  # loads one sample
407
  gr.Markdown("""## Random Dataset Sample Viewer
408
  Warning, refusals, XSTest, and donotanswer datasets have sensitive content.""")
409
- subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
410
  button = gr.Button("Show Random Sample")
411
 
412
  with gr.Row():
413
  sample_display = gr.Markdown("{sampled data loads here}")
414
 
415
  button.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])
416
- # removed plot because not pretty enough
417
- # with gr.TabItem("Model Correlation"):
418
- # with gr.Row():
419
- # plot = plot_avg_correlation(rewardbench_data_avg, prefs_data)
420
- # gr.Plot(plot)
421
 
422
- search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
423
- search_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
424
- # search.change(regex_table, inputs=[rewardbench_table_len_hidden, search, filter_button], outputs=rewardbench_table_len)
425
- search_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)
426
-
427
- model_types_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
428
- model_types_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
429
- model_types_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)
430
 
431
  with gr.Row():
432
  with gr.Accordion("📚 Citation", open=False):
@@ -442,16 +178,7 @@ Warning, refusals, XSTest, and donotanswer datasets have sensitive content.""")
442
  elem_id="citation-button",
443
  show_copy_button=True,
444
  )
445
- # Load data when app starts, TODO make this used somewhere...
446
- # def load_data_on_start():
447
- # data_rewardbench = load_all_data(repo_dir_rewardbench)
448
- # rewardbench_table.update(data_rewardbench)
449
-
450
- # data_rewardbench_avg = avg_over_rewardbench(repo_dir_rewardbench)
451
- # rewardbench_table.update(data_rewardbench_avg)
452
 
453
- # data_prefs = load_all_data(repo_dir_prefs)
454
- # pref_sets_table.update(data_prefs)
455
 
456
  scheduler = BackgroundScheduler()
457
  scheduler.add_job(restart_space, "interval", seconds=10800) # restarted every 3h
 
3
  from huggingface_hub import HfApi, snapshot_download
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
  from datasets import load_dataset
6
+ from src.utils import load_all_data, prep_df, sort_by_category
7
  from src.md import ABOUT_TEXT, TOP_TEXT
 
 
8
  from src.css import custom_css
9
  import numpy as np
10
 
11
  api = HfApi()
12
 
13
  COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
14
+ evals_repo = "alrope/href_results"
15
 
16
+ eval_set_repo = "alrope/test_dev"
17
+ local_result_dir = "./results/"
18
 
19
  def restart_space():
20
+ api.restart_space(repo_id="alrope/href", token=COLLAB_TOKEN)
21
 
22
  print("Pulling evaluation results")
23
  repo = snapshot_download(
24
+ local_dir=local_result_dir,
25
+ ignore_patterns=[],
26
  repo_id=evals_repo,
27
  use_auth_token=COLLAB_TOKEN,
28
  tqdm_class=None,
29
  etag_timeout=30,
30
  repo_type="dataset",
31
  )
32
 
33
+ href_data_greedy = prep_df(load_all_data(local_result_dir, subdir="temperature=0.0"))
34
+ href_data_nongreedy = prep_df(load_all_data(local_result_dir, subdir="temperature=1.0"))
35
 
 
 
 
 
 
 
 
36
 
37
+ col_types_href = ["number"] + ["markdown"] + ["number"] * int((len(href_data_greedy.columns) - 1) / 2)
38
+ col_types_href_hidden = ["number"] + ["markdown"] + ["number"] * (len(href_data_greedy.columns) - 1)
39
+ categories = ['Brainstorm', 'Open QA', 'Closed QA', 'Extract', 'Generation', 'Rewrite', 'Summarize', 'Classify', "Reasoning Over Numerical Data", "Multi-Document Synthesis", "Fact Checking or Attributed QA"]
40
+ # categories = ['Average', 'Brainstorm', 'Open QA', 'Closed QA', 'Extract', 'Generation', 'Rewrite', 'Summarize', 'Classify']
41
 
42
  # for showing random samples
43
+ eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="dev")
44
+ def random_sample(r: gr.Request, category):
45
+ if category is None or category == []:
46
  sample_index = np.random.randint(0, len(eval_set) - 1)
47
  sample = eval_set[sample_index]
48
+ else: # filter by category (can be list)
49
+ if isinstance(category, str):
50
+ category = [category]
51
+ # filter down dataset to only include the category(s)
52
+ eval_set_filtered = eval_set.filter(lambda x: x["category"] in category)
53
  sample_index = np.random.randint(0, len(eval_set_filtered) - 1)
54
  sample = eval_set_filtered[sample_index]
55
 
56
  markdown_text = '\n\n'.join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
57
  return markdown_text
58
 
59
+ subsets = eval_set.unique("category")
60
 
61
 
62
+ def regex_table(dataframe, regex, selected_category, style=True):
63
  """
64
  Takes a model name as a regex, then returns only the rows that has that in it.
65
  """
66
+ dataframe = sort_by_category(dataframe, selected_category)
67
+
68
  # Split regex statement by comma and trim whitespace around regexes
69
  regex_list = [x.strip() for x in regex.split(",")]
70
  # Join the list into a single regex pattern with '|' acting as OR
71
  combined_regex = '|'.join(regex_list)
72
 
 
73
  # Filter the dataframe such that 'model' contains any of the regex patterns
74
  data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
75
 
76
  data.reset_index(drop=True, inplace=True)
77
 
78
  if style:
79
+ # Format for different columns
80
+ format_dict = {col: "{:.1f}" for col in data.columns if col not in ['Average', 'Model', 'Rank']}
81
+ format_dict['Average'] = "{:.2f}"
82
+ data = data.style.format(format_dict, na_rep='').set_properties(**{'text-align': 'right'})
83
  return data
84
 
 
85
 
86
+ total_models = len(regex_table(href_data_greedy.copy(), "", "Average", style=False).values)
87
 
88
  with gr.Blocks(css=custom_css) as app:
89
  # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
 
98
  ![](file/src/logo.png)
99
  """)
100
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
101
+ with gr.TabItem("🏆 HREF Leaderboard"):
102
  with gr.Row():
103
  search_1 = gr.Textbox(label="Model Search (delimit with , )",
104
+ # placeholder="Model Search (delimit with , )",
105
+ show_label=True)
106
+ category_selector_1 = gr.Dropdown(categories, label="Sorted By", value="Average", multiselect=False, show_label=True)
107
  with gr.Row():
108
  # reference data
109
  rewardbench_table_hidden = gr.Dataframe(
110
+ href_data_greedy.values,
111
+ datatype=col_types_href_hidden,
112
+ headers=href_data_greedy.columns.tolist(),
113
  visible=False,
114
  )
115
  rewardbench_table = gr.Dataframe(
116
+ regex_table(href_data_greedy.copy(), "", "Average"),
117
+ datatype=col_types_href,
118
+ headers=href_data_greedy.columns.tolist(),
119
+ elem_id="href_data_greedy",
120
+ interactive=False,
121
+ max_height=1000,
122
  )
123
+ with gr.TabItem("Non-Greedy"):
124
  with gr.Row():
125
+ search_2 = gr.Textbox(label="Model Search (delimit with , )",
126
+ # placeholder="Model Search (delimit with , )",
127
+ show_label=True)
128
+ category_selector_2 = gr.Dropdown(categories, label="Sorted By", value="Average", multiselect=False, show_label=True)
 
 
 
129
  with gr.Row():
130
+ # reference data
131
+ rewardbench_table_hidden_nongreedy = gr.Dataframe(
132
+ href_data_nongreedy.values,
133
+ datatype=col_types_href_hidden,
134
+ headers=href_data_nongreedy.columns.tolist(),
 
 
 
 
 
135
  visible=False,
136
  )
137
+ rewardbench_table_nongreedy = gr.Dataframe(
138
+ regex_table(href_data_nongreedy.copy(), "", "Average"),
139
+ datatype=col_types_href,
140
+ headers=href_data_nongreedy.columns.tolist(),
141
+ elem_id="href_data_nongreedy",
142
+ interactive=False,
143
+ max_height=1000,
144
  )
 
 
145
  with gr.TabItem("About"):
146
  with gr.Row():
147
  gr.Markdown(ABOUT_TEXT)
 
151
  # loads one sample
152
  gr.Markdown("""## Random Dataset Sample Viewer
153
  Warning, refusals, XSTest, and donotanswer datasets have sensitive content.""")
154
+ subset_selector = gr.Dropdown(subsets, label="Category", value=None, multiselect=True)
155
  button = gr.Button("Show Random Sample")
156
 
157
  with gr.Row():
158
  sample_display = gr.Markdown("{sampled data loads here}")
159
 
160
  button.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])
 
 
 
 
 
161
 
162
+ search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, category_selector_1], outputs=rewardbench_table)
163
+ category_selector_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, category_selector_1], outputs=rewardbench_table)
164
+ search_2.change(regex_table, inputs=[rewardbench_table_hidden_nongreedy, search_2, category_selector_2], outputs=rewardbench_table_nongreedy)
165
+ category_selector_2.change(regex_table, inputs=[rewardbench_table_hidden_nongreedy, search_2, category_selector_2], outputs=rewardbench_table_nongreedy)
 
 
 
 
166
 
167
  with gr.Row():
168
  with gr.Accordion("📚 Citation", open=False):
 
178
  elem_id="citation-button",
179
  show_copy_button=True,
180
  )
 
 
 
 
 
 
 
181
 
 
 
182
 
183
  scheduler = BackgroundScheduler()
184
  scheduler.add_job(restart_space, "interval", seconds=10800) # restarted every 3h
app_old.py ADDED
@@ -0,0 +1,464 @@
1
+ import gradio as gr
2
+ import os
3
+ from huggingface_hub import HfApi, snapshot_download
4
+ from apscheduler.schedulers.background import BackgroundScheduler
5
+ from datasets import load_dataset
6
+ from src.utils_old import load_all_data
7
+ from src.md import ABOUT_TEXT, TOP_TEXT
8
+ from src.constants import subset_mapping, length_categories, example_counts
9
+ from src.css import custom_css
10
+ import numpy as np
11
+
12
+ api = HfApi()
13
+
14
+ COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
15
+ evals_repo = "allenai/reward-bench-results"
16
+
17
+ eval_set_repo = "allenai/reward-bench"
18
+ repo_dir_rewardbench = "./evals/rewardbench/"
19
+
20
+ def restart_space():
21
+ api.restart_space(repo_id="allenai/reward-bench", token=COLLAB_TOKEN)
22
+
23
+ print("Pulling evaluation results")
24
+ repo = snapshot_download(
25
+ local_dir=repo_dir_rewardbench,
26
+ ignore_patterns=["pref-sets-scores/*", "eval-set-scores/*"],
27
+ repo_id=evals_repo,
28
+ use_auth_token=COLLAB_TOKEN,
29
+ tqdm_class=None,
30
+ etag_timeout=30,
31
+ repo_type="dataset",
32
+ )
33
+
34
+
35
+ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
36
+ """
37
+ Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
38
+
39
+ We average over 4 core sections (per prompt weighting):
40
+ 1. Chat: Includes the easy chat subsets (alpacaeval-easy, alpacaeval-length, alpacaeval-hard, mt-bench-easy, mt-bench-medium)
41
+ 2. Chat Hard: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
42
+ 3. Safety: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
43
+ 4. Reasoning: Includes the code and math subsets (math-prm, hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
44
+ 5. Prior Sets (0.5 weight): Includes the test sets (anthropic_helpful, mtbench_human, shp, summarize)
45
+ """
46
+ new_df = dataframe_core.copy()
47
+ dataframe_prefs = dataframe_prefs.copy()
48
+
49
+ # for main subsets, keys in subset_mapping, take the weighted avg by example_counts and store for the models
50
+ for subset, sub_subsets in subset_mapping.items():
51
+ subset_cols = [col for col in new_df.columns if col in sub_subsets]
52
+ sub_data = new_df[subset_cols].values # take the relevant column values
53
+ sub_counts = [example_counts[s] for s in subset_cols] # take the example counts
54
+ new_df[subset] = np.average(sub_data, axis=1, weights=sub_counts) # take the weighted average
55
+ # new_df[subset] = np.round(np.nanmean(new_df[subset_cols].values, axis=1), 2)
56
+
57
+ data_cols = list(subset_mapping.keys())
58
+ keep_columns = ["model",] + ["model_type"] + data_cols
59
+ # keep_columns = ["model", "average"] + subsets
60
+ new_df = new_df[keep_columns]
61
+
62
+ # selected average from pref_sets
63
+ pref_columns = ["anthropic_helpful", "anthropic_hhh", "shp", "summarize"]
64
+ pref_data = dataframe_prefs[pref_columns].values
65
+
66
+ # add column test sets knowing the rows are not identical, take superset
67
+ dataframe_prefs["Prior Sets (0.5 weight)"] = np.nanmean(pref_data, axis=1)
68
+
69
+ # add column Test Sets empty to new_df
70
+ new_df["Prior Sets (0.5 weight)"] = np.nan
71
+ # per row in new_df if model is in dataframe_prefs, add the value to new_df["Prior Sets (0.5 weight)"]
72
+ values = []
73
+ for i, row in new_df.iterrows():
74
+ model = row["model"]
75
+ if model in dataframe_prefs["model"].values:
76
+ values.append(dataframe_prefs[dataframe_prefs["model"] == model]["Prior Sets (0.5 weight)"].values[0])
77
+ # new_df.at[i, "Prior Sets (0.5 weight)"] = dataframe_prefs[dataframe_prefs["model"] == model]["Prior Sets (0.5 weight)"].values[0]
78
+ else:
79
+ values.append(np.nan)
80
+
81
+ new_df["Prior Sets (0.5 weight)"] = values
82
+
83
+ # add total average
84
+ data_cols += ["Prior Sets (0.5 weight)"]
85
+ final_data = new_df[data_cols].values
86
+ masked_data = np.ma.masked_array(final_data, np.isnan(final_data))
87
+ weights = [2, 2, 2, 2, 1]
88
+ average = np.ma.average(masked_data, axis=1, weights=weights)
89
+ new_df["average"] = average.filled(np.nan)
90
+ # new_df["average"] = np.nanmean(new_df[data_cols].values, axis=1)
91
+
92
+ # make average third column
93
+ keep_columns = ["model", "model_type", "average"] + data_cols
94
+ new_df = new_df[keep_columns]
95
+ return new_df
96
+
97
+ def expand_subsets(dataframe):
98
+ # TODO need to modify data/ script to do this
99
+ pass
100
+
101
+
102
+ def length_bias_check(dataframe):
103
+ """
104
+ Takes the raw rewardbench dataframe and splits the data into new buckets according to length_categories.
105
+ Then, take the average of the three buckets as "average"
106
+ """
107
+ new_df = dataframe.copy()
108
+ existing_subsets = new_df.columns[3:] # model, model_type, average
109
+ final_subsets = ["Length Bias", "Neutral", "Terse Bias"]
110
+ # new data is empty list dict for each final subset
111
+ new_data = {s: [] for s in final_subsets}
112
+
113
+ # now, subsets correspond to those with True, Nuetral, and False length bias
114
+ # check if length_categories[subset] == "True" or "False" or "Neutral"
115
+ for subset in existing_subsets:
116
+ subset_data = new_df[subset].values
117
+ subset_length = length_categories[subset]
118
+ # route to the correct bucket
119
+ if subset_length == "True":
120
+ new_data["Length Bias"].append(subset_data)
121
+ elif subset_length == "Neutral":
122
+ new_data["Neutral"].append(subset_data)
123
+ elif subset_length == "False":
124
+ new_data["Terse Bias"].append(subset_data)
125
+
126
+ # take average of new_data and add to new_df (removing other columns than model)
127
+ for subset in final_subsets:
128
+ new_df[subset] = np.nanmean(new_data[subset], axis=0)
129
+ keep_columns = ["model"] + final_subsets
130
+ new_df = new_df[keep_columns]
131
+ # recompute average
132
+ # new_df["average"] = np.round(np.nanmean(new_df[final_subsets].values, axis=1), 2)
133
+
134
+ return new_df
135
+
136
+
137
+
138
+ rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by='average', ascending=False)
139
+ rewardbench_data_length = length_bias_check(rewardbench_data).sort_values(by='Terse Bias', ascending=False)
140
+ prefs_data = load_all_data(repo_dir_rewardbench, subdir="pref-sets").sort_values(by='average', ascending=False)
141
+ # prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
142
+
143
+ rewardbench_data_avg = avg_over_rewardbench(rewardbench_data, prefs_data).sort_values(by='average', ascending=False)
144
+
145
+ def prep_df(df):
146
+ # add column to 0th entry with count (column name itself empty)
147
+ df.insert(0, '', range(1, 1 + len(df)))
148
+
149
+ # replace "model" with "Model" and "model_type" with "Model Type" and "average" with "Average"
150
+ df = df.rename(columns={"model": "Model", "model_type": "Model Type", "average": "Average"})
151
+
152
+ # if "Model Type" in columns
153
+ if "Model Type" in df.columns:
154
+ # get model_types that have generative in them
155
+ mask = df["Model Type"].str.contains("generative", case=False, na=False)
156
+
157
+ # set these values to "Generative"
158
+ df.loc[mask, "Model Type"] = "Generative"
159
+
160
+ return df
161
+
162
+ # add count column to all dataframes
163
+ rewardbench_data = prep_df(rewardbench_data)
164
+ rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
165
+ # adjust weight of this average to 50% for Prior Sets (0.5 weight), 1 for others
166
+
167
+ rewardbench_data_length = prep_df(rewardbench_data_length)
168
+ prefs_data = prep_df(prefs_data)
169
+
170
+ col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
171
+ col_types_rewardbench_avg = ["number"] + ["markdown"]+ ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
172
+ cols_rewardbench_data_length = ["markdown"] + ["number"] * (len(rewardbench_data_length.columns) - 1)
173
+ col_types_prefs = ["number"] + ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
174
+ # col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)
175
+
176
+ # for showing random samples
177
+ eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")
178
+ def random_sample(r: gr.Request, subset):
179
+ if subset is None or subset == []:
180
+ sample_index = np.random.randint(0, len(eval_set) - 1)
181
+ sample = eval_set[sample_index]
182
+ else: # filter by subsets (can be list)
183
+ if isinstance(subset, str):
184
+ subset = [subset]
185
+ # filter down dataset to only include the subset(s)
186
+ eval_set_filtered = eval_set.filter(lambda x: x["subset"] in subset)
187
+ sample_index = np.random.randint(0, len(eval_set_filtered) - 1)
188
+ sample = eval_set_filtered[sample_index]
189
+
190
+ markdown_text = '\n\n'.join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
191
+ return markdown_text
192
+
193
+ subsets = eval_set.unique("subset")
194
+
195
+ color_map = {
196
+ "Generative": "#7497db",
197
+ "Custom Classifier": "#E8ECF2",
198
+ "Seq. Classifier": "#ffcd75",
199
+ "DPO": "#75809c",
200
+ }
201
+ def color_model_type_column(df, color_map):
202
+ """
203
+ Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
204
+
205
+ Parameters:
206
+ df (pd.DataFrame): The DataFrame containing the 'Model Type' column.
207
+ color_map (dict): A dictionary mapping model types to colors.
208
+
209
+ Returns:
210
+ pd.Styler: The styled DataFrame.
211
+ """
212
+ # Function to apply color based on the model type
213
+ def apply_color(val):
214
+ color = color_map.get(val, "default") # Default color if not specified in color_map
215
+ return f'background-color: {color}'
216
+
217
+ # Format for different columns
218
+ format_dict = {col: "{:.1f}" for col in df.columns if col not in ['Average', 'Model', 'Model Type']}
219
+ format_dict['Average'] = "{:.2f}"
220
+ format_dict[''] = "{:d}"
221
+
222
+ return df.style.applymap(apply_color, subset=['Model Type']).format(format_dict, na_rep='')
223
+
224
+ def regex_table(dataframe, regex, filter_button, style=True):
225
+ """
226
+ Takes a model name as a regex, then returns only the rows that has that in it.
227
+ """
228
+ # Split regex statement by comma and trim whitespace around regexes
229
+ regex_list = [x.strip() for x in regex.split(",")]
230
+ # Join the list into a single regex pattern with '|' acting as OR
231
+ combined_regex = '|'.join(regex_list)
232
+
233
+ # remove internal ai2 data
234
+ dataframe = dataframe[~dataframe["Model"].str.contains("ai2", case=False, na=False)]
235
+
236
+ # if filter_button, remove all rows with "ai2" in the model name
237
+ update_scores = False
238
+ if isinstance(filter_button, list) or isinstance(filter_button, str):
239
+ if "Prior Sets" not in filter_button and 'Prior Sets (0.5 weight)' in dataframe.columns:
240
+ update_scores = True
241
+ # remove the column "Prior Sets (0.5 weight)" from the outputted table
242
+ dataframe = dataframe.drop(columns=['Prior Sets (0.5 weight)'])
243
+ if "Seq. Classifiers" not in filter_button:
244
+ dataframe = dataframe[~dataframe["Model Type"].str.contains("Seq. Classifier", case=False, na=False)]
245
+ if "DPO" not in filter_button:
246
+ dataframe = dataframe[~dataframe["Model Type"].str.contains("DPO", case=False, na=False)]
247
+ if "Custom Classifiers" not in filter_button:
248
+ dataframe = dataframe[~dataframe["Model Type"].str.contains("Custom Classifier", case=False, na=False)]
249
+ if "Generative" not in filter_button:
250
+ dataframe = dataframe[~dataframe["Model Type"].str.contains("generative", case=False, na=False)]
251
+ # Filter the dataframe such that 'model' contains any of the regex patterns
252
+ data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
253
+
254
+ # if update the score to not use prior sets, do so
255
+ if update_scores:
256
+ data["Score"] = (data["Chat"] + data["Chat Hard"] + data["Safety"] + data["Reasoning"]) / 4
257
+ # if "Prior Sets (0.5 weight)" in data.columns:
258
+ # data["Prior Sets (0.5 weight)"] = np.nan
259
+ # sort array by Score column
260
+ data = data.sort_values(by='Score', ascending=False)
261
+
262
+ data.reset_index(drop=True, inplace=True)
263
+
264
+ # replace column '' with count/rank
265
+ data[''] = np.arange(1, 1 + len(data))
266
+
267
+ # if Score exists, round to 2 decimals
268
+ if "Score" in data.columns:
269
+ data["Score"] = np.round(np.array(data["Score"].values).astype(float), 2)
270
+ if "Average" in data.columns:
271
+ data["Average"] = np.round(np.array(data["Average"].values).astype(float), 1)
272
+ # round all others to 1 decimal
273
+ for col in data.columns:
274
+ if col not in ["", "Model", "Model Type", "Score", "Average"]:
275
+ # replace any data[col].values == '' with np.nan
276
+ data[col] = data[col].replace('', np.nan)
277
+ data[col] = np.round(np.array(data[col].values).astype(float), 1)
278
+ if style:
279
+ # apply color
280
+ data = color_model_type_column(data, color_map)
281
+
282
+ return data
283
+
284
+
285
+ def printout(df):
286
+ print(df.iloc[0])
287
+ print(df.iloc[1])
288
+
289
+ # import ipdb; ipdb.set_trace()
290
+
291
+ total_models = len(regex_table(rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"], style=False).values)
292
+
293
+
294
+ with gr.Blocks(css=custom_css) as app:
295
+ # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
296
+ with gr.Row():
297
+ with gr.Column(scale=6):
298
+ gr.Markdown(TOP_TEXT.format(str(total_models)))
299
+ with gr.Column(scale=4):
300
+ # search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
301
+ # filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
302
+ # img = gr.Image(value="https://private-user-images.githubusercontent.com/10695622/310698241-24ed272a-0844-451f-b414-fde57478703e.png", width=500)
303
+ gr.Markdown("""
304
+ ![](file/src/logo.png)
305
+ """)
306
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
307
+ with gr.TabItem("🏆 RewardBench Leaderboard"):
308
+ with gr.Row():
309
+ search_1 = gr.Textbox(label="Model Search (delimit with , )",
310
+ placeholder="Model Search (delimit with , )",
311
+ show_label=False)
312
+ model_types_1 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "Prior Sets"],
313
+ value=["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
314
+ label="Model Types",
315
+ show_label=False,
316
+ # info="Which model types to include.",
317
+ )
318
+ with gr.Row():
319
+ # reference data
320
+ rewardbench_table_hidden = gr.Dataframe(
321
+ rewardbench_data_avg.values,
322
+ datatype=col_types_rewardbench_avg,
323
+ headers=rewardbench_data_avg.columns.tolist(),
324
+ visible=False,
325
+ )
326
+ rewardbench_table = gr.Dataframe(
327
+ regex_table(rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"]),
328
+ datatype=col_types_rewardbench_avg,
329
+ headers=rewardbench_data_avg.columns.tolist(),
330
+ elem_id="rewardbench_dataframe_avg",
331
+ max_height=1000,
332
+ )
333
+
334
+ with gr.TabItem("🔍 RewardBench - Detailed"):
335
+ with gr.Row():
336
+ search_2 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
337
+ model_types_2 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
338
+ value=["Seq. Classifiers", "DPO", "Generative", "Custom Classifiers"],
339
+ label="Model Types",
340
+ show_label=False,
341
+ # info="Which model types to include."
342
+ )
343
+ with gr.Row():
344
+ # ref data
345
+ rewardbench_table_detailed_hidden = gr.Dataframe(
346
+ rewardbench_data.values,
347
+ datatype=col_types_rewardbench,
348
+ headers=rewardbench_data.columns.tolist(),
349
+ visible=False,
350
+ )
351
+ rewardbench_table_detailed = gr.Dataframe(
352
+ regex_table(rewardbench_data.copy(), "", ["Seq. Classifiers", "DPO", "Generative", "Custom Classifiers"]),
353
+ datatype=col_types_rewardbench,
354
+ headers=rewardbench_data.columns.tolist(),
355
+ elem_id="rewardbench_dataframe",
356
+ max_height=1000,
357
+ )
358
+ # with gr.TabItem("rewardbench Eval Set - Length Bias"):
359
+ # with gr.Row():
360
+ # # backup
361
+ # rewardbench_table_len_hidden = gr.Dataframe(
362
+ # rewardbench_data_length.values,
363
+ # datatype=cols_rewardbench_data_length,
364
+ # headers=rewardbench_data_length.columns.tolist(),
365
+ # visible=False,
366
+ # )
367
+ # rewardbench_table_len = gr.Dataframe(
368
+ # regex_table(rewardbench_data_length.copy(), "", False).values,
369
+ # datatype=cols_rewardbench_data_length,
370
+ # headers=rewardbench_data_length.columns.tolist(),
371
+ # elem_id="rewardbench_dataframe_length",
372
+ # height=1000,
373
+ # )
374
+ with gr.TabItem("Prior Test Sets"):
375
+ with gr.Row():
376
+ search_3 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
377
+ model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
378
+ value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
379
+ label="Model Types",
380
+ show_label=False,
381
+ # info="Which model types to include.",
382
+ )
383
+ with gr.Row():
384
+ PREF_SET_TEXT = """
385
+ For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets). Only the subsets Anthropic Helpful, Anthropic HHH, Stanford SHP, and OpenAI's Summarize data are used in the leaderboard ranking.
386
+ """
387
+ gr.Markdown(PREF_SET_TEXT)
388
+ with gr.Row():
389
+ # backup
390
+ pref_sets_table_hidden = gr.Dataframe(
391
+ prefs_data.values,
392
+ datatype=col_types_prefs,
393
+ headers=prefs_data.columns.tolist(),
394
+ visible=False,
395
+ )
396
+ pref_sets_table = gr.Dataframe(
397
+ regex_table(prefs_data.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]),
398
+ datatype=col_types_prefs,
399
+ headers=prefs_data.columns.tolist(),
400
+ elem_id="prefs_dataframe",
401
+ max_height=1000,
402
+ )
403
+
404
+
405
+ with gr.TabItem("About"):
406
+ with gr.Row():
407
+ gr.Markdown(ABOUT_TEXT)
408
+
409
+ with gr.TabItem("Dataset Viewer"):
410
+ with gr.Row():
411
+ # loads one sample
412
+ gr.Markdown("""## Random Dataset Sample Viewer
413
+ Warning, refusals, XSTest, and donotanswer datasets have sensitive content.""")
414
+ subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
415
+ button = gr.Button("Show Random Sample")
416
+
417
+ with gr.Row():
418
+ sample_display = gr.Markdown("{sampled data loads here}")
419
+
420
+ button.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])
421
+ # removed plot because not pretty enough
422
+ # with gr.TabItem("Model Correlation"):
423
+ # with gr.Row():
424
+ # plot = plot_avg_correlation(rewardbench_data_avg, prefs_data)
425
+ # gr.Plot(plot)
426
+
427
+ search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
428
+ search_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
429
+ # search.change(regex_table, inputs=[rewardbench_table_len_hidden, search, filter_button], outputs=rewardbench_table_len)
430
+ search_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)
431
+
432
+ model_types_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
433
+ model_types_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
434
+ model_types_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)
435
+
436
+ with gr.Row():
437
+ with gr.Accordion("📚 Citation", open=False):
438
+ citation_button = gr.Textbox(
439
+ value=r"""@misc{RewardBench,
440
+ title={RewardBench: Evaluating Reward Models for Language Modeling},
441
+ author={Lambert, Nathan and Pyatkin, Valentina and Morrison, Jacob and Miranda, LJ and Lin, Bill Yuchen and Chandu, Khyathi and Dziri, Nouha and Kumar, Sachin and Zick, Tom and Choi, Yejin and Smith, Noah A. and Hajishirzi, Hannaneh},
442
+ year={2024},
443
+ howpublished={\url{https://huggingface.co/spaces/allenai/reward-bench}
444
+ }""",
445
+ lines=7,
446
+ label="Copy the following to cite these results.",
447
+ elem_id="citation-button",
448
+ show_copy_button=True,
449
+ )
450
+ # Load data when app starts, TODO make this used somewhere...
451
+ # def load_data_on_start():
452
+ # data_rewardbench = load_all_data(repo_dir_rewardbench)
453
+ # rewardbench_table.update(data_rewardbench)
454
+
455
+ # data_rewardbench_avg = avg_over_rewardbench(repo_dir_rewardbench)
456
+ # rewardbench_table.update(data_rewardbench_avg)
457
+
458
+ # data_prefs = load_all_data(repo_dir_prefs)
459
+ # pref_sets_table.update(data_prefs)
460
+
461
+ scheduler = BackgroundScheduler()
462
+ scheduler.add_job(restart_space, "interval", seconds=10800) # restarted every 3h
463
+ scheduler.start()
464
+ app.launch(allowed_paths=['src/']) # had .queue() before launch before... not sure if that's necessary
src/constants.py CHANGED
@@ -57,4 +57,4 @@ subset_mapping = {
     "Chat Hard": ['llmbar-adver-GPTInst', 'llmbar-adver-GPTOut', 'llmbar-adver-manual', 'llmbar-adver-neighbor', 'llmbar-natural', 'mt-bench-hard'],
     "Safety": ['donotanswer', 'refusals-dangerous', 'refusals-offensive', 'xstest-should-refuse', 'xstest-should-respond'],
     "Reasoning": ["hep-cpp", "hep-go", "hep-java", "hep-js", "hep-python", "hep-rust", "math-prm"]
-}
+}
src/md.py CHANGED
@@ -102,5 +102,4 @@ current_time = datetime.now(pacific_tz).strftime("%H:%M %Z, %d %b %Y")
 TOP_TEXT = f"""# RewardBench: Evaluating Reward Models
 ### Evaluating the capabilities, safety, and pitfalls of reward models
 [Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper](https://arxiv.org/abs/2403.13787) | Total models: {{}} | * Unverified models | ⚠️ Dataset Contamination | Last restart (PST): {current_time}
-
-⚠️ Many of the top models were trained on unintentionally contaminated, AI-generated data, for more information, see this [gist](https://gist.github.com/natolambert/1aed306000c13e0e8c5bc17c1a5dd300)."""
+"""
 
src/plt.py DELETED
@@ -1,53 +0,0 @@
-import matplotlib.pyplot as plt
-import pandas as pd
-from .utils import undo_hyperlink
-
-def plot_avg_correlation(df1, df2):
-    """
-    Plots the "average" column for each unique model that appears in both dataframes.
-
-    Parameters:
-    - df1: pandas DataFrame containing columns "model" and "average".
-    - df2: pandas DataFrame containing columns "model" and "average".
-    """
-    # Identify the unique models that appear in both DataFrames
-    common_models = pd.Series(list(set(df1['model']) & set(df2['model'])))
-
-    # Set up the plot
-    plt.figure(figsize=(13, 6), constrained_layout=True)
-
-    # axes from 0 to 1 for x and y
-    plt.xlim(0.475, 0.8)
-    plt.ylim(0.475, 0.8)
-
-    # larger font (16)
-    plt.rcParams.update({'font.size': 12, 'axes.labelsize': 14,'axes.titlesize': 14})
-    # plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)
-    # plt.tight_layout()
-    # plt.margins(0,0)
-
-    for model in common_models:
-        # Filter data for the current model
-        df1_model_data = df1[df1['model'] == model]['average'].values
-        df2_model_data = df2[df2['model'] == model]['average'].values
-
-        # Plotting
-        plt.scatter(df1_model_data, df2_model_data, label=model)
-        m_name = undo_hyperlink(model)
-        if m_name == "No text found":
-            m_name = "Random"
-        # Add text above each point like
-        # plt.text(x[i] + 0.1, y[i] + 0.1, label, ha='left', va='bottom')
-        plt.text(df1_model_data - .005, df2_model_data, m_name, horizontalalignment='right', verticalalignment='center')
-
-    # add correlation line to scatter plot
-    # first, compute correlation
-    corr = df1['average'].corr(df2['average'])
-    # add correlation line based on corr
-
-
-
-    plt.xlabel('HERM Eval. Set Avg.', fontsize=16)
-    plt.ylabel('Pref. Test Sets Avg.', fontsize=16)
-    # plt.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
-    return plt
src/utils.py CHANGED
@@ -6,33 +6,11 @@ import os
6
  import re
7
 
8
  UNVERIFIED_MODELS = [
9
- "nvidia/Nemotron-4-340B-Reward",
10
- "nvidia/Llama3-70B-SteerLM-RM",
11
- "Cohere May 2024",
12
- "google/gemini-1.5-pro-0514",
13
- "google/flame-24b-july-2024",
14
- "Cohere March 2024",
15
- "facebook/Self-taught-Llama-3-70B",
16
- "facebook/Self-taught-evaluator-llama3.1-70B",
17
- "google/flame-1.0-24B-july-2024",
18
- "Salesforce/SFR-LLaMa-3.1-70B-Judge-r",
19
- "Salesforce/SFR-nemo-12B-Judge-r",
20
- "Salesforce/SFR-LLaMa-3.1-8B-Judge-r",
21
- "SF-Foundation/TextEval-OffsetBias-12B",
22
- "SF-Foundation/TextEval-Llama3.1-70B",
23
- "nvidia/Llama-3.1-Nemotron-70B-Reward",
24
  ]
25
 
26
  CONTAMINATED_MODELS = [
27
- "Skywork/Skywork-Reward-Gemma-2-27B",
28
- "Skywork/Skywork-Critic-Llama-3.1-70B",
29
- "LxzGordon/URM-LLaMa-3.1-8B",
30
- "Skywork/Skywork-Reward-Llama-3.1-8B",
31
- "Ray2333/GRM-Llama3-8B-rewardmodel-ft",
32
- "nicolinho/QRM-Llama3.1-8B",
33
- "nicolinho/QRM-Llama3-8B",
34
- "general-preference/GPM-Llama-3.1-8B",
35
- "general-preference/GPM-Gemma-2B"
36
  ]
37
 
38
  # From Open LLM Leaderboard
@@ -75,97 +53,103 @@ def undo_hyperlink(html_string):
75
  def load_all_data(data_repo, subdir:str, subsubsets=False): # use HF api to pull the git repo
76
  dir = Path(data_repo)
77
  data_dir = dir / subdir
78
- orgs = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
79
- # get all files within the sub folders orgs
80
- models_results = []
81
- for org in orgs:
82
- org_dir = data_dir / org
83
- files = [f for f in os.listdir(org_dir) if os.path.isfile(os.path.join(org_dir, f))]
84
- for file in files:
85
- if file.endswith(".json"):
86
- models_results.append(org + "/" + file)
87
-
88
  # create empty dataframe to add all data to
89
  df = pd.DataFrame()
90
 
91
  # load all json data in the list models_results one by one to avoid not having the same entries
92
- for model in models_results:
93
- model_data = load_dataset("json", data_files=data_repo + subdir+ "/" + model, split="train")
 
94
  df2 = pd.DataFrame(model_data)
95
  # add to df
96
  df = pd.concat([df2, df])
 
 
97
 
98
 
99
- # remove chat_template column
100
- df = df.drop(columns=["chat_template"])
101
-
102
  # sort columns alphabetically
103
  df = df.reindex(sorted(df.columns), axis=1)
104
-
105
  # move column "model" to the front
106
  cols = list(df.columns)
107
  cols.insert(0, cols.pop(cols.index('model')))
108
  df = df.loc[:, cols]
109
 
110
- # select all columns except "model"
111
  cols = df.columns.tolist()
112
  cols.remove("model")
113
- # if model_type is a column (pref tests may not have it)
114
- if "model_type" in cols:
115
- cols.remove("model_type")
116
- # remove ref_model if in columns
117
- if "ref_model" in cols:
118
- cols.remove("ref_model")
119
- # remove model_beaker from dataframe
120
- if "model_beaker" in cols:
121
- cols.remove("model_beaker")
122
- df = df.drop(columns=["model_beaker"])
123
-
124
- # remove column xstest (outdated data)
125
- # if xstest is a column
126
- if "xstest" in cols:
127
- df = df.drop(columns=["xstest"])
128
- cols.remove("xstest")
129
-
130
- if "ref_model" in df.columns:
131
- df = df.drop(columns=["ref_model"])
132
-
133
- # remove column anthropic and summarize_prompted (outdated data)
134
- if "anthropic" in cols:
135
- df = df.drop(columns=["anthropic"])
136
- cols.remove("anthropic")
137
- if "summarize_prompted" in cols:
138
- df = df.drop(columns=["summarize_prompted"])
139
- cols.remove("summarize_prompted")
140
- # remove pku_better and pku_safer (removed from the leaderboard)
141
- if "pku_better" in cols:
142
- df = df.drop(columns=["pku_better"])
143
- cols.remove("pku_better")
144
- if "pku_safer" in cols:
145
- df = df.drop(columns=["pku_safer"])
146
- cols.remove("pku_safer")
147
-
148
- # convert to score
149
  df[cols] = (df[cols]*100)
150
- avg = np.nanmean(df[cols].values,axis=1)
151
- # add average column
152
- df["average"] = avg
153
-
154
- # apply model_hyperlink function to column "model"
155
- df["model"] = df["model"].apply(lambda x: model_hyperlink(f"https://huggingface.co/{x}", x))
156
 
157
  # move average column to the second
158
  cols = list(df.columns)
159
  cols.insert(1, cols.pop(cols.index('average')))
160
  df = df.loc[:, cols]
161
 
162
- # move model_type column to first
163
- if "model_type" in cols:
164
- cols = list(df.columns)
165
- cols.insert(1, cols.pop(cols.index('model_type')))
166
- df = df.loc[:, cols]
167
-
168
- # remove models with DPO Ref. Free as type (future work)
169
- df = df[~df["model_type"].str.contains("DPO Ref. Free", na=False)]
170
 
171
  return df
 
6
  import re
7
 
8
  UNVERIFIED_MODELS = [
9
+
10
  ]
11
 
12
  CONTAMINATED_MODELS = [
13
+
14
  ]
15
 
16
  # From Open LLM Leaderboard
 
53
  def load_all_data(data_repo, subdir:str, subsubsets=False): # use HF api to pull the git repo
54
  dir = Path(data_repo)
55
  data_dir = dir / subdir
56
+
57
+ # get all files
58
+ models_names = [f.split(".json")[0] for f in os.listdir(data_dir)
59
+ if os.path.isfile(os.path.join(data_dir, f)) and f.endswith(".json")]
60
  # create empty dataframe to add all data to
61
  df = pd.DataFrame()
62
 
63
  # load all json data in the list models_results one by one to avoid not having the same entries
64
+ for model_name in models_names:
65
+ model_data = load_dataset("json", data_files=os.path.join(data_dir, model_name + ".json"), split="train")
66
+ model_data = model_data.add_column("model", [model_name])
67
  df2 = pd.DataFrame(model_data)
68
  # add to df
69
  df = pd.concat([df2, df])
70
+
71
+ return df
72
 
73
 
74
+ def prep_df(df):
 
 
75
  # sort columns alphabetically
76
  df = df.reindex(sorted(df.columns), axis=1)
77
+
78
  # move column "model" to the front
79
  cols = list(df.columns)
80
  cols.insert(0, cols.pop(cols.index('model')))
81
  df = df.loc[:, cols]
82
+
83
+ # apply model_hyperlink function to column "model"
84
+ df["model"] = df.apply(lambda row: model_hyperlink(f"https://huggingface.co/{row['path']}", row['model']), axis=1)
85
+ df = df.drop(columns=["path"])
86
 
87
+ # select all columns except "model" and convert to score
88
  cols = df.columns.tolist()
89
  cols.remove("model")
90
+ cols = [c for c in cols if "rank" not in c]
91
  df[cols] = (df[cols]*100)
92
 
93
  # move average column to the second
94
  cols = list(df.columns)
95
  cols.insert(1, cols.pop(cols.index('average')))
96
  df = df.loc[:, cols]
97
 
98
+ df = df.rename(columns={
99
+ "model": "Model",
100
+ "average": "Average",
101
+ "brainstorm": "Brainstorm",
102
+ "open_qa": "Open QA",
103
+ "closed_qa": "Closed QA",
104
+ "extract": "Extract",
105
+ "generation": "Generation",
106
+ "rewrite": "Rewrite",
107
+ "summarize": "Summarize",
108
+ "classify": "Classify",
109
+ "reasoning_over_numerical_data": "Reasoning Over Numerical Data",
110
+ "multi-document_synthesis": "Multi-Document Synthesis",
111
+ "fact_checking_or_attributed_qa": "Fact Checking or Attributed QA",
112
+ })
113
+
114
+ # Format for different columns
115
+ # if Score exists, round to 2 decimals
116
+ # if "Average" in df.columns:
117
+ # df["Average"] = np.array([f"{v:.2f}" for v in df["Average"].values])
118
+
119
+ # # round all others to 1 decimal
120
+ # for col in df.columns:
121
+ # if col not in ["Model", "Average"]:
122
+ # # replace any df[col].values == '' with np.nan
123
+ # df[col] = df[col].replace('', np.nan)
124
+ # df[col] = np.array([f"{v:.1f}" for v in df[col].values])
125
 
126
  return df
127
+
128
+
129
+ def sort_by_category(df, category):
130
+ new_df = df.copy()
131
+ col_rank = category.lower().replace(" ", "_") + "_rank"
132
+
133
+ # sort
134
+ new_df = new_df.sort_values(by=[col_rank, category], ascending=[True, False])
135
+
136
+ # move column ranking to the front
137
+ cols = list(new_df.columns)
138
+ cols.insert(0, cols.pop(cols.index(col_rank)))
139
+ new_df = new_df.loc[:, cols]
140
+ new_df = new_df.rename(columns={col_rank: "Rank"})
141
+
142
+ # move selected column to the third
143
+ cols = list(new_df.columns)
144
+ cols.insert(2, cols.pop(cols.index(category)))
145
+ new_df = new_df.loc[:, cols]
146
+
147
+ # # move selected column to the fourth
148
+ # cols = list(new_df.columns)
149
+ # cols.insert(3, cols.pop(cols.index("Average")))
150
+ # new_df = new_df.loc[:, cols]
151
+
152
+ # drop all ranking
153
+ new_df = new_df.drop(columns=[c for c in new_df.columns if c.endswith("rank")])
154
+
155
+ return new_df
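
Taken together, the rewritten helpers above suggest a simpler pipeline: load a flat directory of per-model JSON results, prep the dataframe once, then build a rank-sorted view per category. A minimal usage sketch, assuming the app imports these helpers; the directory and subdir names are placeholders, and the per-model JSONs are assumed to carry `path`, `average`, per-category scores, and matching `*_rank` fields, as the code above expects:

```python
# Sketch only: directory names are assumptions; the helpers are the ones defined above.
from src.utils import load_all_data, prep_df, sort_by_category

raw_df = load_all_data("./results/", subdir="leaderboard")  # flat dir of <model>.json files
board = prep_df(raw_df)        # hyperlink models via "path", scale scores to 0-100, rename columns
view = sort_by_category(board, "Generation")  # sorts on the hidden "generation_rank" column
print(view[["Rank", "Model", "Generation", "Average"]].head())
```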
src/utils_old.py ADDED
@@ -0,0 +1,171 @@
1
+ import pandas as pd
2
+ from pathlib import Path
3
+ from datasets import load_dataset
4
+ import numpy as np
5
+ import os
6
+ import re
7
+
8
+ UNVERIFIED_MODELS = [
9
+ "nvidia/Nemotron-4-340B-Reward",
10
+ "nvidia/Llama3-70B-SteerLM-RM",
11
+ "Cohere May 2024",
12
+ "google/gemini-1.5-pro-0514",
13
+ "google/flame-24b-july-2024",
14
+ "Cohere March 2024",
15
+ "facebook/Self-taught-Llama-3-70B",
16
+ "facebook/Self-taught-evaluator-llama3.1-70B",
17
+ "google/flame-1.0-24B-july-2024",
18
+ "Salesforce/SFR-LLaMa-3.1-70B-Judge-r",
19
+ "Salesforce/SFR-nemo-12B-Judge-r",
20
+ "Salesforce/SFR-LLaMa-3.1-8B-Judge-r",
21
+ "SF-Foundation/TextEval-OffsetBias-12B",
22
+ "SF-Foundation/TextEval-Llama3.1-70B",
23
+ "nvidia/Llama-3.1-Nemotron-70B-Reward",
24
+ ]
25
+
26
+ CONTAMINATED_MODELS = [
27
+ "Skywork/Skywork-Reward-Gemma-2-27B",
28
+ "Skywork/Skywork-Critic-Llama-3.1-70B",
29
+ "LxzGordon/URM-LLaMa-3.1-8B",
30
+ "Skywork/Skywork-Reward-Llama-3.1-8B",
31
+ "Ray2333/GRM-Llama3-8B-rewardmodel-ft",
32
+ "nicolinho/QRM-Llama3.1-8B",
33
+ "nicolinho/QRM-Llama3-8B",
34
+ "general-preference/GPM-Llama-3.1-8B",
35
+ "general-preference/GPM-Gemma-2B"
36
+ ]
37
+
38
+ # From Open LLM Leaderboard
39
+ def model_hyperlink(link, model_name):
40
+ # if model_name is above 50 characters, return first 47 characters and "..."
41
+ if len(model_name) > 50:
42
+ model_name = model_name[:47] + "..."
43
+ if model_name == "random":
44
+ output = "random"
45
+ elif model_name == "Cohere March 2024":
46
+ output = f'<a target="_blank" href="https://huggingface.co/Cohere" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
47
+ elif "openai" == model_name.split("/")[0]:
48
+ output = f'<a target="_blank" href="https://huggingface.co/openai" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
49
+ elif "Anthropic" == model_name.split("/")[0]:
50
+ output = f'<a target="_blank" href="https://huggingface.co/Anthropic" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
51
+ elif "google" == model_name.split("/")[0]:
52
+ output = f'<a target="_blank" href="https://huggingface.co/google" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
53
+ elif "PoLL" == model_name.split("/")[0]:
54
+ output = model_name
55
+ output = f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
56
+
57
+ if model_name in UNVERIFIED_MODELS:
58
+ output += " *"
59
+ if model_name in CONTAMINATED_MODELS:
60
+ output += " ⚠️"
61
+ return output
62
+
63
+ def undo_hyperlink(html_string):
64
+ # Regex pattern to match content inside > and <
65
+ pattern = r'>[^<]+<'
66
+ match = re.search(pattern, html_string)
67
+ if match:
68
+ # Extract the matched text and remove leading '>' and trailing '<'
69
+ return match.group(0)[1:-1]
70
+ else:
71
+ return "No text found"
72
+
73
+
74
+ # Define a function to fetch and process data
75
+ def load_all_data(data_repo, subdir:str, subsubsets=False): # use HF api to pull the git repo
76
+ dir = Path(data_repo)
77
+ data_dir = dir / subdir
78
+ orgs = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
79
+ # get all files within the sub folders orgs
80
+ models_results = []
81
+ for org in orgs:
82
+ org_dir = data_dir / org
83
+ files = [f for f in os.listdir(org_dir) if os.path.isfile(os.path.join(org_dir, f))]
84
+ for file in files:
85
+ if file.endswith(".json"):
86
+ models_results.append(org + "/" + file)
87
+
88
+ # create empty dataframe to add all data to
89
+ df = pd.DataFrame()
90
+
91
+ # load all json data in the list models_results one by one to avoid not having the same entries
92
+ for model in models_results:
93
+ model_data = load_dataset("json", data_files=data_repo + subdir+ "/" + model, split="train")
94
+ df2 = pd.DataFrame(model_data)
95
+ # add to df
96
+ df = pd.concat([df2, df])
97
+
98
+
99
+ # remove chat_template column
100
+ df = df.drop(columns=["chat_template"])
101
+
102
+ # sort columns alphabetically
103
+ df = df.reindex(sorted(df.columns), axis=1)
104
+
105
+ # move column "model" to the front
106
+ cols = list(df.columns)
107
+ cols.insert(0, cols.pop(cols.index('model')))
108
+ df = df.loc[:, cols]
109
+
110
+ # select all columns except "model"
111
+ cols = df.columns.tolist()
112
+ cols.remove("model")
113
+ # if model_type is a column (pref tests may not have it)
114
+ if "model_type" in cols:
115
+ cols.remove("model_type")
116
+ # remove ref_model if in columns
117
+ if "ref_model" in cols:
118
+ cols.remove("ref_model")
119
+ # remove model_beaker from dataframe
120
+ if "model_beaker" in cols:
121
+ cols.remove("model_beaker")
122
+ df = df.drop(columns=["model_beaker"])
123
+
124
+ # remove column xstest (outdated data)
125
+ # if xstest is a column
126
+ if "xstest" in cols:
127
+ df = df.drop(columns=["xstest"])
128
+ cols.remove("xstest")
129
+
130
+ if "ref_model" in df.columns:
131
+ df = df.drop(columns=["ref_model"])
132
+
133
+ # remove column anthropic and summarize_prompted (outdated data)
134
+ if "anthropic" in cols:
135
+ df = df.drop(columns=["anthropic"])
136
+ cols.remove("anthropic")
137
+ if "summarize_prompted" in cols:
138
+ df = df.drop(columns=["summarize_prompted"])
139
+ cols.remove("summarize_prompted")
140
+ # remove pku_better and pku_safer (removed from the leaderboard)
141
+ if "pku_better" in cols:
142
+ df = df.drop(columns=["pku_better"])
143
+ cols.remove("pku_better")
144
+ if "pku_safer" in cols:
145
+ df = df.drop(columns=["pku_safer"])
146
+ cols.remove("pku_safer")
147
+
148
+ # convert to score
149
+ df[cols] = (df[cols]*100)
150
+ avg = np.nanmean(df[cols].values,axis=1)
151
+ # add average column
152
+ df["average"] = avg
153
+
154
+ # apply model_hyperlink function to column "model"
155
+ df["model"] = df["model"].apply(lambda x: model_hyperlink(f"https://huggingface.co/{x}", x))
156
+
157
+ # move average column to the second
158
+ cols = list(df.columns)
159
+ cols.insert(1, cols.pop(cols.index('average')))
160
+ df = df.loc[:, cols]
161
+
162
+ # move model_type column to first
163
+ if "model_type" in cols:
164
+ cols = list(df.columns)
165
+ cols.insert(1, cols.pop(cols.index('model_type')))
166
+ df = df.loc[:, cols]
167
+
168
+ # remove models with DPO Ref. Free as type (future work)
169
+ df = df[~df["model_type"].str.contains("DPO Ref. Free", na=False)]
170
+
171
+ return df
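
The retired loader above still expects the old nested layout (one directory per org, with the model name stored inside each JSON), whereas the new `load_all_data` in `src/utils.py` reads a flat directory and derives the model name from the filename. A hedged sketch of a results file the new loader could consume; every path, field name, and score below is illustrative, not taken from the actual results repo:

```python
import json, os

# Illustrative only: directory, model name, and scores are made-up placeholders.
os.makedirs("results/leaderboard", exist_ok=True)
record = {
    "path": "some-org/some-model",  # prep_df uses this to build the Hugging Face hyperlink
    "average": 0.81,
    "generation": 0.84,
    "generation_rank": 5,           # per-category rank consumed by sort_by_category
}
with open("results/leaderboard/some-model.json", "w") as f:
    json.dump(record, f)            # the filename (minus .json) becomes the "model" column
```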