natolambert committed · Commit 9f4ce43 · Parent(s): bb95637

major update

Changed files:
- app.py (+16 -12)
- src/constants.py (+3 -2)
- src/logo.png (+0 -0)
- src/md.py (+14 -5)
- src/utils.py (+3 -0)
app.py
CHANGED

@@ -42,7 +42,7 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
     2. Chat Hard: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
     3. Safety: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
     4. Reasoning: Includes the code and math subsets (math-prm, hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
-    5.
+    5. Prior Sets: Includes the test sets (anthropic_helpful, mtbench_human, shp, summarize)
     """
     new_df = dataframe_core.copy()
     dataframe_prefs = dataframe_prefs.copy()

@@ -61,28 +61,28 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
     new_df = new_df[keep_columns]

     # selected average from pref_sets
-    pref_columns = ["anthropic_helpful", "anthropic_hhh", "
+    pref_columns = ["anthropic_helpful", "anthropic_hhh", "shp", "summarize"]
     pref_data = dataframe_prefs[pref_columns].values

     # add column test sets knowing the rows are not identical, take superset
-    dataframe_prefs["
+    dataframe_prefs["Prior Sets"] = np.round(np.nanmean(pref_data, axis=1), 2)

     # add column Test Sets empty to new_df
-    new_df["
-    # per row in new_df if model is in dataframe_prefs, add the value to new_df["
+    new_df["Prior Sets"] = np.nan
+    # per row in new_df if model is in dataframe_prefs, add the value to new_df["Prior Sets"]
     values = []
     for i, row in new_df.iterrows():
         model = row["model"]
         if model in dataframe_prefs["model"].values:
-            values.append(dataframe_prefs[dataframe_prefs["model"] == model]["
-            # new_df.at[i, "
+            values.append(dataframe_prefs[dataframe_prefs["model"] == model]["Prior Sets"].values[0])
+            # new_df.at[i, "Prior Sets"] = dataframe_prefs[dataframe_prefs["model"] == model]["Prior Sets"].values[0]
         else:
             values.append(np.nan)

-    new_df["
+    new_df["Prior Sets"] = values

     # add total average
-    data_cols += ["
+    data_cols += ["Prior Sets"]
     new_df["average"] = np.round(np.nanmean(new_df[data_cols].values, axis=1), 2)

     # make average third column

@@ -197,7 +197,11 @@ def regex_table(dataframe, regex, filter_button):
     if "Custom Classifiers" not in filter_button:
         dataframe = dataframe[~dataframe["Model Type"].str.contains("Custom Classifier", case=False, na=False)]
     # Filter the dataframe such that 'model' contains any of the regex patterns
-
+    data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
+
+    # replace column '' with count/rank
+    data[''] = np.arange(1, 1 + len(data))
+    return data


 with gr.Blocks(css=custom_css) as app:

@@ -280,7 +284,7 @@ with gr.Blocks(css=custom_css) as app:
         #     elem_id="rewardbench_dataframe_length",
         #     height=1000,
         # )
-        with gr.TabItem("
+        with gr.TabItem("Prior Test Sets"):
            with gr.Row():
                search_3 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
                model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],

@@ -291,7 +295,7 @@ with gr.Blocks(css=custom_css) as app:
                )
            with gr.Row():
                PREF_SET_TEXT = """
-                For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets). Only the subsets Anthropic
+                For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets). Only the subsets Anthropic Helpful, Anthropic HHH, Stanford SHP, and OpenAI's Summarize data are used in the leaderboard ranking.
                """
                gr.Markdown(PREF_SET_TEXT)
            with gr.Row():
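Note: the added block averages four prior test sets and merges the result into the leaderboard frame by looping over rows. A minimal sketch of the same average-then-merge logic on hypothetical toy frames (the data is made up, and the `.map`-based merge is an equivalent alternative to the row loop above, not the app's exact code):

```python
# Toy reproduction of the "Prior Sets" column added in this commit.
# The dataframes are hypothetical; column names mirror app.py.
import numpy as np
import pandas as pd

dataframe_prefs = pd.DataFrame({
    "model": ["rm-a", "rm-b"],
    "anthropic_helpful": [0.71, 0.65],
    "anthropic_hhh": [0.80, np.nan],
    "shp": [0.62, 0.58],
    "summarize": [0.69, 0.70],
})
new_df = pd.DataFrame({"model": ["rm-a", "rm-c"]})

# selected average over the prior test sets, ignoring missing entries
pref_columns = ["anthropic_helpful", "anthropic_hhh", "shp", "summarize"]
dataframe_prefs["Prior Sets"] = np.round(
    np.nanmean(dataframe_prefs[pref_columns].values, axis=1), 2
)

# the two frames do not share identical rows, so map scores by model name;
# models absent from dataframe_prefs end up with NaN, as in the app's loop
new_df["Prior Sets"] = new_df["model"].map(
    dataframe_prefs.set_index("model")["Prior Sets"]
)
print(new_df)  # rm-a gets 0.7, rm-c gets NaN
```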
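The `regex_table` change materializes the filtered view and writes a rank into the unnamed first column. A small sketch of that filter-and-rank step on toy data; `combined_regex` is assumed here to be joined from the comma-delimited search box, and the frame is made up:

```python
# Sketch of regex_table's new filter-and-rank step, with hypothetical data.
import numpy as np
import pandas as pd

dataframe = pd.DataFrame({"": [0, 0, 0],
                          "Model": ["llama-rm", "mistral-rm", "gpt-judge"]})
searches = "llama, mistral"  # example search-box input
combined_regex = "|".join(s.strip() for s in searches.split(","))

# keep only rows whose Model matches any search term, then rank them 1..N
data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)].copy()
data[''] = np.arange(1, 1 + len(data))
print(data)  # llama-rm and mistral-rm, ranked 1 and 2
```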
src/constants.py
CHANGED

@@ -15,6 +15,7 @@ length_categories = {
     'llmbar-adver-manual': 'False',
     'llmbar-adver-neighbor': 'False',
     'llmbar-natural': 'Neutral',
+    'math-prm': 'Neutral',
     'mt-bench-easy': 'False',
     'mt-bench-hard': 'False',
     'mt-bench-med': 'Neutral',

@@ -31,7 +32,7 @@ example_counts = {
     "mt-bench-easy": 28,
     "mt-bench-med": 40,
     "mt-bench-hard": 37,
-
+    "math-prm": 984,  # actual length 447, upweighting to be equal to code
     "refusals-dangerous": 100,
     "refusals-offensive": 100,
     "llmbar-natural": 100,

@@ -54,6 +55,6 @@ subset_mapping = {
     "Chat": ["alpacaeval-easy", "alpacaeval-length", "alpacaeval-hard", "mt-bench-easy", "mt-bench-med"],
     "Chat Hard": ["mt-bench-hard", "llmbar-natural", "llmbar-adver-neighbor", "llmbar-adver-GPTInst", "llmbar-adver-GPTOut", "llmbar-adver-manual"],
     "Safety": ["refusals-dangerous", "refusals-offensive", "xstest-should-refuse", "xstest-should-respond", "donotanswer"],
-    "Reasoning": [
+    "Reasoning": ["math-prm",
         "hep-cpp", "hep-go", "hep-java", "hep-js", "hep-python", "hep-rust"]
 }
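The `math-prm` entry is deliberately not the subset's true size: 984 is 6 × 164, so under per-prompt weighting math counts as much as six code subsets combined. A hedged sketch of how such a weighted section score can be computed from these tables (the 164-prompt hep counts and the accuracies are assumptions; the repo's actual aggregation code may differ):

```python
# Sketch of per-prompt weighting with the upweighted math-prm count.
# The hep counts of 164 and all accuracies below are illustrative assumptions.
import numpy as np

example_counts = {"math-prm": 984, "hep-cpp": 164, "hep-go": 164, "hep-java": 164,
                  "hep-js": 164, "hep-python": 164, "hep-rust": 164}
subset_mapping = {"Reasoning": ["math-prm", "hep-cpp", "hep-go", "hep-java",
                                "hep-js", "hep-python", "hep-rust"]}
accuracies = {"math-prm": 0.90, "hep-cpp": 0.70, "hep-go": 0.70, "hep-java": 0.70,
              "hep-js": 0.70, "hep-python": 0.70, "hep-rust": 0.70}  # hypothetical

def section_score(section: str) -> float:
    subsets = subset_mapping[section]
    weights = [example_counts[s] for s in subsets]
    scores = [accuracies[s] for s in subsets]
    # per-prompt weighted mean over the section's subsets
    return float(np.average(scores, weights=weights))

# math-prm's 984 (= 6 * 164) gives math and code equal halves of the section
print(round(section_score("Reasoning"), 3))  # 0.8
```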
src/logo.png
CHANGED
src/md.py
CHANGED

@@ -9,7 +9,7 @@ We average over 4 core sections (per prompt weighting):
 2. **Chat Hard**: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
 3. **Safety**: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
 4. **Reasoning**: Includes the code and math subsets (math-prm, hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
-5. **
+5. **Prior Sets**: Includes the test sets ([anthropic_helpful](https://huggingface.co/datasets/Anthropic/hh-rlhf), [anthropic_hhh](https://huggingface.co/datasets/HuggingFaceH4/hhh_alignment), [mtbench_human](https://huggingface.co/datasets/lmsys/mt_bench_human_judgments), [shp](https://huggingface.co/datasets/stanfordnlp/SHP), [summarize](https://huggingface.co/datasets/openai/summarize_from_feedback))

 We include multiple types of reward models in this evaluation:
 1. **Sequence Classifiers** (Seq. Classifier): A model, normally trained with HuggingFace AutoModelForSequenceClassification, that takes in a prompt and a response and outputs a score.

@@ -20,9 +20,18 @@ We include multiple types of reward models in this evaluation:
 All models are evaluated in fp16 expect for Starling-7B, which is evaluated in fp32.
 Others, such as **Generative Judge** are coming soon.

+### Model Types
+
+Currently, we evaluate the following model types:
+1. **Sequence Classifiers**: A model, normally trained with HuggingFace AutoModelForSequenceClassification, that takes in a prompt and a response and outputs a score.
+2. **Custom Classifiers**: Research models with different architectures and training objectives to either take in two inputs at once or generate scores differently (e.g. PairRM and Stanford SteamSHP).
+3. **DPO**: Models trained with Direct Preference Optimization (DPO) with a reference model being either the base or supervised fine-tuning checkpoint.
+
+Support of DPO models without a reference model is coming soon.
+
 ### Subset Details

-Total number of the prompts is:
+Total number of the prompts is: 2985, filtered from 5123.

 | Subset | Num. Samples (Pre-filtering, post-filtering) | Description |
 | :---------- | :-----: | :---------: |

@@ -77,11 +86,11 @@ Lengths (mean, std. dev.) include the prompt
 | xstest-should-refuse | False | 584 (419) | 904 (493) | 129 (89) | 217 (115) | 81 (47) | 116 (53) |
 | xstest-should-respond | True | 771 (420) | 466 (427) | 189 (105) | 107 (94) | 104 (48) | 67 (48) |

-For more details, see the [dataset](https://huggingface.co/datasets/
+For more details, see the [dataset](https://huggingface.co/datasets/allenai/reward-bench).
 """

 TOP_TEXT = """
-# RewardBench:
+# RewardBench: Evaluating Reward Models
 ### Evaluating the capabilities, safety, and pitfalls of reward models
-[Code](https://github.com/allenai/
+[Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Classic Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | Paper (coming soon)
 """
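The new **DPO** bullet says scoring these models requires a reference model; that is because a DPO-trained model's reward is implicit in the log-probability ratio between the policy and its reference. A sketch of the standard DPO implicit reward (the beta value and the log-probabilities are illustrative):

```python
# Standard implicit reward of a DPO-trained model:
#     r(x, y) = beta * (log pi(y|x) - log pi_ref(y|x))
# which is also why "DPO Ref. Free" models are deferred (see src/utils.py below).
def dpo_reward(logprob_policy: float, logprob_ref: float, beta: float = 0.1) -> float:
    return beta * (logprob_policy - logprob_ref)

# a response the policy likes more than the reference does scores positively
print(dpo_reward(logprob_policy=-12.3, logprob_ref=-15.0))  # approx. 0.27
```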
src/utils.py
CHANGED

@@ -116,4 +116,7 @@ def load_all_data(data_repo, subdir:str, subsubsets=False): # use HF api to p
     cols.insert(1, cols.pop(cols.index('model_type')))
     df = df.loc[:, cols]

+    # remove models with DPO Ref. Free as type (future work)
+    df = df[~df["model_type"].str.contains("DPO Ref. Free", na=False)]
+
     return df
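The filter leans on `na=False` so that rows with a missing `model_type` survive the exclusion instead of turning the boolean mask into NaN. A tiny check on toy data:

```python
# Toy check of the na=False behavior in the new model_type filter.
import pandas as pd

df = pd.DataFrame({"model": ["a", "b", "c"],
                   "model_type": ["DPO", "DPO Ref. Free", None]})
df = df[~df["model_type"].str.contains("DPO Ref. Free", na=False)]
print(df["model"].tolist())  # ['a', 'c']: the row with a missing type is kept
```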