natolambert
committed on
Commit
·
bbe05a0
1
Parent(s):
c8a4819
style
Browse files- app.py +21 -4
- src/css.py +22 -0
- src/md.py +3 -6
app.py
CHANGED
@@ -7,6 +7,7 @@ from src.utils import load_all_data
|
|
7 |
from src.md import ABOUT_TEXT, TOP_TEXT
|
8 |
from src.plt import plot_avg_correlation
|
9 |
from src.constants import subset_mapping, length_categories, example_counts
|
|
|
10 |
import numpy as np
|
11 |
|
12 |
api = HfApi()
|
@@ -185,18 +186,18 @@ def regex_table(dataframe, regex, filter_button):
|
|
185 |
return dataframe[dataframe["model"].str.contains(combined_regex, case=False, na=False)]
|
186 |
|
187 |
|
188 |
-
with gr.Blocks() as app:
|
189 |
# create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
|
190 |
with gr.Row():
|
191 |
-
with gr.Column(scale=
|
192 |
-
gr.Markdown(TOP_TEXT)
|
193 |
-
with gr.Column(scale=2.2):
|
194 |
# search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
|
195 |
# filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
|
196 |
# img = gr.Image(value="https://private-user-images.githubusercontent.com/10695622/310698241-24ed272a-0844-451f-b414-fde57478703e.png", width=500)
|
197 |
gr.Markdown("""
|
198 |
![](file/src/logo.png)
|
199 |
""")
|
|
|
|
|
200 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
201 |
with gr.TabItem("🏆 RewardBench Leaderboard"):
|
202 |
with gr.Row():
|
@@ -321,6 +322,22 @@ with gr.Blocks() as app:
|
|
321 |
model_types_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
|
322 |
model_types_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)
|
323 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
324 |
# Load data when app starts, TODO make this used somewhere...
|
325 |
# def load_data_on_start():
|
326 |
# data_rewardbench = load_all_data(repo_dir_rewardbench)
|
|
|
7 |
from src.md import ABOUT_TEXT, TOP_TEXT
|
8 |
from src.plt import plot_avg_correlation
|
9 |
from src.constants import subset_mapping, length_categories, example_counts
|
10 |
+
from src.css import custom_css
|
11 |
import numpy as np
|
12 |
|
13 |
api = HfApi()
|
|
|
186 |
return dataframe[dataframe["model"].str.contains(combined_regex, case=False, na=False)]
|
187 |
|
188 |
|
189 |
+
with gr.Blocks(css=custom_css) as app:
|
190 |
# create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
|
191 |
with gr.Row():
|
192 |
+
with gr.Column(scale=1.65):
|
|
|
|
|
193 |
# search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
|
194 |
# filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
|
195 |
# img = gr.Image(value="https://private-user-images.githubusercontent.com/10695622/310698241-24ed272a-0844-451f-b414-fde57478703e.png", width=500)
|
196 |
gr.Markdown("""
|
197 |
![](file/src/logo.png)
|
198 |
""")
|
199 |
+
with gr.Column(scale=3):
|
200 |
+
gr.Markdown(TOP_TEXT)
|
201 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
202 |
with gr.TabItem("🏆 RewardBench Leaderboard"):
|
203 |
with gr.Row():
|
|
|
322 |
model_types_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
|
323 |
model_types_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)
|
324 |
|
325 |
+
with gr.Row():
|
326 |
+
with gr.Accordion("📙 Citation", open=False):
|
327 |
+
citation_button = gr.Textbox(
|
328 |
+
value=r"""
|
329 |
+
@misc{RewardBench,
|
330 |
+
title={RewardBench: Benchmarking Reward Models},
|
331 |
+
author={Lambert, Nathan and Pyatkin, Valentina and Morrison, Jacob and Miranda, LJ and Lin, Bill Yuchen and Chandu, Khyathi and Dziri, Nouha and Kumar, Sachin and Zick, Tom and Choi, Yejin and Smith, Noah A. and Hajishirzi, Hannaneh},
|
332 |
+
year={2024},
|
333 |
+
howpublished={\url{https://huggingface.co/spaces/allenai/reward-bench}}
|
334 |
+
}
|
335 |
+
""",
|
336 |
+
height=15,
|
337 |
+
label="Copy the following to cite these results.",
|
338 |
+
elem_id="citation-button",
|
339 |
+
show_copy_button=True,
|
340 |
+
)
|
341 |
# Load data when app starts, TODO make this used somewhere...
|
342 |
# def load_data_on_start():
|
343 |
# data_rewardbench = load_all_data(repo_dir_rewardbench)
|
src/css.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
custom_css = """
|
2 |
+
|
3 |
+
/* Full width space */
|
4 |
+
.gradio-container {
|
5 |
+
max-width: 95%;
|
6 |
+
}
|
7 |
+
|
8 |
+
/* Text style and margins */
|
9 |
+
.markdown-text {
|
10 |
+
font-size: 17px !important;
|
11 |
+
}
|
12 |
+
|
13 |
+
.tab-buttons button {
|
14 |
+
font-size: 20px;
|
15 |
+
}
|
16 |
+
|
17 |
+
h1 {
|
18 |
+
font-size: 32px !important;
|
19 |
+
margin-top: 0px !important;
|
20 |
+
}
|
21 |
+
|
22 |
+
"""
|
src/md.py
CHANGED
@@ -16,6 +16,7 @@ We include multiple types of reward models in this evaluation:
|
|
16 |
3. **DPO**: Models trained with Direct Preference Optimization (DPO), with modifiers such as `-ref-free` or `-norm` changing how scores are computed.
|
17 |
4. **Random**: Random choice baseline.
|
18 |
|
|
|
19 |
Others, such as **Generative Judge** are coming soon.
|
20 |
|
21 |
### Subset Details
|
@@ -78,11 +79,7 @@ For more details, see the [dataset](https://huggingface.co/datasets/ai2-rlhf-col
|
|
78 |
"""
|
79 |
|
80 |
TOP_TEXT = """
|
81 |
-
# RewardBench
|
82 |
-
|
83 |
-
Evaluating the capabilities, safety, and pitfalls of reward models.
|
84 |
-
|
85 |
[Code](https://github.com/allenai/herm) | [Eval. Dataset](https://huggingface.co/datasets/ai2-adapt-dev/rm-benchmark-dev) | [Existing Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/ai2-adapt-dev/HERM-Results) | Paper (coming soon)
|
86 |
-
|
87 |
-
All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
|
88 |
"""
|
|
|
16 |
3. **DPO**: Models trained with Direct Preference Optimization (DPO), with modifiers such as `-ref-free` or `-norm` changing how scores are computed.
|
17 |
4. **Random**: Random choice baseline.
|
18 |
|
19 |
+
All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
|
20 |
Others, such as **Generative Judge** are coming soon.
|
21 |
|
22 |
### Subset Details
|
|
|
79 |
"""
|
80 |
|
81 |
TOP_TEXT = """
|
82 |
+
# RewardBench: Benchmarking Reward Models
|
83 |
+
### Evaluating the capabilities, safety, and pitfalls of reward models
|
|
|
|
|
84 |
[Code](https://github.com/allenai/herm) | [Eval. Dataset](https://huggingface.co/datasets/ai2-adapt-dev/rm-benchmark-dev) | [Existing Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/ai2-adapt-dev/HERM-Results) | Paper (coming soon)
|
|
|
|
|
85 |
"""
|