natolambert
committed on
Commit
•
874c0c9
1
Parent(s):
18596de
up
Browse files
app.py
CHANGED
@@ -211,21 +211,24 @@ def regex_table(dataframe, regex, filter_button):
|
|
211 |
|
212 |
# if Score exists, round to 2 decimals
|
213 |
if "Score" in data.columns:
|
214 |
-
data["Score"] = data["Score"].
|
215 |
if "Average" in data.columns:
|
216 |
-
data["Average"] = data["Average"].
|
217 |
# round all others to 1 decimal
|
218 |
for col in data.columns:
|
219 |
if col not in ["", "Model", "Model Type", "Score", "Average"]:
|
220 |
-
data[col] = data[col].
|
221 |
return data
|
222 |
|
|
|
|
|
|
|
223 |
|
224 |
with gr.Blocks(css=custom_css) as app:
|
225 |
# create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
|
226 |
with gr.Row():
|
227 |
with gr.Column(scale=6):
|
228 |
-
gr.Markdown(TOP_TEXT)
|
229 |
with gr.Column(scale=4):
|
230 |
# search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
|
231 |
# filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
|
|
|
211 |
|
212 |
# if Score exists, round to 2 decimals
|
213 |
if "Score" in data.columns:
|
214 |
+
data["Score"] = np.round(np.array(data["Score"].values).astype(float), 2)
|
215 |
if "Average" in data.columns:
|
216 |
+
data["Average"] = np.round(np.array(data["Average"].values).astype(float), 1)
|
217 |
# round all others to 1 decimal
|
218 |
for col in data.columns:
|
219 |
if col not in ["", "Model", "Model Type", "Score", "Average"]:
|
220 |
+
data[col] = np.round(np.array(data[col].values).astype(float), 1)
|
221 |
return data
|
222 |
|
223 |
+
# import ipdb; ipdb.set_trace()
|
224 |
+
|
225 |
+
total_models = len(regex_table(rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]).values)
|
226 |
|
227 |
with gr.Blocks(css=custom_css) as app:
|
228 |
# create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
|
229 |
with gr.Row():
|
230 |
with gr.Column(scale=6):
|
231 |
+
gr.Markdown(TOP_TEXT.format(str(total_models)))
|
232 |
with gr.Column(scale=4):
|
233 |
# search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
|
234 |
# filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
|
src/md.py
CHANGED
@@ -97,5 +97,5 @@ For more details, see the [dataset](https://huggingface.co/datasets/allenai/rewa
|
|
97 |
TOP_TEXT = """
|
98 |
# RewardBench: Evaluating Reward Models
|
99 |
### Evaluating the capabilities, safety, and pitfalls of reward models
|
100 |
-
[Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper](https://arxiv.org/abs/2403.13787)
|
101 |
"""
|
|
|
97 |
TOP_TEXT = """
|
98 |
# RewardBench: Evaluating Reward Models
|
99 |
### Evaluating the capabilities, safety, and pitfalls of reward models
|
100 |
+
[Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper](https://arxiv.org/abs/2403.13787) | Total models: {}
|
101 |
"""
|