natolambert
committed on
Commit
•
e5d5995
1
Parent(s):
8e499f4
smol improvements
Browse files- app.py +38 -21
- requirements.txt +1 -0
app.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import gradio as gr
|
2 |
import os
|
3 |
from huggingface_hub import HfApi, snapshot_download
|
|
|
4 |
from datasets import load_dataset
|
5 |
from src.utils import load_all_data
|
6 |
from src.md import ABOUT_TEXT
|
@@ -15,10 +16,8 @@ eval_set_repo = "ai2-rlhf-collab/rm-benchmark-dev"
|
|
15 |
repo_dir_herm = "./evals/herm/"
|
16 |
repo_dir_prefs = "./evals/prefs/"
|
17 |
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
|
23 |
print("Pulling evaluation results")
|
24 |
repo = snapshot_download(
|
@@ -43,17 +42,18 @@ def avg_over_herm(dataframe):
|
|
43 |
"""
|
44 |
Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
|
45 |
"""
|
|
|
46 |
subsets = ["alpacaeval", "mt-bench", "llmbar", "refusals", "hep"]
|
47 |
# for each subset, avg the columns that have the subset in the column name, then add a new column with subset name and avg
|
48 |
for subset in subsets:
|
49 |
-
subset_cols = [col for col in
|
50 |
-
|
51 |
|
52 |
keep_columns = ["model", "average"] + subsets
|
53 |
-
|
54 |
# replace average column with new average
|
55 |
-
|
56 |
-
return
|
57 |
|
58 |
def expand_subsets(dataframe):
|
59 |
# TODO need to modify data/ script to do this
|
@@ -71,12 +71,23 @@ col_types_prefs = ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
|
|
71 |
|
72 |
# for showing random samples
|
73 |
eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")
|
74 |
-
def random_sample(r: gr.Request):
|
75 |
-
|
76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
markdown_text = '\n\n'.join([f"**{key}**: {value}" for key, value in sample.items()])
|
78 |
return markdown_text
|
79 |
|
|
|
|
|
80 |
with gr.Blocks() as app:
|
81 |
# create tabs for the app, moving the current table to one titled "HERM" and the benchmark_text to a tab called "About"
|
82 |
with gr.Row():
|
@@ -114,23 +125,29 @@ with gr.Blocks() as app:
|
|
114 |
with gr.Row():
|
115 |
# loads one sample
|
116 |
gr.Markdown("## Random Dataset Sample Viewer")
|
|
|
117 |
button = gr.Button("Show Random Sample")
|
118 |
|
119 |
with gr.Row():
|
120 |
sample_display = gr.Markdown("{sampled data loads here}")
|
121 |
|
122 |
-
button.click(fn=random_sample, outputs=sample_display)
|
123 |
|
124 |
|
125 |
# Load data when app starts, TODO make this used somewhere...
|
126 |
-
def load_data_on_start():
|
127 |
-
|
128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
|
130 |
-
|
131 |
-
|
|
|
132 |
|
133 |
-
data_prefs = load_all_data(repo_dir_prefs)
|
134 |
-
pref_sets_table.update(data_prefs)
|
135 |
|
136 |
-
app.launch()
|
|
|
1 |
import gradio as gr
|
2 |
import os
|
3 |
from huggingface_hub import HfApi, snapshot_download
|
4 |
+
from apscheduler.schedulers.background import BackgroundScheduler
|
5 |
from datasets import load_dataset
|
6 |
from src.utils import load_all_data
|
7 |
from src.md import ABOUT_TEXT
|
|
|
16 |
repo_dir_herm = "./evals/herm/"
|
17 |
repo_dir_prefs = "./evals/prefs/"
|
18 |
|
19 |
+
def restart_space():
    """Call ``api.restart_space`` for the benchmark viewer repo.

    Takes no arguments and returns nothing; the request is authenticated
    with the module-level ``COLLAB_TOKEN``.
    """
    space_id = "ai2-rlhf-collab/rm-benchmark-viewer"
    api.restart_space(repo_id=space_id, token=COLLAB_TOKEN)
|
|
|
|
|
21 |
|
22 |
print("Pulling evaluation results")
|
23 |
repo = snapshot_download(
|
|
|
42 |
"""
|
43 |
Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
|
44 |
"""
|
45 |
+
new_df = dataframe.copy()
|
46 |
subsets = ["alpacaeval", "mt-bench", "llmbar", "refusals", "hep"]
|
47 |
# for each subset, avg the columns that have the subset in the column name, then add a new column with subset name and avg
|
48 |
for subset in subsets:
|
49 |
+
subset_cols = [col for col in new_df.columns if subset in col]
|
50 |
+
new_df[subset] = np.round(np.nanmean(new_df[subset_cols].values, axis=1), 2)
|
51 |
|
52 |
keep_columns = ["model", "average"] + subsets
|
53 |
+
new_df = new_df[keep_columns]
|
54 |
# replace average column with new average
|
55 |
+
new_df["average"] = np.round(np.nanmean(new_df[subsets].values, axis=1), 2)
|
56 |
+
return new_df
|
57 |
|
58 |
def expand_subsets(dataframe):
|
59 |
# TODO need to modify data/ script to do this
|
|
|
71 |
|
72 |
# for showing random samples
|
73 |
eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")
|
74 |
+
def random_sample(r: gr.Request, subset):
    """Return one randomly chosen row of ``eval_set`` rendered as Markdown.

    Args:
        r: Gradio request object; not used by this function.
        subset: ``None`` or an empty list to sample from the whole dataset,
            or a subset name / list of subset names to restrict sampling to
            rows whose ``"subset"`` field is one of them.

    Returns:
        A Markdown string with one ``**key**: value`` paragraph per field of
        the sampled row, or a short notice when no row matches.
    """
    if subset is None or subset == []:
        candidates = eval_set
    else:
        # The selector may hand over a single name or a list of names.
        if isinstance(subset, str):
            subset = [subset]
        # Keep only the rows that belong to the requested subset(s).
        candidates = eval_set.filter(lambda x: x["subset"] in subset)

    num_candidates = len(candidates)
    if num_candidates == 0:
        # Nothing to draw from; randint(0, 0) would raise ValueError.
        return "No samples available for the selected subset(s)."

    # np.random.randint excludes its upper bound, so pass the length itself;
    # using ``len - 1`` would never pick the last row and would fail outright
    # when exactly one row is available.
    sample_index = np.random.randint(0, num_candidates)
    sample = candidates[sample_index]

    markdown_text = '\n\n'.join([f"**{key}**: {value}" for key, value in sample.items()])
    return markdown_text
|
88 |
|
89 |
+
subsets = eval_set.unique("subset")
|
90 |
+
|
91 |
with gr.Blocks() as app:
|
92 |
# create tabs for the app, moving the current table to one titled "HERM" and the benchmark_text to a tab called "About"
|
93 |
with gr.Row():
|
|
|
125 |
with gr.Row():
|
126 |
# loads one sample
|
127 |
gr.Markdown("## Random Dataset Sample Viewer")
|
128 |
+
subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
|
129 |
button = gr.Button("Show Random Sample")
|
130 |
|
131 |
with gr.Row():
|
132 |
sample_display = gr.Markdown("{sampled data loads here}")
|
133 |
|
134 |
+
button.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])
|
135 |
|
136 |
|
137 |
# Load data when app starts, TODO make this used somewhere...
|
138 |
+
# def load_data_on_start():
|
139 |
+
# data_herm = load_all_data(repo_dir_herm)
|
140 |
+
# herm_table.update(data_herm)
|
141 |
+
|
142 |
+
# data_herm_avg = avg_over_herm(repo_dir_herm)
|
143 |
+
# herm_table.update(data_herm_avg)
|
144 |
+
|
145 |
+
# data_prefs = load_all_data(repo_dir_prefs)
|
146 |
+
# pref_sets_table.update(data_prefs)
|
147 |
|
148 |
+
scheduler = BackgroundScheduler()
|
149 |
+
scheduler.add_job(restart_space, "interval", seconds=10800) # restarted every 3h
|
150 |
+
scheduler.start()
|
151 |
|
|
|
|
|
152 |
|
153 |
+
app.queue().launch()
|
requirements.txt
CHANGED
@@ -1,2 +1,3 @@
|
|
|
|
1 |
pandas
|
2 |
datasets
|
|
|
1 |
+
APScheduler==3.10.1
|
2 |
pandas
|
3 |
datasets
|