Spaces:
Running
Running
natolambert
committed on
Commit
·
1d33a30
1
Parent(s):
9af70d6
add generative default off
Browse files- app.py +7 -3
- src/md.py +1 -0
- src/utils.py +5 -1
app.py
CHANGED
@@ -203,6 +203,8 @@ def regex_table(dataframe, regex, filter_button):
|
|
203 |
dataframe = dataframe[~dataframe["Model Type"].str.contains("DPO", case=False, na=False)]
|
204 |
if "Custom Classifiers" not in filter_button:
|
205 |
dataframe = dataframe[~dataframe["Model Type"].str.contains("Custom Classifier", case=False, na=False)]
|
|
|
|
|
206 |
# Filter the dataframe such that 'model' contains any of the regex patterns
|
207 |
data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
|
208 |
|
@@ -217,6 +219,8 @@ def regex_table(dataframe, regex, filter_button):
|
|
217 |
# round all others to 1 decimal
|
218 |
for col in data.columns:
|
219 |
if col not in ["", "Model", "Model Type", "Score", "Average"]:
|
|
|
|
|
220 |
data[col] = np.round(np.array(data[col].values).astype(float), 1)
|
221 |
return data
|
222 |
|
@@ -242,7 +246,7 @@ with gr.Blocks(css=custom_css) as app:
|
|
242 |
search_1 = gr.Textbox(label="Model Search (delimit with , )",
|
243 |
placeholder="Model Search (delimit with , )",
|
244 |
show_label=False)
|
245 |
-
model_types_1 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
|
246 |
value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
|
247 |
label="Model Types",
|
248 |
show_label=False,
|
@@ -267,7 +271,7 @@ with gr.Blocks(css=custom_css) as app:
|
|
267 |
with gr.TabItem("🔍 RewardBench - Detailed"):
|
268 |
with gr.Row():
|
269 |
search_2 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
|
270 |
-
model_types_2 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
|
271 |
value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
|
272 |
label="Model Types",
|
273 |
show_label=False,
|
@@ -307,7 +311,7 @@ with gr.Blocks(css=custom_css) as app:
|
|
307 |
with gr.TabItem("Prior Test Sets"):
|
308 |
with gr.Row():
|
309 |
search_3 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
|
310 |
-
model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
|
311 |
value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
|
312 |
label="Model Types",
|
313 |
show_label=False,
|
|
|
203 |
dataframe = dataframe[~dataframe["Model Type"].str.contains("DPO", case=False, na=False)]
|
204 |
if "Custom Classifiers" not in filter_button:
|
205 |
dataframe = dataframe[~dataframe["Model Type"].str.contains("Custom Classifier", case=False, na=False)]
|
206 |
+
if "Generative" not in filter_button:
|
207 |
+
dataframe = dataframe[~dataframe["Model Type"].str.contains("generative", case=False, na=False)]
|
208 |
# Filter the dataframe such that 'model' contains any of the regex patterns
|
209 |
data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
|
210 |
|
|
|
219 |
# round all others to 1 decimal
|
220 |
for col in data.columns:
|
221 |
if col not in ["", "Model", "Model Type", "Score", "Average"]:
|
222 |
+
# replace any data[col].values == '' with np.NaN
|
223 |
+
data[col] = data[col].replace('', np.NaN)
|
224 |
data[col] = np.round(np.array(data[col].values).astype(float), 1)
|
225 |
return data
|
226 |
|
|
|
246 |
search_1 = gr.Textbox(label="Model Search (delimit with , )",
|
247 |
placeholder="Model Search (delimit with , )",
|
248 |
show_label=False)
|
249 |
+
model_types_1 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "AI2 Experiments"],
|
250 |
value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
|
251 |
label="Model Types",
|
252 |
show_label=False,
|
|
|
271 |
with gr.TabItem("🔍 RewardBench - Detailed"):
|
272 |
with gr.Row():
|
273 |
search_2 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
|
274 |
+
model_types_2 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "AI2 Experiments"],
|
275 |
value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
|
276 |
label="Model Types",
|
277 |
show_label=False,
|
|
|
311 |
with gr.TabItem("Prior Test Sets"):
|
312 |
with gr.Row():
|
313 |
search_3 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
|
314 |
+
model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "AI2 Experiments"],
|
315 |
value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
|
316 |
label="Model Types",
|
317 |
show_label=False,
|
src/md.py
CHANGED
@@ -22,6 +22,7 @@ We include multiple types of reward models in this evaluation:
|
|
22 |
2. **Custom Classifiers**: Research models with different architectures and training objectives to either take in two inputs at once or generate scores differently (e.g. PairRM and Stanford SteamSHP).
|
23 |
3. **DPO**: Models trained with Direct Preference Optimization (DPO), with modifiers such as `-ref-free` or `-norm` changing how scores are computed.
|
24 |
4. **Random**: Random choice baseline.
|
|
|
25 |
|
26 |
All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
|
27 |
Others, such as **Generative Judge**, are coming soon.
|
|
|
22 |
2. **Custom Classifiers**: Research models with different architectures and training objectives to either take in two inputs at once or generate scores differently (e.g. PairRM and Stanford SteamSHP).
|
23 |
3. **DPO**: Models trained with Direct Preference Optimization (DPO), with modifiers such as `-ref-free` or `-norm` changing how scores are computed.
|
24 |
4. **Random**: Random choice baseline.
|
25 |
+
5. **Generative**: Prompting fine-tuned models to choose between two answers, similar to MT Bench and AlpacaEval.
|
26 |
|
27 |
All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
|
28 |
Others, such as **Generative Judge**, are coming soon.
|
src/utils.py
CHANGED
@@ -9,8 +9,12 @@ import re
|
|
9 |
def model_hyperlink(link, model_name):
|
10 |
if model_name == "random":
|
11 |
return "random"
|
12 |
-
|
13 |
return f'<a target="_blank" href="https://huggingface.co/Cohere" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
|
|
|
|
|
|
|
|
14 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
15 |
|
16 |
def undo_hyperlink(html_string):
|
|
|
9 |
def model_hyperlink(link, model_name):
|
10 |
if model_name == "random":
|
11 |
return "random"
|
12 |
+
elif model_name == "Cohere March 2024":
|
13 |
return f'<a target="_blank" href="https://huggingface.co/Cohere" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
14 |
+
elif "openai" == model_name.split("/")[0]:
|
15 |
+
return f'<a target="_blank" href="https://huggingface.co/openai" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
16 |
+
elif "Anthropic" == model_name.split("/")[0]:
|
17 |
+
return f'<a target="_blank" href="https://huggingface.co/Anthropic" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
18 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
19 |
|
20 |
def undo_hyperlink(html_string):
|