natolambert
committed on
Commit
·
91c5b22
1
Parent(s):
93916f2
add tag
Browse files
- src/md.py +3 -1
- src/utils.py +20 -7
src/md.py
CHANGED
@@ -2,6 +2,8 @@ ABOUT_TEXT = """
|
|
2 |
We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
|
3 |
A win is when the score for the chosen response is higher than the score for the rejected response.
|
4 |
|
|
|
|
|
5 |
## Overview
|
6 |
|
7 |
We average over 4 core sections (per prompt weighting):
|
@@ -93,5 +95,5 @@ For more details, see the [dataset](https://huggingface.co/datasets/allenai/rewa
|
|
93 |
TOP_TEXT = """
|
94 |
# RewardBench: Evaluating Reward Models
|
95 |
### Evaluating the capabilities, safety, and pitfalls of reward models
|
96 |
-
[Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper](https://arxiv.org/abs/2403.13787) | Total models: {}
|
97 |
"""
|
|
|
2 |
We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
|
3 |
A win is when the score for the chosen response is higher than the score for the rejected response.
|
4 |
|
5 |
+
Note: Models with (*) after the model name are independently submitted model scores which have not been verified by the RewardBench team.
|
6 |
+
|
7 |
## Overview
|
8 |
|
9 |
We average over 4 core sections (per prompt weighting):
|
|
|
95 |
TOP_TEXT = """
|
96 |
# RewardBench: Evaluating Reward Models
|
97 |
### Evaluating the capabilities, safety, and pitfalls of reward models
|
98 |
+
[Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper](https://arxiv.org/abs/2403.13787) | Total models: {} | * Unverified models
|
99 |
"""
|
src/utils.py
CHANGED
@@ -5,24 +5,37 @@ import numpy as np
|
|
5 |
import os
|
6 |
import re
|
7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
# From Open LLM Leaderboard
|
9 |
def model_hyperlink(link, model_name):
|
10 |
# if model_name is above 50 characters, return first 47 characters and "..."
|
11 |
if len(model_name) > 50:
|
12 |
model_name = model_name[:47] + "..."
|
13 |
if model_name == "random":
|
14 |
-
|
15 |
elif model_name == "Cohere March 2024":
|
16 |
-
|
17 |
elif "openai" == model_name.split("/")[0]:
|
18 |
-
|
19 |
elif "Anthropic" == model_name.split("/")[0]:
|
20 |
-
|
21 |
elif "google" == model_name.split("/")[0]:
|
22 |
-
|
23 |
elif "PoLL" == model_name.split("/")[0]:
|
24 |
-
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
def undo_hyperlink(html_string):
|
28 |
# Regex pattern to match content inside > and <
|
|
|
5 |
import os
|
6 |
import re
|
7 |
|
8 |
+
# Models whose scores were submitted independently and have not been verified
# by the RewardBench team; they are rendered with a trailing " *".
UNVERIFIED_MODELS = [
    "nvidia/Nemotron-4-340B-Reward",
    "nvidia/Llama3-70B-SteerLM-RM",
    "Cohere May 2024",
    "google/gemini-1.5-pro-0514",
    "Cohere March 2024",
]


# From Open LLM Leaderboard
def model_hyperlink(link, model_name):
    """Return the HTML (or plain text) used to display a model name.

    Args:
        link: URL used for models that have no special-cased organisation link.
        model_name: Model identifier, typically ``"org/name"``.

    Returns:
        A string: the plain name for ``"random"`` and ``PoLL/...`` entries,
        otherwise an ``<a>`` tag pointing either at a hard-coded organisation
        page (Cohere, openai, Anthropic, google) or at ``link``. Names listed
        in ``UNVERIFIED_MODELS`` get a trailing ``" *"``.
    """
    # Decide the unverified marker from the full name: once the name is
    # shortened below it can no longer match an entry in UNVERIFIED_MODELS.
    is_unverified = model_name in UNVERIFIED_MODELS

    # if model_name is above 50 characters, return first 47 characters and "..."
    if len(model_name) > 50:
        model_name = model_name[:47] + "..."

    def anchor(href):
        # Single place for the link markup so every branch renders identically.
        return (
            f'<a target="_blank" href="{href}" '
            f'style="color: var(--link-text-color); text-decoration: underline;'
            f'text-decoration-style: dotted;">{model_name}</a>'
        )

    org = model_name.split("/")[0]
    if model_name == "random":
        output = "random"
    elif model_name == "Cohere March 2024":
        output = anchor("https://huggingface.co/Cohere")
    elif org == "openai":
        output = anchor("https://huggingface.co/openai")
    elif org == "Anthropic":
        output = anchor("https://huggingface.co/Anthropic")
    elif org == "google":
        output = anchor("https://huggingface.co/google")
    elif org == "PoLL":
        output = model_name
    else:
        # Generic case only: without this `else` the special-cased outputs
        # above would be overwritten (or `output` left unbound).
        output = anchor(link)

    return output + " *" if is_unverified else output
|
39 |
|
40 |
def undo_hyperlink(html_string):
|
41 |
# Regex pattern to match content inside > and <
|