import os

import gradio as gr
import numpy as np
from datasets import load_dataset
from huggingface_hub import HfApi, snapshot_download

from src.md import ABOUT_TEXT
from src.utils import load_all_data

api = HfApi()

COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
evals_repo = "ai2-rlhf-collab/rm-benchmark-results"
prefs_repo = "ai2-rlhf-collab/rm-testset-results"
eval_set_repo = "ai2-rlhf-collab/rm-benchmark-dev"
repo_dir_herm = "./evals/herm/"
repo_dir_prefs = "./evals/prefs/"
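# The snapshot pulls below assume COLLAB_TOKEN holds a Hugging Face access
# token with read access to the ai2-rlhf-collab repos. For example (token
# value and filename hypothetical), set it before launching:
#
#     export COLLAB_TOKEN=hf_xxxxxxxxxxxxxxxx
#     python app.py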

print("Pulling evaluation results")
repo = snapshot_download(
    local_dir=repo_dir_herm,
    repo_id=evals_repo,
    use_auth_token=COLLAB_TOKEN,
    tqdm_class=None,
    etag_timeout=30,
    repo_type="dataset",
)

repo_pref_sets = snapshot_download(
    local_dir=repo_dir_prefs,
    repo_id=prefs_repo,
    use_auth_token=COLLAB_TOKEN,
    tqdm_class=None,
    etag_timeout=30,
    repo_type="dataset",
)


def avg_over_herm(dataframe):
    """
    Average the per-split columns for each subset (alpacaeval, mt-bench,
    llmbar, refusals, hep) and return a dataframe with only the model,
    overall average, and per-subset columns.
    """
    subsets = ["alpacaeval", "mt-bench", "llmbar", "refusals", "hep"]

    # Collapse every column belonging to a subset into a single mean column,
    # ignoring NaNs from splits a model was not evaluated on.
    for subset in subsets:
        subset_cols = [col for col in dataframe.columns if subset in col]
        dataframe[subset] = np.round(np.nanmean(dataframe[subset_cols].values, axis=1), 2)

    keep_columns = ["model", "average"] + subsets
    dataframe = dataframe[keep_columns]

    # Recompute the overall average as the mean of the subset means.
    dataframe["average"] = np.round(np.nanmean(dataframe[subsets].values, axis=1), 2)
    return dataframe

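# Illustrative sketch of the reduction above (column names hypothetical):
# a frame with columns ["model", "average", "alpacaeval-easy",
# "alpacaeval-hard", ...] comes out with columns ["model", "average",
# "alpacaeval", "mt-bench", "llmbar", "refusals", "hep"], where e.g.
# alpacaeval = mean(alpacaeval-easy, alpacaeval-hard) and "average" is
# recomputed over the five subset means.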

def expand_subsets(dataframe):
    # Placeholder: not yet implemented.
    pass

herm_data = load_all_data(repo_dir_herm).sort_values(by="average", ascending=False)
herm_data_avg = avg_over_herm(herm_data).sort_values(by="average", ascending=False)
prefs_data = load_all_data(repo_dir_prefs).sort_values(by="average", ascending=False)

# The first column (model name) renders as markdown; all others are numeric scores.
col_types_herm = ["markdown"] + ["number"] * (len(herm_data.columns) - 1)
col_types_herm_avg = ["markdown"] + ["number"] * (len(herm_data_avg.columns) - 1)
col_types_prefs = ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)

eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")


def random_sample(r: gr.Request):
    # np.random.randint excludes the high end, so pass len(eval_set) directly
    # to keep the last sample reachable.
    sample_index = np.random.randint(0, len(eval_set))
    sample = eval_set[sample_index]
    markdown_text = "\n\n".join([f"**{key}**: {value}" for key, value in sample.items()])
    return markdown_text
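# Example of the rendered markdown (field names hypothetical; the actual keys
# come from the eval_set features):
#
#     **prompt**: How do I ...
#
#     **chosen**: ...
#
#     **rejected**: ...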


with gr.Blocks() as app:
    with gr.Row():
        gr.Markdown("# HERM Results Viewer")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("HERM - Overview"):
            with gr.Row():
                # Subset-averaged leaderboard; named distinctly so the
                # detailed table below does not shadow it.
                herm_table_avg = gr.Dataframe(
                    herm_data_avg.values,
                    datatype=col_types_herm_avg,
                    headers=herm_data_avg.columns.tolist(),
                    elem_id="herm_dataframe_avg",
                )
        with gr.TabItem("HERM - Detailed"):
            with gr.Row():
                # Full per-split leaderboard.
                herm_table = gr.Dataframe(
                    herm_data.values,
                    datatype=col_types_herm,
                    headers=herm_data.columns.tolist(),
                    elem_id="herm_dataframe",
                )
        with gr.TabItem("Pref Sets - Overview"):
            pref_sets_table = gr.Dataframe(
                prefs_data.values,
                datatype=col_types_prefs,
                headers=prefs_data.columns.tolist(),
                elem_id="prefs_dataframe",
            )

        with gr.TabItem("About"):
            with gr.Row():
                gr.Markdown(ABOUT_TEXT)

        with gr.TabItem("Dataset Viewer"):
            with gr.Row():
                gr.Markdown("## Random Dataset Sample Viewer")
                button = gr.Button("Show Random Sample")

            with gr.Row():
                sample_display = gr.Markdown("{sampled data loads here}")

            button.click(fn=random_sample, outputs=sample_display)


def load_data_on_start():
    # Refresh helper; note it is not currently attached to any app event.
    data_herm = load_all_data(repo_dir_herm)
    herm_table.update(data_herm)

    # Pass the loaded dataframe, not the directory path.
    data_herm_avg = avg_over_herm(data_herm)
    herm_table_avg.update(data_herm_avg)

    data_prefs = load_all_data(repo_dir_prefs)
    pref_sets_table.update(data_prefs)
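
# A minimal sketch (not wired up above) of how the refresh helper could run
# when the page loads; note that recent Gradio versions prefer handlers that
# return new component values over calling .update() for side effects:
#
#     app.load(load_data_on_start)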


app.launch()