import os

import gradio as gr
import numpy as np
from apscheduler.schedulers.background import BackgroundScheduler
from datasets import load_dataset
from huggingface_hub import HfApi, snapshot_download

from src.md import ABOUT_TEXT, TOP_TEXT
from src.plt import plot_avg_correlation
from src.utils import load_all_data

api = HfApi()

COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
evals_repo = "ai2-adapt-dev/HERM-Results"
eval_set_repo = "ai2-adapt-dev/rm-benchmark-dev"
repo_dir_herm = "./evals/herm/"


def restart_space():
    """Ask the Hub to restart the Space that hosts this viewer."""
    api.restart_space(repo_id="ai2-adapt-dev/rm-benchmark-viewer", token=COLLAB_TOKEN)


print("Pulling evaluation results")
repo = snapshot_download(
    local_dir=repo_dir_herm,
    repo_id=evals_repo,
    use_auth_token=COLLAB_TOKEN,
    tqdm_class=None,
    etag_timeout=30,
    repo_type="dataset",
)


def avg_over_herm(dataframe):
    """
    Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and
    returns dataframe with only these columns.

    The input is not modified. The returned frame has the columns "model",
    "average" (recomputed as the NaN-ignoring mean of the five subset
    averages) and one column per subset, with "hep" renamed to "hep (code)".
    All averages are rounded to two decimals.
    """
    new_df = dataframe.copy()
    subsets = ["alpacaeval", "mt-bench", "llmbar", "refusals", "hep"]

    # for each subset, avg the columns that have the subset in the column name,
    # then add a new column with subset name and avg
    for subset in subsets:
        if subset == "refusals":
            # the refusals group is defined by an explicit list because its
            # member columns do not all contain the word "refusals"
            subset_cols = [
                "refusals-dangerous",
                "refusals-offensive",
                "donotanswer",
                "xstest-should-refuse",
                "xstest-should-respond",
            ]
        else:
            subset_cols = [col for col in new_df.columns if subset in col]
        new_df[subset] = np.round(np.nanmean(new_df[subset_cols].values, axis=1), 2)

    keep_columns = ["model", "average"] + subsets
    # explicit copy so the assignment below writes to an independent frame
    # (avoids pandas' SettingWithCopyWarning on the column selection)
    new_df = new_df[keep_columns].copy()

    # replace average column with new average
    new_df["average"] = np.round(np.nanmean(new_df[subsets].values, axis=1), 2)

    # rename column "hep" to "hep (code)"
    new_df = new_df.rename(columns={"hep": "hep (code)"})
    return new_df


def expand_subsets(dataframe):
    """Placeholder; currently does nothing and returns None."""
    # TODO need to modify data/ script to do this
    pass


# reference for length bias categories
length_categories = {
    'alpacaeval-easy': 'True',
    'alpacaeval-hard': 'True',
    'alpacaeval-length': 'Neutral',
    'donotanswer': 'False',
    'hep-cpp': 'Neutral',
    'hep-go': 'Neutral',
    'hep-java': 'Neutral',
    'hep-js': 'Neutral',
    'hep-python': 'Neutral',
    'hep-rust': 'Neutral',
    'llmbar-adver-GPTInst': 'False',
    'llmbar-adver-GPTOut': 'Neutral',
    'llmbar-adver-manual': 'False',
    'llmbar-adver-neighbor': 'False',
    'llmbar-natural': 'Neutral',
    'mt-bench-easy': 'False',
    'mt-bench-hard': 'False',
    'mt-bench-med': 'Neutral',
    'refusals-dangerous': 'False',
    'refusals-offensive': 'False',
    'xstest-should-refuse': 'False',
    'xstest-should-respond': 'True',
}


def length_bias_check(dataframe):
    """
    Takes the raw herm dataframe and splits the data into new buckets according
    to length_categories.

    Every column listed in length_categories is routed to "Length Bias"
    ('True'), "Neutral" ('Neutral') or "Terse Bias" ('False'); each bucket
    column is the NaN-ignoring per-row mean of its member columns, rounded to
    two decimals. Columns not listed in length_categories (such as "model" and
    "average") are ignored. Returns a new frame with "model" plus the three
    bucket columns; the input is not modified.
    """
    new_df = dataframe.copy()
    final_subsets = ["Length Bias", "Neutral", "Terse Bias"]
    bucket_for_label = {
        "True": "Length Bias",
        "Neutral": "Neutral",
        "False": "Terse Bias",
    }

    # new data is empty list dict for each final subset
    new_data = {s: [] for s in final_subsets}

    # now, subsets correspond to those with True, Neutral, and False length bias.
    # Columns are selected by name rather than by position so that extra or
    # reordered columns in the raw frame cannot raise a KeyError here.
    for subset in new_df.columns:
        bucket = bucket_for_label.get(length_categories.get(subset))
        if bucket is None:
            continue
        new_data[bucket].append(new_df[subset].values)

    # take average of new_data and add to new_df (removing other columns than model)
    for bucket in final_subsets:
        columns = new_data[bucket]
        if columns:
            new_df[bucket] = np.round(np.nanmean(columns, axis=0), 2)
        else:
            # no source column for this bucket: same NaN result as averaging
            # an empty list, without the numpy "mean of empty slice" warning
            new_df[bucket] = np.nan

    keep_columns = ["model"] + final_subsets
    new_df = new_df[keep_columns]
    # recompute average
    # new_df["average"] = np.round(np.nanmean(new_df[final_subsets].values, axis=1), 2)
    return new_df


herm_data = load_all_data(repo_dir_herm, subdir="eval-set").sort_values(by='average', ascending=False)
herm_data_avg = avg_over_herm(herm_data).sort_values(by='average', ascending=False)
herm_data_length = length_bias_check(herm_data).sort_values(by='Terse Bias', ascending=False)
prefs_data = load_all_data(repo_dir_herm, subdir="pref-sets").sort_values(by='average', ascending=False)
# prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)

# first column is rendered as markdown, every remaining column as a number
col_types_herm = ["markdown"] + ["number"] * (len(herm_data.columns) - 1)
col_types_herm_avg = ["markdown"] + ["number"] * (len(herm_data_avg.columns) - 1)
col_types_herm_length = ["markdown"] + ["number"] * (len(herm_data_length.columns) - 1)
col_types_prefs = ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
# col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)

# for showing random samples
eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")


def random_sample(r: gr.Request, subset):
    """
    Return one random row of eval_set rendered as markdown.

    r is not used by the body (presumably supplied by Gradio because of the
    gr.Request annotation -- confirm against the Gradio docs). subset may be
    None, an empty list, a single subset name, or a list of subset names; when
    it is empty the sample is drawn from the whole dataset, otherwise only from
    rows whose "subset" field is one of the given names.
    """
    if isinstance(subset, str):
        subset = [subset]

    if not subset:
        candidates = eval_set
    else:
        # filter down dataset to only include the subset(s)
        candidates = eval_set.filter(lambda x: x["subset"] in subset)

    if len(candidates) == 0:
        return "No samples available for the selected subset(s)."

    # np.random.randint excludes the upper bound, so pass the full length to
    # make every row (including the last one) reachable
    sample_index = int(np.random.randint(0, len(candidates)))
    sample = candidates[sample_index]

    markdown_text = '\n\n'.join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
    return markdown_text


subsets = eval_set.unique("subset")

with gr.Blocks() as app:
    # create tabs for the app, moving the current table to one titled "HERM" and the benchmark_text to a tab called "About"
    with gr.Row():
        gr.Markdown(TOP_TEXT)
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("HERM Eval Set - Overview"):
            with gr.Row():
                herm_table_avg = gr.Dataframe(
                    herm_data_avg.values,
                    datatype=col_types_herm_avg,
                    headers=herm_data_avg.columns.tolist(),
                    elem_id="herm_dataframe_avg",
                    height=1000,
                )
        with gr.TabItem("HERM Eval Set - Detailed"):
            with gr.Row():
                herm_table = gr.Dataframe(
                    herm_data.values,
                    datatype=col_types_herm,
                    headers=herm_data.columns.tolist(),
                    elem_id="herm_dataframe",
                    height=1000,
                )
        with gr.TabItem("HERM Eval Set - Length Bias"):
            with gr.Row():
                herm_table_length = gr.Dataframe(
                    herm_data_length.values,
                    datatype=col_types_herm_length,
                    headers=herm_data_length.columns.tolist(),
                    elem_id="herm_dataframe_length",
                    height=1000,
                )
        with gr.TabItem("Known Pref. Sets"):
            with gr.Row():
                PREF_SET_TEXT = """
For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets).
"""
                gr.Markdown(PREF_SET_TEXT)
            with gr.Row():
                pref_sets_table = gr.Dataframe(
                    prefs_data.values,
                    datatype=col_types_prefs,
                    headers=prefs_data.columns.tolist(),
                    elem_id="prefs_dataframe",
                    height=1000,
                )

        with gr.TabItem("About"):
            with gr.Row():
                gr.Markdown(ABOUT_TEXT)

        with gr.TabItem("Dataset Viewer"):
            with gr.Row():
                # loads one sample
                gr.Markdown("## Random Dataset Sample Viewer")
                subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
                button = gr.Button("Show Random Sample")

            with gr.Row():
                sample_display = gr.Markdown("{sampled data loads here}")

            button.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])
        # removed plot because not pretty enough
        # with gr.TabItem("Model Correlation"):
        #     with gr.Row():
        #         plot = plot_avg_correlation(herm_data_avg, prefs_data)
        #         gr.Plot(plot)

# Load data when app starts, TODO make this used somewhere...
# def load_data_on_start():
#     data_herm = load_all_data(repo_dir_herm)
#     herm_table.update(data_herm)

#     data_herm_avg = avg_over_herm(repo_dir_herm)
#     herm_table.update(data_herm_avg)

#     data_prefs = load_all_data(repo_dir_prefs)
#     pref_sets_table.update(data_prefs)

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=10800)  # restarted every 3h
scheduler.start()
app.queue().launch()