# Page header captured along with the source (Hugging Face Spaces status): Spaces: Sleeping
import io | |
import json | |
import gradio as gr | |
import pandas as pd | |
from huggingface_hub import HfFileSystem | |
# Hub path of the dataset repository that stores the leaderboard result files.
RESULTS_DATASET_ID = "datasets/open-llm-leaderboard/results"

# Top-level keys of a result file that are dropped before it is flattened
# into a dataframe.
EXCLUDED_KEYS = {"pretty_env_info", "chat_template", "group_subtasks"}

# Evaluation tasks offered in the UI.  Each value is a (label, value) pair;
# the value is used as a prefix filter on the "results.<...>" and
# "configs.<...>" row names when a single task is selected.
TASKS = {
    "leaderboard_arc_challenge": ("ARC", "leaderboard_arc_challenge"),
    "leaderboard_bbh": ("BBH", "leaderboard_bbh"),
    "leaderboard_gpqa": ("GPQA", "leaderboard_gpqa"),
    "leaderboard_ifeval": ("IFEval", "leaderboard_ifeval"),
    # NOTE: the filter value intentionally differs from the key here.
    "leaderboard_math_hard": ("MATH", "leaderboard_math"),
    "leaderboard_mmlu_pro": ("MMLU-Pro", "leaderboard_mmlu_pro"),
    "leaderboard_musr": ("MuSR", "leaderboard_musr"),
}
# Shared Hugging Face Hub filesystem client; used both to list result files
# and to open them.
fs = HfFileSystem()


def fetch_result_paths():
    """Return the paths of the JSON files found under the results dataset.

    The search is delegated to ``fs.glob`` with the pattern
    ``<RESULTS_DATASET_ID>/**/**/*.json``; its return value is passed
    through unchanged.
    """
    pattern = f"{RESULTS_DATASET_ID}/**/**/*.json"
    return fs.glob(pattern)
def filter_latest_result_path_per_model(paths):
    """Map each model id to the greatest of its result paths.

    The model id is the part of a path that sits between the
    ``RESULTS_DATASET_ID`` prefix (plus its trailing slash) and the final
    ``/``-separated component.  For every model id the lexicographically
    largest path is kept.
    NOTE(review): this presumably selects the most recent file because the
    file names embed a sortable timestamp -- confirm against the dataset.

    Args:
        paths: Iterable of result file paths, each starting with
            ``RESULTS_DATASET_ID``.

    Returns:
        dict mapping model id to a single path, keyed in order of first
        appearance of each model id.

    Raises:
        ValueError: If a path has no ``/`` after the dataset prefix.
    """
    prefix_length = len(RESULTS_DATASET_ID) + 1  # +1 skips the "/" after the prefix
    latest_by_model = {}
    for candidate in paths:
        model_id, _filename = candidate[prefix_length:].rsplit("/", 1)
        best_so_far = latest_by_model.get(model_id)
        if best_so_far is None or candidate > best_so_far:
            latest_by_model[model_id] = candidate
    return latest_by_model
def get_result_path_from_model(model_id, result_path_per_model):
    """Look up the result path recorded for ``model_id``.

    Args:
        model_id: Key to look up.
        result_path_per_model: Mapping from model id to result path.

    Returns:
        The value stored under ``model_id``.

    Raises:
        KeyError: If ``model_id`` is not a key of the mapping.
    """
    result_path = result_path_per_model[model_id]
    return result_path
def load_data(result_path) -> dict:
    """Read and parse one JSON result file from the Hub filesystem.

    The previous return annotation (``pd.DataFrame``) was wrong: the function
    returns the object produced by ``json.load``, which the caller treats as
    a dict (``.get`` / ``.items``).

    Args:
        result_path: Path of the JSON file, as understood by ``fs``.

    Returns:
        The parsed JSON document.
    """
    # Explicit UTF-8 so decoding does not depend on the host locale.
    with fs.open(result_path, "r", encoding="utf-8") as f:
        return json.load(f)
def load_result_dataframe(model_id):
    """Load the latest result file of a model as a one-row dataframe.

    The JSON document is stripped of the keys listed in ``EXCLUDED_KEYS``,
    flattened with ``pd.json_normalize`` (nested keys become dot-separated
    column names) and labelled with the model name, which ends up in a
    leading column named ``"index"``.

    Args:
        model_id: Key into ``latest_result_path_per_model``.

    Returns:
        pd.DataFrame with a single row; the first column, ``"index"``, holds
        the document's ``"model_name"`` (or ``"Model"`` when absent).
    """
    path = get_result_path_from_model(model_id, latest_result_path_per_model)
    document = load_data(path)
    row_label = document.get("model_name", "Model")

    kept_fields = {}
    for field, content in document.items():
        if field not in EXCLUDED_KEYS:
            kept_fields[field] = content

    flattened = pd.json_normalize([kept_fields])
    labelled = flattened.set_index(pd.Index([row_label]))
    # reset_index() turns the unnamed index into a regular "index" column.
    return labelled.reset_index()
def display_results(df_1, df_2, task):
    """Render the "Results" and "Configs" comparison tables as HTML.

    Only inputs that carry an ``"index"`` column (the shape produced by
    ``load_result_dataframe``) take part in the comparison.  They are stacked,
    then transposed so that each model becomes a column and each flattened
    key becomes a row.

    Args:
        df_1: Dataframe of the first selected result (may be empty/unloaded).
        df_2: Dataframe of the second selected result (may be empty/unloaded).
        task: ``"All"`` or a task prefix used to filter the displayed rows.

    Returns:
        Tuple ``(results_html, configs_html)``.  Both are empty strings when
        neither input has been loaded yet.
    """
    loaded = [
        frame.set_index("index")
        for frame in (df_1, df_2)
        if frame is not None and "index" in frame.columns
    ]
    # pd.concat raises "No objects to concatenate" on an empty list, which
    # happens e.g. when the task selector changes before any result is loaded.
    if not loaded:
        return "", ""
    combined = pd.concat(loaded).T.rename_axis(columns=None)
    return display_results_tab(combined, task), display_configs_tab(combined, task)
def display_results_tab(df, task): | |
df = df.style.format(na_rep="") | |
df.hide( | |
[ | |
row | |
for row in df.index | |
if ( | |
not row.startswith("results.") | |
or row.startswith("results.leaderboard.") | |
or row.endswith(".alias") | |
or (not row.startswith(f"results.{task}") if task != "All" else False) | |
) | |
], | |
axis="index", | |
) | |
df.format_index(lambda idx: idx[len("results.leaderboard_"):].removesuffix(",none"), axis="index") | |
return df.to_html() | |
def display_configs_tab(df, task): | |
df = df.style.format(na_rep="") | |
df.hide( | |
[ | |
row | |
for row in df.index | |
if ( | |
not row.startswith("configs.") | |
or row.startswith("configs.leaderboard.") | |
or row.endswith(".alias") | |
or (not row.startswith(f"configs.{task}") if task != "All" else False) | |
) | |
], | |
axis="index", | |
) | |
df.format_index(lambda idx: idx[len("configs.leaderboard_"):], axis="index") | |
return df.to_html() | |
# if __name__ == "__main__": | |
# Built once at start-up: model id -> path of the result file kept for it.
latest_result_path_per_model = filter_latest_result_path_per_model(fetch_result_paths())

with gr.Blocks(fill_height=True) as demo:
    gr.HTML("<h1 style='text-align: center;'>Compare Results of the 🤗 Open LLM Leaderboard</h1>")
    gr.HTML("<h3 style='text-align: center;'>Select 2 results to load and compare</h3>")

    # Two identical selectors side by side.  Each hidden Dataframe holds the
    # loaded result so that it can be fed back into the comparison.
    with gr.Row():
        with gr.Column():
            model_id_1 = gr.Dropdown(choices=list(latest_result_path_per_model), label="Results")
            load_btn_1 = gr.Button("Load")
            dataframe_1 = gr.Dataframe(visible=False)
        with gr.Column():
            model_id_2 = gr.Dropdown(choices=list(latest_result_path_per_model), label="Results")
            load_btn_2 = gr.Button("Load")
            dataframe_2 = gr.Dataframe(visible=False)

    with gr.Row():
        task = gr.Radio(
            ["All", *TASKS.values()],
            label="Tasks",
            info="Evaluation tasks to be displayed",
            value="All",
        )

    with gr.Row():
        with gr.Tab("Results"):
            results = gr.HTML()
        with gr.Tab("Configs"):
            configs = gr.HTML()

    def _comparison_event_kwargs():
        """Return fresh keyword arguments for an event that re-renders both tabs."""
        return {
            "fn": display_results,
            "inputs": [dataframe_1, dataframe_2, task],
            "outputs": [results, configs],
        }

    # Loading either side first stores its dataframe, then refreshes the tables.
    for _button, _dropdown, _frame in (
        (load_btn_1, model_id_1, dataframe_1),
        (load_btn_2, model_id_2, dataframe_2),
    ):
        _button.click(
            fn=load_result_dataframe,
            inputs=_dropdown,
            outputs=_frame,
        ).then(**_comparison_event_kwargs())

    # Changing the task filter re-renders from the already loaded dataframes.
    task.change(**_comparison_event_kwargs())

demo.launch()