File size: 8,859 Bytes
d0f55c6 b1b50fb 15c8167 a56da8a 15c8167 30a0c61 608184c 7e32ac7 15c8167 729af67 15c8167 7e32ac7 15c8167 8e404a5 15c8167 611a3ed 15c8167 8e404a5 15c8167 d0f55c6 8e404a5 c2c9efa 8e404a5 fae0e19 719c272 608184c d0f55c6 608184c d0f55c6 da4a3b1 15c8167 bea7063 15c8167 523fad9 96f60e1 ea4c670 966ae7b ea4c670 bea7063 ea4c670 15c8167 54e105e 585c3fa f12aa56 608184c 54e105e 15c8167 585c3fa f12aa56 bd64e7a 581682a 15c8167 611a3ed 22fb9eb 611a3ed 22fb9eb 585c3fa f12aa56 15c8167 581682a 41fbe9f 581682a 608184c 15c8167 bea7063 15c8167 9c39267 30a0c61 15c8167 bf6ab81 9c39267 15c8167 608184c 15c8167 bea7063 611a3ed 608184c 9c39267 30a0c61 9c39267 611a3ed 15c8167 26e855f 8f7c83f 608184c 07db628 bea7063 07db628 e7f29e1 2b1d96b 07db628 2b1d96b 07db628 2b1d96b 07db628 2b1d96b a56da8a 2b1d96b bea7063 2b1d96b a56da8a 31903af a56da8a bea7063 a56da8a 9ade1c2 a56da8a b1b50fb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 |
import asyncio
import shutil
import tempfile

import gradio as gr
import pandas as pd
import plotly.express as px

import src.constants as constants
from src.env_impact import get_env_impact
from src.hub import glob, load_json_file
def load_result_paths_per_model():
    """Fetch every result file path and group the paths per model.

    Returns:
        The mapping built by ``sort_result_paths_per_model`` from the paths
        returned by ``fetch_result_paths``.
    """
    return sort_result_paths_per_model(fetch_result_paths())
def fetch_result_paths():
    """List the JSON result files stored under the results dataset.

    Returns:
        Whatever ``glob`` returns for the pattern
        ``<constants.RESULTS_DATASET_ID>/**/**/*.json``.
    """
    pattern = f"{constants.RESULTS_DATASET_ID}/**/**/*.json"
    return glob(pattern)
def sort_result_paths_per_model(paths):
    """Group result file paths by model id and sort each group.

    The model id of a path is the part between the
    ``constants.RESULTS_DATASET_ID`` prefix (plus its trailing ``/``) and the
    last ``/`` of the path, i.e. everything but the file name.

    Args:
        paths: Iterable of result file paths, each starting with
            ``constants.RESULTS_DATASET_ID``.

    Returns:
        A dict mapping each model id to the sorted list of its paths.

    Raises:
        ValueError: If a path has no ``/`` after the dataset prefix.
    """
    from collections import defaultdict

    # +1 also skips the "/" that separates the dataset id from the model id.
    prefix_length = len(constants.RESULTS_DATASET_ID) + 1
    paths_per_model = defaultdict(list)
    for path in paths:
        model_id, _ = path[prefix_length:].rsplit("/", 1)
        # Without this append the mapping stays empty and every caller gets {}.
        paths_per_model[model_id].append(path)
    return {model_id: sorted(model_paths) for model_id, model_paths in paths_per_model.items()}
async def load_results_dataframe(model_id, result_paths_per_model=None):
    """Load all result files of one model into a single-row DataFrame.

    Args:
        model_id: Key of the model in ``result_paths_per_model``.
        result_paths_per_model: Mapping of model id to a list of result file
            paths.

    Returns:
        A one-row ``pandas.DataFrame`` indexed by the model name, with the
        merged ``results``, ``configs`` and ``env_impact`` data flattened into
        dotted column names by ``pd.json_normalize``; or ``None`` when no
        model is selected, the model has no paths, or no file could be loaded.
    """
    if not model_id or not result_paths_per_model:
        return None
    # An unknown model id is treated like "nothing to load" instead of raising KeyError.
    result_paths = result_paths_per_model.get(model_id)
    if not result_paths:
        return None
    results = await asyncio.gather(*[load_json_file(path) for path in result_paths])
    results = [result for result in results if result]
    if not results:
        return None
    data = {"results": {}, "configs": {}, "env_impact": {}}
    model_name = "Model"
    for result in results:
        # NOTE(review): assumes each loaded file exposes "results" and "configs"
        # mappings under these keys - confirm against the results dataset schema.
        data["results"].update(result.get("results", {}))
        data["configs"].update(result.get("configs", {}))
        data["env_impact"].update(await get_env_impact(result))
        # The last loaded file decides the displayed model name.
        model_name = result.get("model_name", "Model")
    df = pd.json_normalize([data])
    # df.columns = df.columns.str.split(".")  # .split return a list instead of a tuple
    return df.set_index(pd.Index([model_name]))
async def load_results(result_paths_per_model, *model_ids_lists):
    """Load the results of every selected model concurrently.

    Args:
        result_paths_per_model: Mapping of model id to result file paths,
            forwarded to ``load_results_dataframe``.
        *model_ids_lists: Any number of model-id lists; empty or ``None``
            lists are skipped.

    Returns:
        A 2-tuple whose first item is the concatenation of the per-model
        DataFrames (or ``None`` when nothing was loaded) and whose second
        item is always ``None``.
    """
    selected_model_ids = [
        model_id for model_ids in model_ids_lists if model_ids for model_id in model_ids
    ]
    # asyncio.gather takes awaitables as positional arguments, hence the unpacking.
    dfs = await asyncio.gather(
        *[load_results_dataframe(model_id, result_paths_per_model) for model_id in selected_model_ids]
    )
    dfs = [df for df in dfs if df is not None]
    if dfs:
        return pd.concat(dfs), None
    return None, None
def display_results(df, task, hide_std_errors, show_only_differences):
    """Render the three result tabs for the loaded DataFrame.

    Args:
        df: DataFrame with one row per model, or ``None`` when nothing is
            loaded.
        task: Task filter forwarded to ``display_tab``.
        hide_std_errors: Forwarded to the "results" tab.
        show_only_differences: Forwarded to the "configs" tab.

    Returns:
        A 3-tuple with the output of ``display_tab`` for the "results",
        "configs" and "env_impact" tabs, or three ``None`` values when ``df``
        is ``None`` so both branches return the same number of items.
    """
    if df is None:
        return None, None, None
    # Transpose so that rows are the dotted field names and columns are the models.
    df = df.T.rename_axis(columns=None)
    return (
        display_tab("results", df, task, hide_std_errors=hide_std_errors),
        display_tab("configs", df, task, show_only_differences=show_only_differences),
        display_tab("env_impact", df, task),
    )
# Render the rows of `df` that belong to one tab as an HTML table and return
# the result of `df.to_html()`.
# `tab` is used as the row-name prefix ("<tab>."), `task` is either "All" or a
# task name used as a further prefix filter, and the two flags add extra row
# filters (std-error rows, rows without differences).
# NOTE(review): this copy of the block has lost its indentation and several
# expressions are truncated (marked below), so it does not parse as-is; restore
# the missing fragments from the original source before relying on it.
def display_tab(tab, df, task, hide_std_errors=True, show_only_differences=False):
if show_only_differences:
# NOTE(review): right-hand side is truncated. Presumably a per-row comparison
# against the first column that is then reduced with `.any(axis=1)` - confirm.
any_difference =[:, 0], axis=0).any(axis=1)
# NOTE(review): right-hand side is truncated. The later calls
# (`background_gradient`, `format_index`, `to_html`) suggest `df` becomes a
# pandas Styler here - confirm.
df ="html", na_rep="")
# Hide rows
# NOTE(review): the call and list opening that should wrap the filter below are
# missing; the condition selects the row names that must NOT be shown.
for row in df.index
if (
not row.startswith(f"{tab}.")
or row.startswith(f"{tab}.leaderboard.")
or row.endswith(".alias")
or (
not row.startswith(f"{tab}.{task}")
if task != "All"
else row.startswith(f"{tab}.leaderboard_arc_challenge") # Hide legacy ARC
# Hide MATH fewshot_config.samples: <function list_fewshot_samples at 0x7f34d199ab90>
or (row.startswith(f"{tab}.leaderboard_math") and row.endswith("fewshot_config.samples"))
# Hide std errors
or (hide_std_errors and row.endswith("_stderr,none"))
# Hide non-different rows
or (show_only_differences and not any_difference[row])
# NOTE(review): the closing brackets of the filter above are missing.
# Color metric result cells
idx = pd.IndexSlice
# NOTE(review): the inner list opening/closing of this selection is missing;
# it picks the rows whose name ends with an accuracy / exact-match metric suffix.
colored_rows = idx[
for row in df.index
if row.endswith("acc,none") or row.endswith("acc_norm,none") or row.endswith("exact_match,none")
] # Apply only on numeric cells, otherwise the background gradient will not work
subset = idx[colored_rows, idx[:]]
df.background_gradient(cmap="PiYG", vmin=0, vmax=1, subset=subset, axis=None)
# Format index values: remove prefix and suffix
# NOTE(review): an `else:` appears to be missing between the two `start`
# assignments; as written the second one always overrides the first.
if tab == "env_impact":
start = len(f"{tab}.")
# The trailing space in the second f-string stands in for one extra character
# after the task name when computing the prefix length.
start = len(f"{tab}.leaderboard_") if task == "All" else len(f"{tab}.{task} ")
df.format_index(lambda idx: idx[start:].removesuffix(",none"), axis="index")
# Fix overflow
# NOTE(review): the call and the dict/list delimiters around the two style
# entries below are missing; each entry pairs a CSS selector with its properties.
"selector": "td",
"props": [("overflow-wrap", "break-word"), ("max-width", "1px")],
"selector": ".col_heading",
# Every column heading gets an equal share of the table width.
"props": [("width", f"{100 / len(df.columns)}%")],
return df.to_html()
# Build the value returned to update the task selector(s): the choices are
# "All" followed by the values of `constants.TASKS`, and the resulting tuple is
# repeated twice (`* 2`).
# NOTE(review): the component constructor that should wrap the choices list and
# the `info=` keyword is missing from this copy (and indentation is lost), so
# the block does not parse as-is; restore it from the original source.
def update_tasks_component():
return (
["All"] + list(constants.TASKS.values()),
info="Evaluation tasks to be displayed",
) * 2
# Return the reset values for the components listed in the comment below:
# three disabled "Load" buttons followed by a task selector value repeated
# twice.
# NOTE(review): this copy has lost its indentation and some fragments - the
# leading tuple elements (presumably for model_ids and dataframe), the
# component constructor wrapping the choices list / `info=` keyword, and the
# closing parentheses are missing; restore them from the original source.
def clear_results():
# model_ids, dataframe, load_results_btn, load_configs_btn, load_env_impact_btn, results_task, configs_task
return (
*(gr.Button("Load", interactive=False),) * 3,
["All"] + list(constants.TASKS.values()),
info="Evaluation tasks to be displayed",
* 2,
def display_loading_message_for_results():
    """Return the centered "Loading..." HTML placeholder, repeated three times.

    Returns:
        A 3-tuple containing the same HTML heading string in every position.
    """
    loading_html = "<h3 style='text-align: center;'>Loading...</h3>"
    return (loading_html,) * 3
# Build two figures (`fig_1`, `fig_2`) from the metric columns of `df` for the
# selected `task`, and return `(fig_1, fig_2)`; returns `(None, None)` when
# `df` is None.
# NOTE(review): this copy of the block has lost its indentation and several
# expressions are truncated (marked below), so it does not parse as-is; restore
# the missing fragments from the original source before relying on it.
def plot_results(df, task):
if df is not None:
# Keep only "results." columns whose name ends with an accuracy / exact-match
# metric suffix.
# NOTE(review): the list opening/closing of this column selection is missing.
df = df[
for col in df.columns
if col.startswith("results.")
and (col.endswith("acc,none") or col.endswith("acc_norm,none") or col.endswith("exact_match,none"))
# Map task key -> first element of its `constants.TASKS` tuple.
tasks = {key: tupl[0] for key, tupl in constants.TASKS.items()}
# Alias so that columns keyed "leaderboard_math" resolve to the same entry as
# "leaderboard_math_hard".
tasks["leaderboard_math"] = tasks["leaderboard_math_hard"]
# Map second tuple element -> first tuple element for the subtasks of `task`.
subtasks = {tupl[1]: tupl[0] for tupl in constants.SUBTASKS.get(task, [])}
if task == "All":
# Keep columns whose second dotted component is a known task key.
df = df[[col for col in df.columns if col.split(".")[1] in tasks]]
# - IFEval: Calculate average of both strict accuracies
# NOTE(review): the expression computing `ifeval_mean` is truncated.
ifeval_mean = df[
df = df.drop(columns=[col for col in df.columns if col.split(".")[1] == "leaderboard_ifeval"])
# Re-insert the aggregated IFEval column relative to the MATH exact-match column.
loc = df.columns.get_loc("results.leaderboard_math_hard.exact_match,none")
df.insert(loc - 1, "results.leaderboard_ifeval", ifeval_mean)
# Rename
df = df.rename(columns=lambda col: tasks[col.split(".")[1]])
# NOTE(review): an `else:` for the single-task case appears to be missing before
# the next statement - confirm.
df = df[[col for col in df.columns if col.startswith(f"results.{task}")]]
# - IFEval: Return 4 accuracies
if task == "leaderboard_ifeval":
df = df.rename(columns=lambda col: col.split(".")[2].removesuffix(",none"))
# NOTE(review): an `else:` appears to be missing before the next rename - confirm.
df = df.rename(columns=lambda col: tasks.get(col.split(".")[1], subtasks.get(col.split(".")[1])))
# NOTE(review): the figure constructor call for `fig_1` is truncated; only some
# keyword arguments survive.
fig_1 =
labels={"index": "Benchmark" if task == "All" else "Subtask", "value": "Score"},
color_discrete_sequence=px.colors.qualitative.Safe, # TODO:
# Scores are shown on a fixed 0..1 axis.
fig_1.update_yaxes(range=[0, 1])
# Polar line chart over the long-format frame (one row per model/benchmark pair).
# NOTE(review): some arguments and the closing parenthesis of this call are missing.
fig_2 = px.line_polar(
df.melt(ignore_index=False, var_name="Benchmark", value_name="Score").reset_index(names="Model"),
range_r=[0, 1],
color_discrete_sequence=px.colors.qualitative.Safe, # TODO:
# Avoid bug with radar:
# NOTE(review): the statement that followed the comment above is missing.
return fig_1, fig_2
return None, None
# Directory holding the most recently exported results file, or None when no
# export exists. Shared by download_results and clear_results_file.
tmpdirname = None


def download_results(results):
    """Write the results to a fresh temporary ``results.html`` and expose it.

    Args:
        results: Content to export. NOTE(review): presumably the rendered HTML
            string of the results table - confirm against the caller.

    Returns:
        A visible ``gr.File`` pointing at the written file, or ``None`` when
        ``results`` is empty.
    """
    global tmpdirname
    if not results:
        return None
    if tmpdirname:
        # mkdtemp never cleans up after itself, so drop the previous export first.
        shutil.rmtree(tmpdirname, ignore_errors=True)
    tmpdirname = tempfile.mkdtemp()
    path = f"{tmpdirname}/results.html"
    # Explicit encoding so the export does not depend on the platform default.
    with open(path, "w", encoding="utf-8") as f:
        f.write(results)
    return gr.File(path, visible=True)
def clear_results_file():
    """Delete the temporary export directory, if any, and hide the file component.

    Returns:
        A ``gr.File`` with ``visible=False``.
    """
    global tmpdirname
    if tmpdirname:
        # Remove the directory itself; resetting the name alone would leak it on disk.
        shutil.rmtree(tmpdirname, ignore_errors=True)
        tmpdirname = None
    return gr.File(visible=False)