|
import glob |
|
import json |
|
|
|
import datasets |
|
from huggingface_hub import snapshot_download |
|
import pandas as pd |
|
|
|
from backend.envs import EVAL_DATASET, TRACES_DATASET, TOKEN, EVAL_RESULTS_PATH |
|
|
|
|
|
# Result sub-directories scanned for every model (used as a path component
# when globbing for result JSON files).
SUBSETS = [
    "base",
    "cot",
    "orig",
]
|
|
|
|
|
def _split_alias(alias: str) -> pd.Series:
    """Split a result alias into its ``task`` and ``config`` components.

    A trailing ``_base`` or ``_cot`` marker is stripped first. What remains is
    either a bare ``<task>`` (``config`` becomes ``""``) or ``<config>_<task>``.

    Args:
        alias: Alias string taken from a result record.

    Returns:
        A Series with the keys ``"task"`` and ``"config"``.

    Raises:
        ValueError: If the stripped alias contains more than one underscore
            and therefore cannot be split unambiguously.
    """
    for suffix in ("_base", "_cot"):
        if alias.endswith(suffix):
            alias = alias[: -len(suffix)]
            break

    parts = alias.split("_")
    if len(parts) == 1:
        task, config = alias, ""
    elif len(parts) == 2:
        config, task = parts
    else:
        # Fail with a message that names the offending alias instead of the
        # generic "too many values to unpack" from tuple unpacking.
        raise ValueError(
            f"Cannot split alias {alias!r} into '<config>_<task>': "
            "more than one underscore"
        )

    return pd.Series({"task": task, "config": config})


def _collect_eval_records() -> list:
    """Read all result JSON files below ``EVAL_RESULTS_PATH/data``.

    Every entry of a file's ``"results"`` mapping becomes one record (a copy
    of the entry) tagged with the ``model`` and ``subset`` it was found under.
    Files without a ``"results"`` key are skipped.

    Returns:
        A list of record dicts, one per result entry.
    """
    data_root = f"{EVAL_RESULTS_PATH}/data/"

    # Model ids are the two path components directly below the data root.
    models = [
        path.replace(data_root, "")
        for path in glob.glob(f"{data_root}*/*", recursive=False)
    ]

    records = []
    for model in models:
        for subset in SUBSETS:
            pattern = f"{data_root}{model}/{subset}/**/*.json"
            for json_filepath in glob.glob(pattern, recursive=True):
                # Explicit encoding: do not depend on the platform default.
                with open(json_filepath, encoding="utf-8") as fp:
                    data = json.load(fp)
                if "results" not in data:
                    continue
                for result in data["results"].values():
                    record = result.copy()
                    record["model"] = model
                    record["subset"] = subset
                    records.append(record)
    return records


def _load_cot_configs() -> pd.DataFrame:
    """Load the de-duplicated ``config_data`` entries of the traces dataset.

    Returns:
        A DataFrame with one row per distinct, non-null ``config_data`` entry
        of the ``test`` split of ``TRACES_DATASET``.
    """
    dataset = datasets.load_dataset(TRACES_DATASET, split="test", token=TOKEN, num_proc=8)
    dataset = dataset.select_columns(["config_data"])
    config_data = [dict(data) for data in dataset["config_data"] if data is not None]
    del dataset  # release the dataset before building the frame

    df_cotconfigs = pd.DataFrame(config_data)
    df_cotconfigs.drop_duplicates(inplace=True, ignore_index=True)
    return df_cotconfigs


def load_cot_data():
    """Download evaluation results and build the CoT accuracy tables.

    Steps:
      1. Download the ``EVAL_DATASET`` snapshot into ``EVAL_RESULTS_PATH``.
      2. Flatten all result JSON files into one frame and derive ``task`` and
         ``config`` from each record's ``alias``.
      3. Compute, per (model, task), the mean baseline accuracy from the
         ``"base"`` subset and attach it to every ``"cot"`` row, together with
         the absolute (``acc_gain``) and relative (``delta_rel``) difference.
      4. Prepend per-(model, config) averages over all tasks as ``task="all"``.
      5. Attach the matching config fields from the traces dataset.
      6. Scale ``acc_base``, ``acc_cot`` and ``acc_gain`` by 100
         (``delta_rel`` is left unscaled).
      7. Aggregate per (model, task) into an error frame.

    Returns:
        A tuple ``(df_cot_err, df_cot)``:

        * ``df_cot_err`` -- one row per (model, task) with the columns
          ``"marginal acc. gain"`` (mean), ``"acc_gain-min"``,
          ``"acc_gain-max"``, ``"base accuracy"``, ``"cot accuracy"`` and
          ``"acc_gain-err"`` (half the min-max range of ``acc_gain``).
        * ``df_cot`` -- one row per (model, task, config) plus the
          ``task="all"`` averages, with accuracy columns and config fields.
    """
    ####
    # Load and process evaluation results
    ####
    print("Downloading evaluation results...")
    snapshot_download(
        repo_id=EVAL_DATASET,
        revision="main",
        local_dir=EVAL_RESULTS_PATH,
        repo_type="dataset",
        max_workers=8,
        token=TOKEN,
    )

    df_results = pd.DataFrame(_collect_eval_records())
    df_results = pd.concat([df_results, df_results.alias.apply(_split_alias)], axis=1)

    # Mean baseline accuracy per (model, task), indexed by that pair.
    df_baseline = (
        df_results[df_results.subset.eq("base")]
        .groupby(["model", "task"])[["acc,none"]]
        .mean()
    )

    df_tmp1 = df_results[df_results.subset.eq("cot")].sort_values(by=["model", "task", "config"])
    df_tmp1.reset_index(inplace=True, drop=True)

    df_cot = df_tmp1[["model", "task", "config"]].copy()
    df_cot["acc_cot"] = df_tmp1["acc,none"]
    # Left join against the baseline's (model, task) index instead of a
    # per-row .loc lookup; a missing baseline yields NaN rather than KeyError.
    df_cot = df_cot.join(
        df_baseline.rename(columns={"acc,none": "acc_base"}),
        on=["model", "task"],
    )

    df_cot["acc_gain"] = df_cot.acc_cot - df_cot.acc_base
    df_cot["delta_rel"] = (df_cot.acc_cot - df_cot.acc_base) / df_cot.acc_base

    # Average over all tasks for each (model, config) and mark those rows.
    df_cot_avg = df_cot.groupby(["model", "config"]).mean(numeric_only=True).reset_index()
    df_cot_avg["task"] = "all"

    df_cot = pd.concat([df_cot_avg, df_cot], ignore_index=True)

    ####
    # Load and attach the CoT configuration data
    ####
    print("Loading traces data...")
    df_cotconfigs = _load_cot_configs()

    def select_config_data(row):
        """Return the config fields matching the row's config name and model."""
        df_selected = df_cotconfigs[
            df_cotconfigs.name.eq(row.config) & df_cotconfigs.model.eq(row.model)
        ]
        if len(df_selected) == 0:
            print(f"Config {row.config} not found for model {row.model}")
            return None
        return df_selected.drop(columns=["name", "model", "task"]).iloc[0]

    df_cot = pd.concat(
        [
            df_cot,
            df_cot.apply(select_config_data, axis=1),
        ],
        axis=1,
    )

    # Express accuracies and the absolute gain on a 0-100 scale.
    for col in ['acc_base', 'acc_cot', 'acc_gain']:
        df_cot[col] = 100 * df_cot[col]

    print("Regimes dataframe created:")
    print(df_cot.head(3))

    ####
    # Aggregate per (model, task) into the error dataframe
    ####
    df_cot_err = df_cot.groupby(["model", "task"]).agg(
        {'acc_gain': ['mean', 'min', 'max'], "acc_base": "mean", "acc_cot": "mean"}
    )
    # Flatten the (column, aggregation) MultiIndex into "column-aggregation".
    df_cot_err.columns = ['-'.join(col).strip() for col in df_cot_err.columns.values]
    df_cot_err["acc_gain-err"] = 0.5 * (df_cot_err["acc_gain-max"] - df_cot_err["acc_gain-min"])
    df_cot_err.reset_index(inplace=True)
    df_cot_err.rename(
        columns={
            "acc_base-mean": "base accuracy",
            "acc_cot-mean": "cot accuracy",
            "acc_gain-mean": "marginal acc. gain",
        },
        inplace=True,
    )

    print("Error dataframe created:")
    print(df_cot_err.head(3))

    return df_cot_err, df_cot