import os
import pandas as pd
from src.display.utils import AutoEvalColumn


def get_leaderboard_df_crm(
    crm_results_path: str, accuracy_cols: list, ts_cols: list
) -> pd.DataFrame:
"""Creates a dataframe from all the individual experiment results"""
use_case_flavor_mapping_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_flavor_mapping.csv"))
sf_finetuned_models = ["SF-TextBase 70B", "SF-TextBase 7B", "SF-TextSum"]
# sf_finetuned_models = []
leaderboard_accuracy_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_accuracy.csv"))
leaderboard_accuracy_df = leaderboard_accuracy_df[~leaderboard_accuracy_df["Model Name"].isin(sf_finetuned_models)]
leaderboard_accuracy_df = leaderboard_accuracy_df.join(
use_case_flavor_mapping_df[["Use Case Name", "Cost and Speed: Flavor"]].set_index("Use Case Name"),
on="Use Case Name",
)
    # Latency/cost results are keyed by (model, flavor); join them onto the accuracy rows.
    leaderboard_cost_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_latency_cost.csv"))
    leaderboard_cost_df = leaderboard_cost_df[~leaderboard_cost_df["Model Name"].isin(sf_finetuned_models)]
    leaderboard_accuracy_df = leaderboard_accuracy_df.join(
        leaderboard_cost_df.set_index(["Model Name", "Cost and Speed: Flavor"]),
        on=["Model Name", "Cost and Speed: Flavor"],
    )
    # Trust & safety results, with the CRM bias scores joined on model name.
    leaderboard_ts_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_ts.csv"))
    leaderboard_ts_crm_bias_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_crm_bias.csv"))
    leaderboard_ts_df = leaderboard_ts_df[~leaderboard_ts_df["Model Name"].isin(sf_finetuned_models)]
    leaderboard_ts_df = leaderboard_ts_df.join(leaderboard_ts_crm_bias_df.set_index("Model Name"), on="Model Name")
    # Average the four privacy sub-metrics (stored as "NN%" strings) into one "Privacy" score.
    privacy_cols = leaderboard_ts_df[
        [
            "Privacy Zero-Shot Match Avoidance",
            "Privacy Zero-Shot Reveal Avoidance",
            "Privacy Five-Shot Match Avoidance",
            "Privacy Five-Shot Reveal Avoidance",
        ]
    ].apply(lambda x: x.str.rstrip("%").astype("float") / 100.0, axis=1)
    leaderboard_ts_df["Privacy"] = privacy_cols.mean(axis=1).transform(lambda x: "{:,.2%}".format(x))
    # "CRM Fairness" is reported with a confidence interval; keep only the point estimate.
    leaderboard_ts_df["Bias No CI"] = leaderboard_ts_df["CRM Fairness"].transform(lambda x: x.split(" ")[0])
    # Roll the level-2 trust & safety scores up into a single "Trust & Safety" average.
    ts_lvl2_cols = leaderboard_ts_df[
        [
            "Safety",
            "Privacy",
            "Truthfulness",
            "Bias No CI",
        ]
    ].apply(lambda x: x.str.rstrip("%").astype("float") / 100.0, axis=1)
    leaderboard_ts_df["Trust & Safety"] = ts_lvl2_cols.mean(axis=1).transform(lambda x: "{:,.2%}".format(x))
    # Attach the requested trust & safety columns, then sort and trim to the display columns.
    leaderboard_accuracy_df = leaderboard_accuracy_df.join(
        leaderboard_ts_df[ts_cols].set_index(["Model Name"]),
        on=["Model Name"],
    )
    leaderboard_accuracy_df = leaderboard_accuracy_df.sort_values(
        by=[AutoEvalColumn.use_case_name.name, AutoEvalColumn.accuracy_metric_average.name], ascending=[True, False]
    )
    leaderboard_accuracy_df = leaderboard_accuracy_df[accuracy_cols].round(decimals=2)
    return leaderboard_accuracy_df
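

# Illustrative call, as a minimal sketch: the results directory and column lists
# below are assumptions for demonstration, not values taken from this repo. Note
# that ts_cols must include "Model Name", since the function sets it as the index
# for the final trust & safety join.
if __name__ == "__main__":
    accuracy_cols = ["Model Name", "Use Case Name", "Accuracy"]  # hypothetical display columns
    ts_cols = ["Model Name", "Trust & Safety"]  # "Model Name" is the join key
    df = get_leaderboard_df_crm("crm-results", accuracy_cols, ts_cols)
    print(df.head())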