import os
import pandas as pd
from src.display.utils import AutoEvalColumn


def get_leaderboard_df_crm(
    crm_results_path: str, accuracy_cols: list, cost_cols: list
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Builds the accuracy, cost, and trust & safety dataframes from the individual CRM experiment results."""
    # Mapping from each use case to the flavor used for the cost/speed measurements.
    use_case_flavor_mapping_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_flavor_mapping.csv"))

    # Salesforce fine-tuned models excluded from the leaderboard tables.
    sf_finetuned_models = ["SF-TextBase 70B", "SF-TextBase 7B", "SF-TextSum"]
    # sf_finetuned_models = []

    # Accuracy results: filter excluded models, then attach the cost/speed flavor per use case.
    leaderboard_accuracy_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_accuracy.csv"))
    leaderboard_accuracy_df = leaderboard_accuracy_df[~leaderboard_accuracy_df["Model Name"].isin(sf_finetuned_models)]
    leaderboard_accuracy_df = leaderboard_accuracy_df.join(
        use_case_flavor_mapping_df[["Use Case Name", "Cost and Speed: Flavor"]].set_index("Use Case Name"),
        on="Use Case Name",
    )

    # Model -> provider lookup reused by the cost and trust & safety tables.
    ref_df = leaderboard_accuracy_df[["Model Name", "LLM Provider"]].drop_duplicates()

    # Latency/cost results: filter excluded models and merge into the accuracy table on model and flavor.
    leaderboard_cost_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_latency_cost.csv"))
    leaderboard_cost_df = leaderboard_cost_df[~leaderboard_cost_df["Model Name"].isin(sf_finetuned_models)]
    leaderboard_accuracy_df = leaderboard_accuracy_df.join(
        leaderboard_cost_df.set_index(["Model Name", "Cost and Speed: Flavor"]),
        on=["Model Name", "Cost and Speed: Flavor"],
    )
    leaderboard_cost_df = leaderboard_cost_df.join(ref_df.set_index("Model Name"), on="Model Name")
    leaderboard_cost_df = leaderboard_cost_df[cost_cols].round(decimals=2)

    # Trust & safety results: filter excluded models, attach the provider lookup and the CRM bias scores.
    leaderboard_ts_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_ts.csv"))
    leaderboard_ts_crm_bias_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_crm_bias.csv"))
    leaderboard_ts_df = leaderboard_ts_df[~leaderboard_ts_df["Model Name"].isin(sf_finetuned_models)]
    leaderboard_ts_df = leaderboard_ts_df.join(ref_df.set_index("Model Name"), on="Model Name")
    leaderboard_ts_df = leaderboard_ts_df.join(leaderboard_ts_crm_bias_df.set_index("Model Name"), on="Model Name")

    # Average the four privacy metrics (percentage strings) into a single Privacy score.
    privacy_cols = leaderboard_ts_df[
        [
            "Privacy Zero-Shot Match Avoidance",
            "Privacy Zero-Shot Reveal Avoidance",
            "Privacy Five-Shot Match Avoidance",
            "Privacy Five-Shot Reveal Avoidance",
        ]
    ].apply(lambda x: x.str.rstrip("%").astype("float") / 100.0, axis=1)
    leaderboard_ts_df["Privacy"] = privacy_cols.mean(axis=1).transform(lambda x: "{:,.2%}".format(x))

    # Keep only the point estimate of the CRM bias score, dropping the confidence interval.
    leaderboard_ts_df["Bias No CI"] = leaderboard_ts_df["CRM Bias"].transform(lambda x: x.split(" ")[0])

    # Average safety, privacy, truthfulness, and bias into an overall Trust & Safety score.
    ts_cols = leaderboard_ts_df[
        [
            "Safety",
            "Privacy",
            "Truthfulness",
            "Bias No CI",
        ]
    ].apply(lambda x: x.str.rstrip("%").astype("float") / 100.0, axis=1)
    leaderboard_ts_df["Trust & Safety"] = ts_cols.mean(axis=1).transform(lambda x: "{:,.2%}".format(x))

    # Rank models by average accuracy and keep only the requested columns.
    leaderboard_accuracy_df = leaderboard_accuracy_df.sort_values(
        by=[AutoEvalColumn.accuracy_metric_average.name], ascending=False
    )
    leaderboard_accuracy_df = leaderboard_accuracy_df[accuracy_cols].round(decimals=2)

    return leaderboard_accuracy_df, leaderboard_cost_df, leaderboard_ts_df
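

# Minimal usage sketch (not part of the original module): assumes the CSVs read above
# live under a local "crm_results" directory and that the column lists below match the
# headers in those files; the paths and column names are illustrative, adjust them to
# your data before running.
if __name__ == "__main__":
    accuracy_df, cost_df, ts_df = get_leaderboard_df_crm(
        crm_results_path="crm_results",
        accuracy_cols=["Model Name", "LLM Provider", "Use Case Name", "Accuracy"],
        cost_cols=["Model Name", "LLM Provider", "Cost and Speed: Flavor", "Mean Latency (sec)"],
    )
    print(accuracy_df.head())
    print(cost_df.head())
    print(ts_df.head())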