import json

import numpy as np
import pandas as pd
import yaml
from datasets import load_dataset
from sklearn.metrics import cohen_kappa_score

from .envs import TOKEN

# Column types for the leaderboard table: rank, model (HTML link),
# and the four numeric metrics.
TYPES = ["number", "html", "number", "number", "number", "number"]


def read_json(file_path: str) -> list[dict]:
    """
    Read a JSON/JSONL file and return its contents as a list of dictionaries.

    Parameters:
        file_path (str): The path to the JSON or JSONL file.

    Returns:
        list[dict]: The contents of the file as a list of dictionaries.
    """
    try:
        # Try JSONL first: one JSON object per line.
        with open(file_path) as f:
            data = [json.loads(x) for x in f]
        return data
    except json.decoder.JSONDecodeError:
        # Fall back to a plain JSON file.
        with open(file_path) as f:
            data = json.load(f)
        return data


def pairwise_compare(
    evaluator1_responses: list[dict],
    evaluator2_responses: list[dict],
) -> tuple[float, float]:
    """
    Compare the winner picks of two pairwise evaluators.

    Args:
        evaluator1_responses: The responses from the first evaluator.
        evaluator2_responses: The responses from the second evaluator.

    Returns:
        tuple[float, float]: The raw accuracy and Cohen's kappa agreement.
    """
    assert len(evaluator1_responses) == len(evaluator2_responses)
    evaluator1_winners = np.array([response["winner"] for response in evaluator1_responses])
    evaluator2_winners = np.array([response["winner"] for response in evaluator2_responses])
    acc = (evaluator1_winners == evaluator2_winners).mean().item()
    agreement = cohen_kappa_score(evaluator1_winners, evaluator2_winners)
    return acc, agreement


def pairwise_meta_eval(
    human_responses: list[dict],
    model_dir: str,
    model_dir_swap: str,
) -> tuple[float, float, float, float]:
    """
    Evaluate a pairwise evaluator against human judgments.

    Args:
        human_responses: The responses from the human evaluator.
        model_dir: The path to the model responses.
        model_dir_swap: The path to the model responses with swapped candidate order.

    Returns:
        tuple[float, float, float, float]: The human-model accuracy and agreement,
        averaged over the original and swapped orderings, followed by the model's
        self-accuracy and self-agreement across the two orderings.
    """
    model_responses = read_json(model_dir)
    model_responses_swap = read_json(model_dir_swap)
    acc, agr = pairwise_compare(human_responses, model_responses)
    swap_acc, swap_agr = pairwise_compare(human_responses, model_responses_swap)
    # Average over the two orderings to reduce position bias.
    acc = (acc + swap_acc) / 2
    agr = (agr + swap_agr) / 2
    # Consistency of the model with itself across the two orderings.
    models_acc, models_agr = pairwise_compare(model_responses, model_responses_swap)
    return acc, agr, models_acc, models_agr


def load_leaderboard() -> pd.DataFrame:
    """Load the leaderboard from the file system."""
    with open("./data/models.yaml") as fp:
        models = yaml.safe_load(fp)
    human_responses = load_dataset(
        "salesforce/instrusum", "human_eval_pairwise", token=TOKEN
    )["data"]
    human_responses = [x for x in human_responses]
    predictions = {k: [] for k in ["Model", "Accuracy", "Agreement", "Self-Accuracy", "Self-Agreement"]}
    for model in models:
        fdir = model["fdir"]
        acc, agr, models_acc, models_agr = pairwise_meta_eval(
            human_responses,
            f"./predictions/{fdir}.jsonl",
            f"./predictions/{fdir}_swap.jsonl",
        )
        # Render the model name as an HTML link (the "Model" column uses the "html" type).
        link = model["url"]
        model_name = model["name"]
        output = f'<a href="{link}" target="_blank">{model_name}</a>'
        predictions["Model"].append(output)
        predictions["Accuracy"].append(acc)
        predictions["Agreement"].append(agr)
        predictions["Self-Accuracy"].append(models_acc)
        predictions["Self-Agreement"].append(models_agr)
    df = pd.DataFrame(predictions).sort_values(by="Agreement", ascending=False).round(decimals=3)
    df.reset_index(drop=True, inplace=True)
    # Prepend a 1-based rank column.
    df[" "] = pd.Series(range(1, len(df) + 1))
    columns = [" "] + [col for col in df.columns if col != " "]
    df = df[columns]
    return df
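

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): build the
# leaderboard table and print it for a quick sanity check. This assumes the
# working directory contains ./data/models.yaml and the ./predictions/*.jsonl
# files, and that the file is executed as a module (python -m ...) so the
# relative import of TOKEN resolves.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    leaderboard = load_leaderboard()
    print(leaderboard.to_string(index=False))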