Spaces:

Bias-Leaderboard
/

leaderboard

Sleeping

File size: 4,592 Bytes

460d762
 
 
 
4aff44e
460d762
d6b3d82
460d762
 
 
 
d16cee2
 
460d762
d16cee2
460d762
d16cee2
 
460d762
 
 
 
 
 
 
 
 
 
6eaad72
b323764
 
460d762
 
 
 
 
 
 
 
 
b323764
12cea14
b323764
460d762
 
 
d350941
460d762
 
d16cee2
460d762
 
 
 
 
 
 
 
d16cee2
460d762
 
4aff44e
97b27da
 
 
 
460d762
4aff44e
 
 
 
d16cee2
 
 
 
 
 
 
6eaad72
 
d16cee2
 
 
 
 
2bb5ded
460d762
d16cee2
 
2bb5ded
460d762
d16cee2
460d762
ba25d90
 
d16cee2
d350941
d16cee2
6eaad72
d16cee2
460d762
d16cee2
460d762
 
699e8ff
d6b3d82
d16cee2
d6b3d82
 
 
 
 
 
6eaad72
d6b3d82
 
 
6eaad72
d6b3d82
6eaad72
 
 
d6b3d82
 
460d762
d16cee2
 
 
 
 
 
460d762
 
 
 
 
 
699e8ff
 
460d762

from dataclasses import dataclass

import glob
import json
import os
from typing import Dict, List, Tuple
import dateutil

from src.utils_display import AutoEvalColumn, make_clickable_model
import numpy as np

METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
BENCH_TO_NAME = {
    "arc:challenge": AutoEvalColumn.arc.name,
    "hellaswag": AutoEvalColumn.hellaswag.name,
    "hendrycksTest": AutoEvalColumn.mmlu.name,
    "truthfulqa:mc": AutoEvalColumn.truthfulqa.name,
}


@dataclass
class EvalResult:
    eval_name: str
    org: str
    model: str
    revision: str
    results: dict
    precision: str = ""
    model_type: str = ""
    weight_type: str = ""

    def to_dict(self):
        if self.org is not None:
            base_model = f"{self.org}/{self.model}"
        else:
            base_model = f"{self.model}"
        data_dict = {}

        data_dict["eval_name"] = self.eval_name # not a column, just a save name
        data_dict["weight_type"] = self.weight_type  # not a column, just a save name
        data_dict[AutoEvalColumn.precision.name] = self.precision
        data_dict[AutoEvalColumn.model_type.name] = self.model_type
        data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
        data_dict[AutoEvalColumn.dummy.name] = base_model
        data_dict[AutoEvalColumn.revision.name] = self.revision
        data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) / 4.0

        for benchmark in BENCHMARKS:
            if benchmark not in self.results.keys():
                self.results[benchmark] = None

        for k, v in BENCH_TO_NAME.items():
            data_dict[v] = self.results[k]

        return data_dict


def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
    with open(json_filepath) as fp:
        data = json.load(fp)

    
    for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
        if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
            return None, [] # we skip models with the wrong version 

    try:
        config = data["config"]
    except KeyError:
        config = data["config_general"]
    model = config.get("model_name", None)
    if model is None:
        model = config.get("model_args", None)

    model_sha = config.get("model_sha", "")
    model_split = model.split("/", 1)

    precision = config.get("model_dtype")

    model = model_split[-1]

    if len(model_split) == 1:
        org = None
        model = model_split[0]
        result_key = f"{model}_{model_sha}_{precision}"
    else:
        org = model_split[0]
        model = model_split[1]
        result_key =  f"{org}_{model}_{model_sha}_{precision}"

    eval_results = []
    for benchmark, metric in zip(BENCHMARKS, METRICS):
        accs = np.array([v.get(metric, None) for k, v in data["results"].items() if benchmark in k])
        if accs.size == 0 or any([acc is None for acc in accs]):
            continue
        mean_acc = np.mean(accs) * 100.0
        eval_results.append(EvalResult(
            eval_name=result_key, org=org, model=model, revision=model_sha, results={benchmark: mean_acc}, precision=precision, #todo model_type=, weight_type=
        ))

    return result_key, eval_results


def get_eval_results() -> List[EvalResult]:
    json_filepaths = []

    for root, dir, files in os.walk("eval-results"):
        # We should only have json files in model results
        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
            continue

        # Sort the files by date
        # store results by precision maybe?
        try:
            files.sort(key=lambda x:  dateutil.parser.parse(x.split("_", 1)[-1][:-5]))
        except dateutil.parser._parser.ParserError:
            files = [files[-1]]

        #up_to_date = files[-1]
        for file in files:
            json_filepaths.append(os.path.join(root, file))

    eval_results = {}
    for json_filepath in json_filepaths:
        result_key, results = parse_eval_result(json_filepath)
        for eval_result in results:
            if result_key in eval_results.keys():
                eval_results[result_key].results.update(eval_result.results)
            else:
                eval_results[result_key] = eval_result

    eval_results = [v for v in eval_results.values()]

    return eval_results


def get_eval_results_dicts() -> List[Dict]:
    eval_results = get_eval_results()

    return [e.to_dict() for e in eval_results]