import ast
from urllib.parse import quote

import gradio as gr
import numpy as np
import urllib3
from bs4 import BeautifulSoup
from datasets import load_dataset
from huggingface_hub import (
    CommitOperationAdd,
    EvalResult,
    ModelCard,
    RepoUrl,
    create_commit,
)
from huggingface_hub.repocard_data import eval_results_to_model_index
from pytablewriter import MarkdownTableWriter

COMMIT_DESCRIPTION = """This is an automated PR created with [this space](https://huggingface.co/spaces/T145/open-llm-leaderboard-results-to-modelcard)!

The purpose of this PR is to add evaluation results from the Open LLM Leaderboard to your model card.

Please report any issues here: https://huggingface.co/spaces/T145/open-llm-leaderboard-results-to-modelcard/discussions"""


KEY_IFEVAL = "IFEval"
KEY_BBH = "BBH"
KEY_MATH = "MATH Lvl 5"
KEY_GPQA = "GPQA"
KEY_MUSR = "MUSR"
KEY_MMLU = "MMLU-PRO"


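# Rescale a raw score to a 0-100 range where `lower_bound` (typically the task's
# random-guess baseline) maps to 0 and `higher_bound` maps to 100; scores below
# the baseline are clipped to 0.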
def normalize_within_range(value, lower_bound=0, higher_bound=1):
    return np.clip(value - lower_bound, 0, None) / (higher_bound - lower_bound) * 100


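# Fetch the newest results JSON for `repo` from the open-llm-leaderboard/results
# dataset and compute the normalized per-benchmark scores from it.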
def calculate_results(repo: str, pool: urllib3.PoolManager):
    try:
        base_url = f"https://huggingface.co/datasets/open-llm-leaderboard/results/tree/main/{repo}"
        html = pool.request("GET", base_url).data
        soup = BeautifulSoup(html, "html.parser")
        dl_link = soup.find_all(title="Download file")[-1]["href"]
        data = pool.request("GET", f"https://huggingface.co{dl_link}").json()

        del base_url
        del html
        del soup
        del dl_link

        precision = data["config"]["model_dtype"]
        revision = data["config"]["model_revision"]

        # BBH: normalize each subtask against its random-guess baseline
        # (1 / number of choices), then average across subtasks.
        bbh_scores = []
        for subtask_key in data["group_subtasks"]["leaderboard_bbh"]:
            num_choices = len(data["configs"][subtask_key]["doc_to_choice"])
            if subtask_key in data["results"]:
                bbh_raw_score = data["results"][subtask_key]["acc_norm,none"]
                lower_bound = 1 / num_choices
                normalized_score = normalize_within_range(bbh_raw_score, lower_bound, 1.0)
                bbh_scores.append(normalized_score)

        bbh_score = sum(bbh_scores) / len(bbh_scores)
        bbh_score = float(round(bbh_score, 2))

        # MATH Lvl 5: exact match has no guessing baseline, so the lower bound is 0.
        math_raw_score = data["results"]["leaderboard_math_hard"]["exact_match,none"]
        math_score = normalize_within_range(math_raw_score, 0, 1.0)
        math_score = float(round(math_score, 2))

        # GPQA: four answer choices, so random guessing scores 0.25.
        gpqa_raw_score = data["results"]["leaderboard_gpqa"]["acc_norm,none"]
        gpqa_score = normalize_within_range(gpqa_raw_score, 0.25, 1.0)
        gpqa_score = float(round(gpqa_score, 2))

        # MMLU-PRO: ten answer choices, so random guessing scores 0.1.
        mmlu_raw_score = data["results"]["leaderboard_mmlu_pro"]["acc,none"]
        mmlu_score = normalize_within_range(mmlu_raw_score, 0.1, 1.0)
        mmlu_score = float(round(mmlu_score, 2))

        # IFEval: average the instruction-level and prompt-level strict accuracies.
        ifeval_inst_score = (
            data["results"]["leaderboard_ifeval"]["inst_level_strict_acc,none"] * 100
        )
        ifeval_prompt_score = (
            data["results"]["leaderboard_ifeval"]["prompt_level_strict_acc,none"] * 100
        )

        ifeval_score = (ifeval_inst_score + ifeval_prompt_score) / 2
        ifeval_score = float(round(ifeval_score, 2))

        # MuSR: choice counts vary by subtask, so load each subtask's dataset to
        # find the largest choice list and derive the guessing baseline from it.
        musr_scores = []
        for subtask_key in data["group_subtasks"]["leaderboard_musr"]:
            subtask_config = data["configs"][subtask_key]
            dataset = load_dataset(subtask_config["dataset_path"], split=subtask_config["test_split"])
            # "choices" is stored as a stringified Python list; parse it safely.
            num_choices = max(len(ast.literal_eval(question["choices"])) for question in dataset)
            musr_raw_score = data["results"][subtask_key]["acc_norm,none"]
            lower_bound = 1 / num_choices
            normalized_score = normalize_within_range(musr_raw_score, lower_bound, 1.0)

            musr_scores.append(normalized_score)
            del dataset

        musr_score = sum(musr_scores) / len(musr_scores)
        musr_score = float(round(musr_score, 2))

        # The reported average is the unweighted mean of the six benchmark scores.
        average_score = (
            bbh_score + math_score + gpqa_score + mmlu_score + musr_score + ifeval_score
        ) / 6
        average_score = float(round(average_score, 2))

        results = {
            "Model": repo,
            "Precision": precision,
            "Revision": revision,
            "Average": average_score,
            KEY_IFEVAL: ifeval_score,
            KEY_BBH: bbh_score,
            KEY_MATH: math_score,
            KEY_GPQA: gpqa_score,
            KEY_MUSR: musr_score,
            KEY_MMLU: mmlu_score,
        }

        return results
    except Exception:
        # Any scraping or parsing failure is reported as "no results" to the caller.
        return None


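# Links to the leaderboard's companion pages for a given repo: the per-model
# details dataset, the summarized contents viewer, and the leaderboard search UI.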
def get_details_url(repo: str):
    author, model = repo.split("/")
    return f"https://huggingface.co/datasets/open-llm-leaderboard/{author}__{model}-details"


def get_contents_url(repo: str):
    param = quote(repo, safe="")
    return f"https://huggingface.co/datasets/open-llm-leaderboard/contents/viewer/default/train?q={param}&sort[column]=Average%20%E2%AC%86%EF%B8%8F&sort[direction]=desc"


def get_query_url(repo: str):
    param = quote(repo, safe="")
    return f"https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search={param}"


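# Per-benchmark metadata describing each leaderboard task; used both for the
# markdown summary table and for the model card's model-index eval results.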
def get_task_summary(results):
    return {
        KEY_IFEVAL: {
            "dataset_type": "wis-k/instruction-following-eval",
            "dataset_name": "IFEval (0-Shot)",
            "metric_type": "inst_level_strict_acc and prompt_level_strict_acc",
            "metric_value": results[KEY_IFEVAL],
            "dataset_config": None,
            "dataset_split": "train",
            "dataset_args": {"num_few_shot": 0},
            "metric_name": "averaged accuracy",
        },
        KEY_BBH: {
            "dataset_type": "SaylorTwift/bbh",
            "dataset_name": "BBH (3-Shot)",
            "metric_type": "acc_norm",
            "metric_value": results[KEY_BBH],
            "dataset_config": None,
            "dataset_split": "test",
            "dataset_args": {"num_few_shot": 3},
            "metric_name": "normalized accuracy",
        },
        KEY_MATH: {
            "dataset_type": "lighteval/MATH-Hard",
            "dataset_name": "MATH Lvl 5 (4-Shot)",
            "metric_type": "exact_match",
            "metric_value": results[KEY_MATH],
            "dataset_config": None,
            "dataset_split": "test",
            "dataset_args": {"num_few_shot": 4},
            "metric_name": "exact match",
        },
        KEY_GPQA: {
            "dataset_type": "Idavidrein/gpqa",
            "dataset_name": "GPQA (0-shot)",
            "metric_type": "acc_norm",
            "metric_value": results[KEY_GPQA],
            "dataset_config": None,
            "dataset_split": "train",
            "dataset_args": {"num_few_shot": 0},
            "metric_name": "acc_norm",
        },
        KEY_MUSR: {
            "dataset_type": "TAUR-Lab/MuSR",
            "dataset_name": "MuSR (0-shot)",
            "metric_type": "acc_norm",
            "metric_value": results[KEY_MUSR],
            "dataset_config": None,
            "dataset_split": None,
            "dataset_args": {"num_few_shot": 0},
            "metric_name": "acc_norm",
        },
        KEY_MMLU: {
            "dataset_type": "TIGER-Lab/MMLU-Pro",
            "dataset_name": "MMLU-PRO (5-shot)",
            "metric_type": "acc",
            "metric_value": results[KEY_MMLU],
            "dataset_config": "main",
            "dataset_split": "test",
            "dataset_args": {"num_few_shot": 5},
            "metric_name": "accuracy",
        },
    }


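# Render the "Open LLM Leaderboard Evaluation Results" markdown section that is
# appended to the README body, with links and a summary table of the scores.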
def get_eval_results(repo: str, results: dict):
    task_summary = get_task_summary(results)
    table = MarkdownTableWriter()
    table.headers = ["Metric", "Value (%)"]
    table.value_matrix = [["**Average**", results["Average"]]] + [
        [v["dataset_name"], v["metric_value"]] for v in task_summary.values()
    ]

    text = f"""
# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
Detailed results can be found [here]({get_details_url(repo)})!
Summarized results can be found [here]({get_contents_url(repo)})!

{table.dumps()}
"""
    return text


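# Merge the new eval results into the model card's YAML metadata, either creating
# a fresh model-index or appending entries that are not already present.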
def get_edited_yaml_readme(repo: str, results: dict, token: str | None):
    card = ModelCard.load(repo, token=token)

    common = {
        "task_type": "text-generation",
        "task_name": "Text Generation",
        "source_name": "Open LLM Leaderboard",
        "source_url": get_query_url(repo),
    }

    tasks_results = get_task_summary(results)

    if not card.data["eval_results"]:
        card.data["model-index"] = eval_results_to_model_index(
            repo.split("/")[1],
            [EvalResult(**task, **common) for task in tasks_results.values()],
        )
    else:
        for task in tasks_results.values():
            cur_result = EvalResult(**task, **common)
            if any(
                result.is_equal_except_value(cur_result)
                for result in card.data["eval_results"]
            ):
                continue
            card.data["eval_results"].append(cur_result)

    return str(card)


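# Gradio callback: compute the leaderboard results for `repo` and open a pull
# request (or push to an existing PR branch) that updates the model's README.md.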
def commit(
    repo,
    pr_number=None,
    message="Adding Evaluation Results",
    oauth_token: gr.OAuthToken | None = None,
):
    if not oauth_token:
        raise gr.Warning("You are not logged in. Click on 'Sign in with Huggingface' to log in.")
    else:
        # gr.OAuthToken wraps the raw access token string needed by huggingface_hub.
        token = oauth_token.token

    if repo.startswith("https://huggingface.co/"):
        try:
            repo = RepoUrl(repo).repo_id
        except Exception as e:
            raise gr.Error(f"Not a valid repo id: {str(repo)}") from e

    with urllib3.PoolManager() as pool:
        results = calculate_results(repo, pool)

    # calculate_results() returns None when the results JSON could not be fetched or parsed.
    if results is None:
        raise gr.Error(f"No leaderboard results were found for '{repo}'!")

    edited = {"revision": f"refs/pr/{pr_number}"} if pr_number else {"create_pr": True}

    try:
        try:
            readme_text = get_edited_yaml_readme(
                repo, results, token=token
            ) + get_eval_results(repo, results)
        except Exception as e:
            if "Repo card metadata block was not found." in str(e):
                # The README has no YAML metadata block yet, so only the metadata is added.
                readme_text = get_edited_yaml_readme(repo, results, token=token)
            else:
                print(f"Something went wrong: {e}")
                raise  # Re-raise so readme_text is never referenced while undefined.

        ops = [
            CommitOperationAdd(
                path_in_repo="README.md", path_or_fileobj=readme_text.encode()
            )
        ]
        commit_url = create_commit(
            repo_id=repo,
            token=token,
            operations=ops,
            commit_message=message,
            commit_description=COMMIT_DESCRIPTION,
            repo_type="model",
            **edited,
        ).pr_url

        return commit_url

    except Exception as e:
        if "Discussions are disabled for this repo" in str(e):
            return "Discussions disabled"
        elif "Cannot access gated repo" in str(e):
            return "Gated repo"
        elif "Repository Not Found" in str(e):
            return "Repository Not Found"
        else:
            return str(e)