from urllib.parse import quote

import gradio as gr
import numpy as np
import urllib3
from bs4 import BeautifulSoup
from datasets import load_dataset
from huggingface_hub import (
    CommitOperationAdd,
    EvalResult,
    ModelCard,
    RepoUrl,
    create_commit,
)
from huggingface_hub.repocard_data import eval_results_to_model_index
from pytablewriter import MarkdownTableWriter

COMMIT_DESCRIPTION = """This is an automated PR created with [this space](https://huggingface.co/spaces/T145/open-llm-leaderboard-results-to-modelcard)!

The purpose of this PR is to add evaluation results from the Open LLM Leaderboard to your model card.

Please report any issues here: https://huggingface.co/spaces/T145/open-llm-leaderboard-results-to-modelcard/discussions"""

# Keys are named after the backend keys
# https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/blob/main/backend/README.md#leaderboard
KEY_IFEVAL = "IFEval"
KEY_BBH = "BBH"
KEY_MATH = "MATH Lvl 5"
KEY_GPQA = "GPQA"
KEY_MUSR = "MUSR"
KEY_MMLU = "MMLU-PRO"


def normalize_within_range(value, lower_bound=0, higher_bound=1):
    """Rescale `value` from [lower_bound, higher_bound] to 0-100, clipping below-baseline scores to 0."""
    return (np.clip(value - lower_bound, 0, None)) / (higher_bound - lower_bound) * 100


def calculate_results(repo: str, pool: urllib3.PoolManager):
    """Fetch the latest leaderboard result file for `repo` and compute its normalized benchmark scores."""
    try:
        base_url = f"https://huggingface.co/datasets/open-llm-leaderboard/results/tree/main/{repo}"
        html = pool.request("GET", base_url).data
        soup = BeautifulSoup(html, "html.parser")
        dl_link = soup.find_all(title="Download file")[-1]["href"]
        data = pool.request("GET", f"https://huggingface.co{dl_link}").json()

        del base_url, html, soup, dl_link

        precision = data["config"]["model_dtype"]
        revision = data["config"]["model_revision"]

        # Normalize BBH subtask scores
        bbh_scores = []

        for subtask_key in data["group_subtasks"]["leaderboard_bbh"]:
            num_choices = len(data["configs"][subtask_key]["doc_to_choice"])

            if subtask_key in data["results"]:
                bbh_raw_score = data["results"][subtask_key]["acc_norm,none"]
                lower_bound = 1 / num_choices
                normalized_score = normalize_within_range(bbh_raw_score, lower_bound, 1.0)
                bbh_scores.append(normalized_score)

        # Average BBH score
        bbh_score = sum(bbh_scores) / len(bbh_scores)
        bbh_score = float(round(bbh_score, 2))

        # Calculate the MATH score
        math_raw_score = data["results"]["leaderboard_math_hard"]["exact_match,none"]
        math_score = normalize_within_range(math_raw_score, 0, 1.0)
        math_score = float(round(math_score, 2))

        # Normalize GPQA scores
        gpqa_raw_score = data["results"]["leaderboard_gpqa"]["acc_norm,none"]
        gpqa_score = normalize_within_range(gpqa_raw_score, 0.25, 1.0)
        gpqa_score = float(round(gpqa_score, 2))

        # Normalize MMLU-PRO scores
        mmlu_raw_score = data["results"]["leaderboard_mmlu_pro"]["acc,none"]
        mmlu_score = normalize_within_range(mmlu_raw_score, 0.1, 1.0)
        mmlu_score = float(round(mmlu_score, 2))

        # Compute IFEval
        ifeval_inst_score = (
            data["results"]["leaderboard_ifeval"]["inst_level_strict_acc,none"] * 100
        )
        ifeval_prompt_score = (
            data["results"]["leaderboard_ifeval"]["prompt_level_strict_acc,none"] * 100
        )

        # Average IFEval scores
        ifeval_score = (ifeval_inst_score + ifeval_prompt_score) / 2
        ifeval_score = float(round(ifeval_score, 2))

        # Normalize MUSR scores
        musr_scores = []

        for subtask_key in data["group_subtasks"]["leaderboard_musr"]:
            subtask_config = data["configs"][subtask_key]
            dataset = load_dataset(subtask_config["dataset_path"], split=subtask_config["test_split"])
            # `choices` is parsed with eval() to count the answer options per question
            num_choices = max(len(eval(question["choices"])) for question in dataset)
            musr_raw_score = data["results"][subtask_key]["acc_norm,none"]
            lower_bound = 1 / num_choices
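            # Map the random-guess baseline (1 / num_choices) to 0 and a perfect
            # score to 100 before averaging the MUSR subtasks.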
            normalized_score = normalize_within_range(musr_raw_score, lower_bound, 1.0)
            musr_scores.append(normalized_score)
            del dataset

        musr_score = sum(musr_scores) / len(musr_scores)
        musr_score = float(round(musr_score, 2))

        # Calculate the overall score
        average_score = (
            bbh_score + math_score + gpqa_score + mmlu_score + musr_score + ifeval_score
        ) / 6
        average_score = float(round(average_score, 2))

        results = {
            "Model": repo,
            "Precision": precision,
            "Revision": revision,
            "Average": average_score,
            KEY_IFEVAL: ifeval_score,
            KEY_BBH: bbh_score,
            KEY_MATH: math_score,
            KEY_GPQA: gpqa_score,
            KEY_MUSR: musr_score,
            KEY_MMLU: mmlu_score,
        }

        # pprint(results, sort_dicts=False)
        return results
    except Exception:  # most likely no results are available for this repo
        return None


def get_details_url(repo: str):
    """Link to the per-model details dataset on the Hub."""
    author, model = repo.split("/")
    return f"https://huggingface.co/datasets/open-llm-leaderboard/{author}__{model}-details"


def get_contents_url(repo: str):
    """Link to the summarized results for `repo` in the leaderboard contents dataset viewer."""
    param = quote(repo, safe="")
    return f"https://huggingface.co/datasets/open-llm-leaderboard/contents/viewer/default/train?q={param}&sort[column]=Average%20%E2%AC%86%EF%B8%8F&sort[direction]=desc"


def get_query_url(repo: str):
    """Link to the leaderboard UI pre-filtered to `repo`."""
    param = quote(repo, safe="")
    return f"https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search={param}"


def get_task_summary(results):
    """Map each benchmark to the dataset/metric metadata expected by `EvalResult`."""
    return {
        KEY_IFEVAL: {
            "dataset_type": "wis-k/instruction-following-eval",
            "dataset_name": "IFEval (0-Shot)",
            "metric_type": "inst_level_strict_acc and prompt_level_strict_acc",
            "metric_value": results[KEY_IFEVAL],
            "dataset_config": None,
            "dataset_split": "train",
            "dataset_args": {"num_few_shot": 0},
            "metric_name": "averaged accuracy",
        },
        KEY_BBH: {
            "dataset_type": "SaylorTwift/bbh",
            "dataset_name": "BBH (3-Shot)",
            "metric_type": "acc_norm",
            "metric_value": results[KEY_BBH],
            "dataset_config": None,
            "dataset_split": "test",
            "dataset_args": {"num_few_shot": 3},
            "metric_name": "normalized accuracy",
        },
        KEY_MATH: {
            "dataset_type": "lighteval/MATH-Hard",
            "dataset_name": "MATH Lvl 5 (4-Shot)",
            "metric_type": "exact_match",
            "metric_value": results[KEY_MATH],
            "dataset_config": None,
            "dataset_split": "test",
            "dataset_args": {"num_few_shot": 4},
            "metric_name": "exact match",
        },
        KEY_GPQA: {
            "dataset_type": "Idavidrein/gpqa",
            "dataset_name": "GPQA (0-shot)",
            "metric_type": "acc_norm",
            "metric_value": results[KEY_GPQA],
            "dataset_config": None,
            "dataset_split": "train",
            "dataset_args": {"num_few_shot": 0},
            "metric_name": "acc_norm",
        },
        KEY_MUSR: {
            "dataset_type": "TAUR-Lab/MuSR",
            "dataset_name": "MuSR (0-shot)",
            "metric_type": "acc_norm",
            "metric_value": results[KEY_MUSR],
            "dataset_config": None,
            "dataset_split": None,  # three test splits
            "dataset_args": {"num_few_shot": 0},
            "metric_name": "acc_norm",
        },
        KEY_MMLU: {
            "dataset_type": "TIGER-Lab/MMLU-Pro",
            "dataset_name": "MMLU-PRO (5-shot)",
            "metric_type": "acc",
            "metric_value": results[KEY_MMLU],
            "dataset_config": "main",
            "dataset_split": "test",
            "dataset_args": {"num_few_shot": 5},
            "metric_name": "accuracy",
        },
    }


def get_eval_results(repo: str, results: dict):
    """Render the markdown results section that is appended to the model card."""
    task_summary = get_task_summary(results)
    table = MarkdownTableWriter()
    table.headers = ["Metric", "Value (%)"]
    table.value_matrix = [["**Average**", results["Average"]]] + [
        [v["dataset_name"], v["metric_value"]] for v in task_summary.values()
    ]

    text = f"""
# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
Detailed results can be found [here]({get_details_url(repo)})!
Summarized results can be found [here]({get_contents_url(repo)})!
{table.dumps()}
"""

    return text


def get_edited_yaml_readme(repo: str, results: dict, token: str | None):
    """Load the model card for `repo` and merge the new eval results into its metadata."""
    card = ModelCard.load(repo, token=token)
    common = {
        "task_type": "text-generation",
        "task_name": "Text Generation",
        "source_name": "Open LLM Leaderboard",
        "source_url": get_query_url(repo),
    }

    tasks_results = get_task_summary(results)

    if not card.data["eval_results"]:  # no results reported yet, so initialize the metadata
        card.data["model-index"] = eval_results_to_model_index(
            repo.split("/")[1],
            [EvalResult(**task, **common) for task in tasks_results.values()],
        )
    else:  # append the new evaluations, skipping any that are already reported
        for task in tasks_results.values():
            cur_result = EvalResult(**task, **common)

            if any(
                result.is_equal_except_value(cur_result)
                for result in card.data["eval_results"]
            ):
                continue

            card.data["eval_results"].append(cur_result)

    return str(card)


def commit(
    repo,
    pr_number=None,  # specify a PR number to edit an existing PR instead of opening a new one
    message="Adding Evaluation Results",
    oauth_token: gr.OAuthToken | None = None,
):
    """Open (or update) a pull request on `repo` that adds the leaderboard results to its README."""
    if not oauth_token:
        raise gr.Error("You are not logged in. Click on 'Sign in with Hugging Face' to log in.")

    token = oauth_token.token

    if repo.startswith("https://huggingface.co/"):
        try:
            repo = RepoUrl(repo).repo_id
        except Exception as e:
            raise gr.Error(f"Not a valid repo id: {repo}") from e

    with urllib3.PoolManager() as pool:
        results = calculate_results(repo, pool)

    if results is None:
        return "No leaderboard results found for this model"

    edited = {"revision": f"refs/pr/{pr_number}"} if pr_number else {"create_pr": True}

    try:
        try:  # check if there is a readme already
            readme_text = get_edited_yaml_readme(
                repo, results, token=token
            ) + get_eval_results(repo, results)
        except Exception as e:
            if "Repo card metadata block was not found." in str(e):  # there is no readme yet
                readme_text = get_edited_yaml_readme(repo, results, token=token)
            else:
                print(f"Something went wrong: {e}")
                raise

        ops = [
            CommitOperationAdd(
                path_in_repo="README.md", path_or_fileobj=readme_text.encode()
            )
        ]

        pr_url = create_commit(
            repo_id=repo,
            token=token,
            operations=ops,
            commit_message=message,
            commit_description=COMMIT_DESCRIPTION,
            repo_type="model",
            **edited,
        ).pr_url

        return pr_url
    except Exception as e:
        if "Discussions are disabled for this repo" in str(e):
            return "Discussions disabled"
        elif "Cannot access gated repo" in str(e):
            return "Gated repo"
        elif "Repository Not Found" in str(e):
            return "Repository Not Found"
        else:
            return e
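

# ---------------------------------------------------------------------------
# Minimal UI wiring (sketch): this excerpt only defines the `commit` handler,
# so the Blocks layout below is an assumption about how the Space could expose
# it, not its actual interface. Component names (`repo_box`, `pr_box`,
# `submit`, `output`) are illustrative. Gradio injects the `gr.OAuthToken`
# argument automatically when the user signs in via `gr.LoginButton`, so only
# the repo and PR number are passed as inputs.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.LoginButton()  # enables "Sign in with Hugging Face" OAuth
    repo_box = gr.Textbox(label="Model repo (user/model or full URL)")
    pr_box = gr.Number(label="Existing PR number (optional)", value=None, precision=0)
    submit = gr.Button("Add evaluation results")
    output = gr.Textbox(label="Result")
    submit.click(fn=commit, inputs=[repo_box, pr_box], outputs=output)

demo.launch()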