Commit 894c4b4 (committed by pminervini)
Parent(s): e504efd

update

Files changed:
- backend-cli.py +80 -0
- src/backend/envs.py +33 -0
- src/backend/manage_requests.py +126 -0
- src/backend/run_eval_suite.py +36 -0
- src/backend/sort_queue.py +28 -0
backend-cli.py
ADDED
@@ -0,0 +1,80 @@
import os
import json

from datetime import datetime

from huggingface_hub import snapshot_download

from src.backend.run_eval_suite import run_evaluation
from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
from src.backend.sort_queue import sort_models_by_priority
from src.backend.envs import Tasks, NUM_FEWSHOT, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT

from src.envs import QUEUE_REPO, RESULTS_REPO, API

import logging
import pprint

TASKS_HARNESS = [task.value.benchmark for task in Tasks]

logging.getLogger("openai").setLevel(logging.WARNING)

logging.basicConfig(level=logging.ERROR)
pp = pprint.PrettyPrinter(width=80)

PENDING_STATUS = "PENDING"
RUNNING_STATUS = "RUNNING"
FINISHED_STATUS = "FINISHED"
FAILED_STATUS = "FAILED"

snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)


def run_auto_eval():
    current_pending_status = [PENDING_STATUS]

    # Pull the eval requests dataset from the Hub and parse any eval requests;
    # check completed evals and set them to FINISHED.
    check_completed_evals(api=API, checked_status=RUNNING_STATUS, completed_status=FINISHED_STATUS,
                          failed_status=FAILED_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND,
                          hf_repo_results=RESULTS_REPO, local_dir_results=EVAL_RESULTS_PATH_BACKEND)

    # Get all eval requests that are PENDING; change this parameter to run other evals.
    eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
    # Sort the evals by priority (first submitted, first run)
    eval_requests = sort_models_by_priority(api=API, models=eval_requests)

    print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")

    if len(eval_requests) == 0:
        return

    eval_request = eval_requests[0]
    pp.pprint(eval_request)

    set_eval_request(api=API, eval_request=eval_request, set_to_status=RUNNING_STATUS, hf_repo=QUEUE_REPO,
                     local_dir=EVAL_REQUESTS_PATH_BACKEND)

    results = run_evaluation(eval_request=eval_request, task_names=TASKS_HARNESS, num_fewshot=NUM_FEWSHOT,
                             batch_size=1, device=DEVICE, no_cache=True, limit=LIMIT)

    dumped = json.dumps(results, indent=2)
    print(dumped)

    output_path = os.path.join(EVAL_RESULTS_PATH_BACKEND, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w") as f:
        f.write(dumped)

    API.upload_file(path_or_fileobj=output_path, path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
                    repo_id=RESULTS_REPO, repo_type="dataset")

    set_eval_request(api=API, eval_request=eval_request, set_to_status=FINISHED_STATUS, hf_repo=QUEUE_REPO,
                     local_dir=EVAL_REQUESTS_PATH_BACKEND)


if __name__ == "__main__":
    run_auto_eval()
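Note: backend-cli.py handles at most one PENDING request per invocation, so it has to be re-run periodically to drain the queue. A minimal scheduling sketch, not part of this commit (the poll interval is an arbitrary assumption):

import subprocess
import time

while True:
    # one queue-processing pass: backend-cli.py picks up at most one PENDING request
    subprocess.run(["python", "backend-cli.py"], check=False)
    time.sleep(600)  # hypothetical 10-minute poll interval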
src/backend/envs.py
ADDED
@@ -0,0 +1,33 @@
import os

import torch

from dataclasses import dataclass
from enum import Enum

from src.envs import CACHE_PATH


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    # task0 = Task("anli_r1", "acc", "ANLI")
    # task1 = Task("logiqa", "acc_norm", "LogiQA")
    task0 = Task("nq_open", "em", "NQ Open")
    task1 = Task("triviaqa", "em", "TriviaQA")


NUM_FEWSHOT = 64  # Change to match your few-shot setting

EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

LIMIT = 32  # For testing only; set to None for full evaluation runs
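Each Tasks member wraps a Task record; downstream code reads the harness task name from task.value.benchmark, which is exactly how TASKS_HARNESS is built in backend-cli.py above. A minimal sketch of consuming the enum:

from src.backend.envs import Tasks

# collect the lm-eval-harness task names defined for this leaderboard
benchmarks = [task.value.benchmark for task in Tasks]
print(benchmarks)  # -> ['nq_open', 'triviaqa']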
src/backend/manage_requests.py
ADDED
@@ -0,0 +1,126 @@
import glob
import json
from dataclasses import dataclass
from typing import Optional

from huggingface_hub import HfApi, snapshot_download


@dataclass
class EvalRequest:
    model: str
    private: bool
    status: str
    json_filepath: str
    weight_type: str = "Original"
    model_type: Optional[str] = None  # pretrained, finetuned, with RL
    precision: str = ""  # float16, bfloat16
    base_model: Optional[str] = None  # for adapter models
    revision: str = "main"  # commit hash
    submitted_time: Optional[str] = "2022-05-18T11:40:22.519222"  # fallback date so requests can still be ordered by date
    likes: Optional[int] = 0
    params: Optional[int] = None
    license: Optional[str] = ""

    def get_model_args(self):
        model_args = f"pretrained={self.model},revision={self.revision}"

        if self.precision in ["float16", "float32", "bfloat16"]:
            model_args += f",dtype={self.precision}"
        # Quantized models need some added config and the bitsandbytes install, e.g.:
        # elif self.precision == "8bit":
        #     model_args += ",load_in_8bit=True"
        # elif self.precision == "4bit":
        #     model_args += ",load_in_4bit=True"
        # elif self.precision == "GPTQ":
        #     # A GPTQ model does not need dtype to be specified;
        #     # it will be inferred from the config.
        #     pass
        else:
            raise Exception(f"Unknown precision {self.precision}.")

        return model_args


def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str):
    """Updates a given eval request with its new status on the hub (running, completed, failed, ...)"""
    json_filepath = eval_request.json_filepath

    with open(json_filepath) as fp:
        data = json.load(fp)

    data["status"] = set_to_status

    with open(json_filepath, "w") as f:
        f.write(json.dumps(data))

    api.upload_file(
        path_or_fileobj=json_filepath,
        path_in_repo=json_filepath.replace(local_dir, ""),
        repo_id=hf_repo,
        repo_type="dataset",
    )


def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
    """Gets all evaluation requests whose status is in `job_status` from the requests repo.

    Returns:
        `list[EvalRequest]`: the matching eval requests.
    """
    snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60)
    json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)

    eval_requests = []
    for json_filepath in json_files:
        with open(json_filepath) as fp:
            data = json.load(fp)
        if data["status"] in job_status:
            data["json_filepath"] = json_filepath

            del data["job_id"]

            eval_request = EvalRequest(**data)
            eval_requests.append(eval_request)

    return eval_requests


def check_completed_evals(
    api: HfApi,
    hf_repo: str,
    local_dir: str,
    checked_status: str,
    completed_status: str,
    failed_status: str,
    hf_repo_results: str,
    local_dir_results: str,
):
    """Checks if the currently running evals are completed; if yes, updates their status on the hub."""
    snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results, repo_type="dataset", max_workers=60)

    # get_eval_requests expects a list of statuses, so wrap the single status
    running_evals = get_eval_requests(job_status=[checked_status], hf_repo=hf_repo, local_dir=local_dir)

    for eval_request in running_evals:
        model = eval_request.model
        print("====================================")
        print(f"Checking {model}")

        output_path = model
        output_file = f"{local_dir_results}/{output_path}/results*.json"
        output_file_exists = len(glob.glob(output_file)) > 0

        if output_file_exists:
            print(f"Result file exists for {model}, setting it to {completed_status}")
            set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
        else:
            print(f"No result file found for {model}, setting it to {failed_status}")
            set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
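For reference, a request parsed by get_eval_requests ends up as an EvalRequest whose get_model_args() string is handed to the evaluation harness. A small sketch with illustrative values (the model repo id and file path are hypothetical):

from src.backend.manage_requests import EvalRequest

request = EvalRequest(
    model="some-org/some-model",                           # hypothetical model repo id
    private=False,
    status="PENDING",
    json_filepath="eval-queue-bk/some-org/request.json",   # hypothetical local path
    precision="float16",
    revision="main",
)
print(request.get_model_args())
# -> pretrained=some-org/some-model,revision=main,dtype=float16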
src/backend/run_eval_suite.py
ADDED
@@ -0,0 +1,36 @@
from lm_eval import tasks, evaluator, utils
from src.backend.manage_requests import EvalRequest

import logging

logging.getLogger("openai").setLevel(logging.WARNING)


def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, no_cache=True, limit=None):
    if limit:
        print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")

    task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)

    print(f"Selected Tasks: {task_names}")

    results = evaluator.simple_evaluate(
        model="hf-causal-experimental",  # "hf-causal"
        model_args=eval_request.get_model_args(),
        tasks=task_names,
        num_fewshot=num_fewshot,
        batch_size=batch_size,
        device=device,
        no_cache=no_cache,
        limit=limit,
        write_out=True,
        output_base_path="logs",
    )

    results["config"]["model_dtype"] = eval_request.precision
    results["config"]["model_name"] = eval_request.model
    results["config"]["model_sha"] = eval_request.revision

    print(evaluator.make_table(results))

    return results
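A minimal driver sketch, mirroring what backend-cli.py does for a single request (the model id is hypothetical; NUM_FEWSHOT, DEVICE, and LIMIT come from src.backend.envs):

from src.backend.envs import Tasks, NUM_FEWSHOT, DEVICE, LIMIT
from src.backend.manage_requests import EvalRequest
from src.backend.run_eval_suite import run_evaluation

request = EvalRequest(model="some-org/some-model", private=False, status="RUNNING",
                      json_filepath="", precision="float16")  # hypothetical request
task_names = [task.value.benchmark for task in Tasks]
results = run_evaluation(eval_request=request, task_names=task_names, num_fewshot=NUM_FEWSHOT,
                         batch_size=1, device=DEVICE, no_cache=True, limit=LIMIT)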
src/backend/sort_queue.py
ADDED
@@ -0,0 +1,28 @@
from dataclasses import dataclass
from huggingface_hub import HfApi
from src.backend.manage_requests import EvalRequest


@dataclass
class ModelMetadata:
    likes: int = 0
    size: int = 15


def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
    private_models = [model for model in models if model.private]
    public_models = [model for model in models if not model.private]

    return sort_by_submit_date(private_models) + sort_by_submit_date(public_models)


def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    return sorted(eval_requests, key=lambda x: x.submitted_time, reverse=False)


def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    return sorted(eval_requests, key=lambda x: x.params, reverse=False)


def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)