pminervini committed · commit 6c79b12 (parent: 84fb473)

update

Files changed:
- backend-cli.py (+18, -12)
- src/backend/envs.py (+5, -4)
backend-cli.py CHANGED

@@ -8,14 +8,14 @@ from huggingface_hub import snapshot_download
 from src.backend.run_eval_suite import run_evaluation
 from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
 from src.backend.sort_queue import sort_models_by_priority
-from src.backend.envs import Tasks,
+from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT
 
 from src.envs import QUEUE_REPO, RESULTS_REPO, API
 
 import logging
 import pprint
 
-TASKS_HARNESS = [task.value.benchmark for task in Tasks]
+# TASKS_HARNESS = [task.value.benchmark for task in Tasks]
 
 logging.getLogger("openai").setLevel(logging.WARNING)
 
@@ -56,19 +56,25 @@ def run_auto_eval():
     set_eval_request(api=API, eval_request=eval_request, set_to_status=RUNNING_STATUS, hf_repo=QUEUE_REPO,
                      local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
-    results = run_evaluation(eval_request=eval_request, task_names=TASKS_HARNESS, num_fewshot=NUM_FEWSHOT,
-                             batch_size=1, device=DEVICE, no_cache=True, limit=LIMIT)
+    # results = run_evaluation(eval_request=eval_request, task_names=TASKS_HARNESS, num_fewshot=NUM_FEWSHOT,
+    #                          batch_size=1, device=DEVICE, no_cache=True, limit=LIMIT)
 
-    dumped = json.dumps(results, indent=2)
-    print(dumped)
+    TASKS_HARNESS = [task.value for task in Tasks]
 
-    output_path = os.path.join(EVAL_RESULTS_PATH_BACKEND, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
-    os.makedirs(os.path.dirname(output_path), exist_ok=True)
-    with open(output_path, "w") as f:
-        f.write(dumped)
+    for task in TASKS_HARNESS:
+        results = run_evaluation(eval_request=eval_request, task_names=[task.benchmark], num_fewshot=task.num_fewshot,
+                                 batch_size=1, device=DEVICE, no_cache=True, limit=LIMIT)
 
-    API.upload_file(path_or_fileobj=output_path, path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
-                    repo_id=RESULTS_REPO, repo_type="dataset")
+        dumped = json.dumps(results, indent=2)
+        print(dumped)
+
+        output_path = os.path.join(EVAL_RESULTS_PATH_BACKEND, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
+        os.makedirs(os.path.dirname(output_path), exist_ok=True)
+        with open(output_path, "w") as f:
+            f.write(dumped)
+
+        API.upload_file(path_or_fileobj=output_path, path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
+                        repo_id=RESULTS_REPO, repo_type="dataset")
 
     set_eval_request(api=API, eval_request=eval_request, set_to_status=FINISHED_STATUS, hf_repo=QUEUE_REPO,
                      local_dir=EVAL_REQUESTS_PATH_BACKEND)
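
Note on the new per-task loop: the sketch below is a hedged, self-contained illustration of the same save-then-upload pattern, with run_evaluation stubbed out and EVAL_RESULTS_PATH_BACKEND replaced by a local placeholder string (neither is imported from this repo here). It also captures the timestamp once, so the local file name and the name later used for the upload stay identical, whereas the loop above derives them from two separate datetime.now() calls.

import json
import os
from datetime import datetime

# Stand-in for the cache path; in backend-cli.py this comes from src.backend.envs.
EVAL_RESULTS_PATH_BACKEND = "./eval-results-bk"


def run_evaluation_stub(task_name: str, num_fewshot: int) -> dict:
    # Placeholder for run_evaluation(); returns a dummy result dict.
    return {"task": task_name, "num_fewshot": num_fewshot, "em": 0.0}


def evaluate_and_save(model: str, task_name: str, num_fewshot: int) -> str:
    results = run_evaluation_stub(task_name, num_fewshot)
    dumped = json.dumps(results, indent=2)

    # Capture the timestamp once so the local file name and the name that
    # would be passed as path_in_repo to API.upload_file match exactly.
    timestamp = datetime.now()
    output_path = os.path.join(EVAL_RESULTS_PATH_BACKEND, *model.split("/"),
                               f"results_{timestamp}.json")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w") as f:
        f.write(dumped)
    return output_path


if __name__ == "__main__":
    print(evaluate_and_save("org/model", "nq_open", 64))
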
src/backend/envs.py CHANGED

@@ -13,21 +13,22 @@ class Task:
     benchmark: str
     metric: str
     col_name: str
+    num_fewshot: int
 
 
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     # task0 = Task("anli_r1", "acc", "ANLI")
     # task1 = Task("logiqa", "acc_norm", "LogiQA")
-    task0 = Task("nq_open", "em", "NQ Open")
-    task1 = Task("triviaqa", "em", "TriviaQA")
+    task0 = Task("nq_open", "em", "NQ Open", 64)
+    task1 = Task("triviaqa", "em", "TriviaQA", 64)
 
 
-NUM_FEWSHOT = 64 # Change with your few shot
+# NUM_FEWSHOT = 64 # Change with your few shot
 
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
 DEVICE = "cuda:0" if torch.cuda.is_available() else 'cpu'
 
-LIMIT =
+LIMIT = None # Testing; needs to be None
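
The num_fewshot field added to the Task dataclass is what backend-cli.py reads as task.num_fewshot after unwrapping each enum member with task.value. A minimal self-contained sketch of that pattern (standalone names, not imports from this repo):

from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
    num_fewshot: int


class Tasks(Enum):
    # benchmark key, metric key, display name, per-task few-shot count
    task0 = Task("nq_open", "em", "NQ Open", 64)
    task1 = Task("triviaqa", "em", "TriviaQA", 64)


# Unwrap the enum members the same way the backend loop does.
TASKS_HARNESS = [task.value for task in Tasks]
for task in TASKS_HARNESS:
    print(f"{task.benchmark}: num_fewshot={task.num_fewshot}")
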