Spaces:

hallucinations-leaderboard
/

leaderboard

Running on CPU Upgrade

App Files Community

pminervini committed on Feb 8

Commit

13218df

•

1 Parent(s): e034fec

update

Browse files

Files changed (1) hide show

backend-cli.py +17 -12

backend-cli.py CHANGED Viewed

@@ -122,7 +122,7 @@ def process_evaluation(task: Task, eval_request: EvalRequest) -> dict:
     return results
-def process_finished_requests(thr: int) -> bool:
     sanity_checks()
     current_finished_status = [FINISHED_STATUS, FAILED_STATUS]
@@ -155,7 +155,11 @@ def process_finished_requests(thr: int) -> bool:
             for task in task_lst:
                 task_name = task.benchmark
-                if eval_result is None or task_name not in eval_result.results:
                     eval_request: EvalRequest = result_name_to_request[result_name]
                     my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
@@ -171,7 +175,7 @@ def process_finished_requests(thr: int) -> bool:
     return False
-def maybe_refresh_results(thr: int) -> bool:
     sanity_checks()
     current_finished_status = [PENDING_STATUS, FINISHED_STATUS, FAILED_STATUS]
@@ -195,8 +199,6 @@ def maybe_refresh_results(thr: int) -> bool:
             # Check the corresponding result
             eval_result: Optional[EvalResult] = result_name_to_result[result_name] if result_name in result_name_to_result else None
-            # breakpoint()
             task_lst = TASKS_HARNESS.copy()
             random.shuffle(task_lst)
@@ -204,11 +206,12 @@ def maybe_refresh_results(thr: int) -> bool:
             for task in task_lst:
                 task_name = task.benchmark
-                # task_lst = ['nq', 'trivia', 'tqa', 'self', 'xsum', 'cnn', 'memo']
                 task_lst = ['nq', 'trivia', 'tqa', 'self']
-                if (eval_result is None or
-                        task_name not in eval_result.results or
                         any(ss in task_name for ss in task_lst)):
                     eval_request: EvalRequest = result_name_to_request[result_name]
@@ -262,9 +265,11 @@ def process_pending_requests() -> bool:
 if __name__ == "__main__":
     wait = True
     if socket.gethostname() in {'hamburg', 'neuromancer'} or os.path.isdir("/home/pminervi"):
         wait = False
     if wait:
         time.sleep(60 * random.randint(5, 10))
@@ -277,14 +282,14 @@ if __name__ == "__main__":
     if res is False:
         if random.randint(0, 1) == 0:
-            res = maybe_refresh_results(100)
         else:
-            res = process_finished_requests(100)
     time.sleep(60)
     if res is False:
         if random.randint(0, 1) == 0:
-            res = maybe_refresh_results(0)
         else:
-            res = process_finished_requests(0)

     return results
+def process_finished_requests(thr: int, hard_task_lst: Optional[list[str]] = None) -> bool:
     sanity_checks()
     current_finished_status = [FINISHED_STATUS, FAILED_STATUS]
             for task in task_lst:
                 task_name = task.benchmark
+                do_run_task = False
+                if hard_task_lst is None or any(ss in task_name for ss in hard_task_lst):
+                    do_run_task = True
+                if (eval_result is None or task_name not in eval_result.results) and do_run_task:
                     eval_request: EvalRequest = result_name_to_request[result_name]
                     my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
     return False
+def maybe_refresh_results(thr: int, hard_task_lst: Optional[list[str]] = None) -> bool:
     sanity_checks()
     current_finished_status = [PENDING_STATUS, FINISHED_STATUS, FAILED_STATUS]
             # Check the corresponding result
             eval_result: Optional[EvalResult] = result_name_to_result[result_name] if result_name in result_name_to_result else None
             task_lst = TASKS_HARNESS.copy()
             random.shuffle(task_lst)
             for task in task_lst:
                 task_name = task.benchmark
+                do_run_task = False
+                if hard_task_lst is None or any(ss in task_name for ss in hard_task_lst):
+                    do_run_task = True
                 task_lst = ['nq', 'trivia', 'tqa', 'self']
+                if (eval_result is None or do_run_task or task_name not in eval_result.results or
                         any(ss in task_name for ss in task_lst)):
                     eval_request: EvalRequest = result_name_to_request[result_name]
 if __name__ == "__main__":
     wait = True
+    hard_task_lst = None
     if socket.gethostname() in {'hamburg', 'neuromancer'} or os.path.isdir("/home/pminervi"):
         wait = False
+        hard_task_lst = ['nq', 'trivia', 'tqa']
     if wait:
         time.sleep(60 * random.randint(5, 10))
     if res is False:
         if random.randint(0, 1) == 0:
+            res = maybe_refresh_results(100, hard_task_lst=hard_task_lst)
         else:
+            res = process_finished_requests(100, hard_task_lst=hard_task_lst)
     time.sleep(60)
     if res is False:
         if random.randint(0, 1) == 0:
+            res = maybe_refresh_results(0, hard_task_lst=hard_task_lst)
         else:
+            res = process_finished_requests(0, hard_task_lst=hard_task_lst)