pminervini committed on
Commit
d9f893d
1 Parent(s): c8ae03b
Files changed (3)
  1. backend-cli.py +0 -2
  2. beta-cli.py +55 -5
  3. src/leaderboard/read_evals.py +3 -8
backend-cli.py CHANGED
@@ -105,7 +105,6 @@ def process_finished_requests() -> bool:
 
     for eval_request in eval_requests:
         result_name: str = request_to_result_name(eval_request)
-        print(result_name, result_name in result_name_to_result)
 
         # Check the corresponding result
         eval_result: EvalResult = result_name_to_result[result_name]
@@ -115,7 +114,6 @@ def process_finished_requests() -> bool:
             task_name = task.benchmark
 
             if task_name not in eval_result.results:
-                print(task_name)
                 results = process_evaluation(task, eval_request)
                 return True
 
beta-cli.py CHANGED
@@ -4,13 +4,63 @@ from huggingface_hub import snapshot_download
 from src.leaderboard.read_evals import get_raw_eval_results
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, RESULTS_REPO
 
+from src.backend.run_eval_suite import run_evaluation
+from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
+from src.backend.sort_queue import sort_models_by_priority
+from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT, Task
+
+from src.leaderboard.read_evals import get_raw_eval_results
+
+from src.backend.manage_requests import EvalRequest
+from src.leaderboard.read_evals import EvalResult
+
 snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
 snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
 
-raw_data = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
+PENDING_STATUS = "PENDING"
+RUNNING_STATUS = "RUNNING"
+FINISHED_STATUS = "FINISHED"
+FAILED_STATUS = "FAILED"
+
+TASKS_HARNESS = [task.value for task in Tasks]
+
+current_finished_status = [FINISHED_STATUS]
+
+
+def request_to_result_name(request: EvalRequest) -> str:
+    org_and_model = request.model.split("/", 1)
+    if len(org_and_model) == 1:
+        model = org_and_model[0]
+        res = f"{model}_{request.precision}"
+    else:
+        org = org_and_model[0]
+        model = org_and_model[1]
+        res = f"{org}_{model}_{request.precision}"
+    return res
+
+
+# Get all eval requests that are FINISHED; if you want to run other evals, change this parameter
+eval_requests: list[EvalRequest] = get_eval_requests(job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
+# Sort the evals by priority (first submitted, first run)
+eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
+
+eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND)
+
+result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
+result_name_to_result = {r.eval_name: r for r in eval_results}
+
+print('Requests', sorted(result_name_to_request.keys()))
+print('Results', sorted(result_name_to_result.keys()))
+
+for eval_request in eval_requests:
+    result_name: str = request_to_result_name(eval_request)
+
+    # Check the corresponding result
+    eval_result: EvalResult = result_name_to_result[result_name]
 
-for entry in raw_data:
-    # if '125m' in entry.eval_name:
-    print(entry)
+    # Iterate over tasks and, if we do not have results for a task, run the relevant evaluations
+    for task in TASKS_HARNESS:
+        task_name = task.benchmark
 
-# print(raw_data)
+        if task_name not in eval_result.results:
+            print('RUN THIS ONE!', result_name, task_name)
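
The added `request_to_result_name` helper is what ties the request queue to the result files: the string it builds has to match `EvalResult.eval_name` exactly, or the lookup in `result_name_to_result` fails. A minimal, self-contained sketch of that naming scheme follows; the `FakeRequest` dataclass and the sample model names are hypothetical stand-ins, not part of the repository.

from dataclasses import dataclass

# Hypothetical stand-in for src.backend.manage_requests.EvalRequest,
# reduced to the two fields the naming scheme depends on.
@dataclass
class FakeRequest:
    model: str       # e.g. "org/model" as submitted to the queue
    precision: str   # e.g. "float16"

def request_to_result_name(request: FakeRequest) -> str:
    # Same logic as the helper added in beta-cli.py:
    # "<org>_<model>_<precision>", or "<model>_<precision>" when no org is given.
    org_and_model = request.model.split("/", 1)
    if len(org_and_model) == 1:
        return f"{org_and_model[0]}_{request.precision}"
    return f"{org_and_model[0]}_{org_and_model[1]}_{request.precision}"

print(request_to_result_name(FakeRequest("EleutherAI/gpt-neo-125m", "float16")))
# EleutherAI_gpt-neo-125m_float16 -- must equal the eval_name of the matching EvalResult
print(request_to_result_name(FakeRequest("gpt2", "float16")))
# gpt2_float16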
src/leaderboard/read_evals.py CHANGED
@@ -144,7 +144,8 @@ class EvalResult:
         }
 
         for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+            if task.value.benchmark in self.results:  # XXX
+                data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
         return data_dict
 
@@ -209,12 +210,6 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
 
     results = []
    for v in eval_results.values():
-        try:
-            v.to_dict()  # we test if the dict version is complete
-            results.append(v)
-        except KeyError:  # not all eval values present
-            continue
-
-    # print('XXX', results_path, requests_path, results)
+        results.append(v)
 
     return results
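
The two read_evals.py changes work together: get_raw_eval_results no longer drops entries whose to_dict() raises KeyError, and to_dict() instead skips benchmarks that are missing from self.results. A minimal sketch of the difference, using made-up benchmark names and scores rather than real leaderboard data:

# Hypothetical partial results: only two of three benchmarks have finished.
results = {"nq_open": 0.254, "triviaqa": 0.612}
benchmarks = ["nq_open", "triviaqa", "truthfulqa_gen"]

data_dict = {}
for benchmark in benchmarks:
    # Old behaviour: data_dict[benchmark] = results[benchmark] raises KeyError
    # on "truthfulqa_gen", so the whole model entry used to be skipped.
    # New behaviour: keep the entry and omit only the missing column.
    if benchmark in results:
        data_dict[benchmark] = results[benchmark]

print(data_dict)  # {'nq_open': 0.254, 'triviaqa': 0.612}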