Shaltiel committed on
Commit 0f5c75a · 1 Parent(s): b88b6bf

Added caps management
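This commit adds two monthly quotas to the eval backend: a cap on how many requests may run on each GPU instance type per month, and a cap of 4 submitted models per user per month. Both are tracked in a month-stamped JSON counters file stored in the queue repo. As a rough sketch of the shape that file takes, inferred from the diff below (the keys and values here are purely illustrative):

    {
      "count_g4dn.xlarge": 3,
      "count_g5.2xlarge": 12,
      "some-user": [
        {"model_id": "some-org/some-model", "revision": "main"}
      ]
    }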

main_backend_lighteval.py CHANGED
@@ -6,7 +6,7 @@ from huggingface_hub import snapshot_download
 logging.getLogger("openai").setLevel(logging.WARNING)
 
 from src.backend.run_eval_suite_lighteval import run_evaluation
-from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
+from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request, set_requests_seen
 from src.backend.sort_queue import sort_models_by_priority
 
 from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, API, LIMIT, TOKEN, ACCELERATOR, VENDOR, REGION
@@ -40,7 +40,7 @@ def run_auto_eval():
     )
 
     # Get all eval requests that are PENDING; if you want to run other evals, change this parameter
-    eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
+    eval_requests, requests_seen = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
     # Sort the evals by priority (first submitted, first run)
     eval_requests = sort_models_by_priority(api=API, models=eval_requests)
 
@@ -52,29 +52,68 @@ def run_auto_eval():
     eval_request = eval_requests[0]
     pp.pprint(eval_request)
 
-    set_eval_request(
-        api=API,
-        eval_request=eval_request,
-        set_to_status=RUNNING_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-    )
-
-    # This needs to be done
-    #instance_size, instance_type = get_instance_for_model(eval_request)
     # For GPU
     if not eval_request or eval_request.params < 0:
         raise ValueError("Couldn't detect number of params, please make sure the metadata is available")
     elif eval_request.params < 4:
-        instance_size, instance_type = "small", "g4dn.xlarge"
+        instance_size, instance_type, cap = "small", "g4dn.xlarge", 20
     elif eval_request.params < 9:
-        instance_size, instance_type = "medium", "g5.2xlarge"
+        instance_size, instance_type, cap = "medium", "g5.2xlarge", 35
     elif eval_request.params < 24:
-        instance_size, instance_type = "xxlarge", "g5.12xlarge"
+        instance_size, instance_type, cap = "xxlarge", "g5.12xlarge", 15
     else:
         raise ValueError("Number of params too big, can't run this model")
-    # For CPU
-    # instance_size, instance_type = "medium", "c6i"
+
+    # Enforce the monthly cap for this instance type
+    counter_key = f'count_{instance_type}'
+    if counter_key not in requests_seen:
+        requests_seen[counter_key] = 0
+    if requests_seen[counter_key] >= cap:
+        set_eval_request(
+            api=API,
+            eval_request=eval_request,
+            set_to_status=FAILED_STATUS,
+            hf_repo=QUEUE_REPO,
+            local_dir=EVAL_REQUESTS_PATH_BACKEND,
+        )
+        pp.pprint(dict(message="Reached maximum cap for requests of this instance type this month", counter=counter_key, instance_type=instance_type, cap=cap))
+        return
+
+    # Next, check who made the last commit to this repo and keep track of them. One person
+    # shouldn't submit more than 4 models in one month.
+    commits = API.list_repo_commits(eval_request.model, revision=eval_request.revision)
+    users = commits[0].authors
+    for user in users:
+        if user in requests_seen and len(requests_seen[user]) >= 4:
+            set_eval_request(
+                api=API,
+                eval_request=eval_request,
+                set_to_status=FAILED_STATUS,
+                hf_repo=QUEUE_REPO,
+                local_dir=EVAL_REQUESTS_PATH_BACKEND,
+            )
+            pp.pprint(dict(message="Reached maximum cap for requests for this user this month", counter=counter_key, user=user))
+            return
+        if user not in requests_seen:
+            requests_seen[user] = []
+        requests_seen[user].append(dict(model_id=eval_request.model, revision=eval_request.revision))
+
+    requests_seen[counter_key] += 1
+    set_requests_seen(
+        api=API,
+        requests_seen=requests_seen,
+        hf_repo=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH_BACKEND
+    )
+
+    set_eval_request(
+        api=API,
+        eval_request=eval_request,
+        set_to_status=RUNNING_STATUS,
+        hf_repo=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH_BACKEND,
+    )
+
+
 
     run_evaluation(
         eval_request=eval_request,
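To see the gating logic in isolation, here is a minimal, self-contained sketch of the cap checks added above, with the hub calls stubbed out. check_caps and its arguments are hypothetical names for this illustration; the count_<instance_type> key scheme and the 4-models-per-user limit come from the diff:

    def check_caps(requests_seen: dict, instance_type: str, cap: int,
                   users: list, model_id: str, revision: str) -> bool:
        """Returns True if the request may run, False if a monthly cap was hit."""
        counter_key = f"count_{instance_type}"
        requests_seen.setdefault(counter_key, 0)
        if requests_seen[counter_key] >= cap:
            return False  # instance-type cap reached for this month
        for user in users:
            if len(requests_seen.get(user, [])) >= 4:
                return False  # per-user cap of 4 models per month reached
            requests_seen.setdefault(user, []).append(
                dict(model_id=model_id, revision=revision))
        requests_seen[counter_key] += 1
        return True

    requests_seen = {"count_g5.2xlarge": 34}
    print(check_caps(requests_seen, "g5.2xlarge", 35, ["alice"], "org/m1", "main"))  # True; counter becomes 35
    print(check_caps(requests_seen, "g5.2xlarge", 35, ["alice"], "org/m2", "main"))  # False; instance cap hit

Note that the diff credits every author of the repo's latest commit (commits[0].authors), so a request can be rejected on behalf of any co-author.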
src/backend/manage_requests.py CHANGED
@@ -1,7 +1,9 @@
 import glob
 import json
 from dataclasses import dataclass
+import os
 from typing import Optional
+import datetime
 
 from huggingface_hub import HfApi, snapshot_download
 from src.envs import TOKEN
@@ -42,6 +44,18 @@ class EvalRequest:
 
         return model_args
 
+def set_requests_seen(api: HfApi, requests_seen: dict, hf_repo: str, local_dir: str):
+    """Saves the requests-seen counters for the current month and uploads them to the hub"""
+    json_filepath = get_requests_seen_json_file(local_dir)
+    with open(json_filepath, "w") as f:
+        f.write(json.dumps(requests_seen))
+
+    api.upload_file(
+        path_or_fileobj=json_filepath,
+        path_in_repo=json_filepath.replace(local_dir, ""),
+        repo_id=hf_repo,
+        repo_type="dataset",
+    )
 
 def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str):
     """Updates a given eval request with its new status on the hub (running, completed, failed, ...)"""
@@ -62,7 +76,6 @@ def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str,
         repo_type="dataset",
     )
 
-
 def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
     """Get all pending evaluation requests and return a list in which private
     models appear first, followed by public models sorted by the number of
@@ -83,8 +96,16 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[E
             eval_request = EvalRequest(**data)
             eval_requests.append(eval_request)
 
-    return eval_requests
+    requests_seen_json = get_requests_seen_json_file(local_dir)
+    requests_seen = {}
+    if os.path.isfile(requests_seen_json):
+        with open(requests_seen_json, 'r', encoding='utf8') as r:
+            requests_seen = json.load(r)
+
+    return eval_requests, requests_seen
 
+def get_requests_seen_json_file(local_dir):
+    return f"{local_dir}/counters/{datetime.datetime.now().strftime('%B-%Y')}.json"
 
 def check_completed_evals(
     api: HfApi,
@@ -99,7 +120,7 @@ def check_completed_evals(
    """Checks if the currently running evals are completed, if yes, update their status on the hub."""
     snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results, repo_type="dataset", max_workers=60, token=TOKEN)
 
-    running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
+    running_evals, _ = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
 
     for eval_request in running_evals:
         model = eval_request.model
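A quick round-trip of the counters file, mirroring get_requests_seen_json_file and set_requests_seen above. The local_dir value is made up, and the os.makedirs call is an extra safeguard not present in the diff (the committed code assumes the counters/ directory already exists locally):

    import datetime
    import json
    import os

    def counters_path(local_dir: str) -> str:
        # Month-stamped filename, e.g. "<local_dir>/counters/February-2025.json",
        # so the counters roll over automatically when a new month begins
        return f"{local_dir}/counters/{datetime.datetime.now().strftime('%B-%Y')}.json"

    local_dir = "./eval-queue-bk"  # hypothetical local checkout of the queue repo
    path = counters_path(local_dir)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w") as f:
        json.dump({"count_g4dn.xlarge": 1}, f)
    with open(path, encoding="utf8") as f:
        print(json.load(f))  # {'count_g4dn.xlarge': 1}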