Added caps management

Files changed:
- main_backend_lighteval.py (+56, -17)
- src/backend/manage_requests.py (+24, -3)

main_backend_lighteval.py (CHANGED)

@@ -6,7 +6,7 @@ from huggingface_hub import snapshot_download
 logging.getLogger("openai").setLevel(logging.WARNING)
 
 from src.backend.run_eval_suite_lighteval import run_evaluation
-from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
+from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request, set_requests_seen
 from src.backend.sort_queue import sort_models_by_priority
 
 from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, API, LIMIT, TOKEN, ACCELERATOR, VENDOR, REGION
@@ -40,7 +40,7 @@ def run_auto_eval():
     )
 
     # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-    eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
+    eval_requests, requests_seen = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
     # Sort the evals by priority (first submitted first run)
     eval_requests = sort_models_by_priority(api=API, models=eval_requests)
 
@@ -52,29 +52,68 @@ def run_auto_eval():
     eval_request = eval_requests[0]
     pp.pprint(eval_request)
 
-    set_eval_request(
-        api=API,
-        eval_request=eval_request,
-        set_to_status=RUNNING_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-    )
-
-    # This needs to be done
-    #instance_size, instance_type = get_instance_for_model(eval_request)
     # For GPU
     if not eval_request or eval_request.params < 0:
         raise ValueError("Couldn't detect number of params, please make sure the metadata is available")
     elif eval_request.params < 4:
-        instance_size, instance_type = "small", "g4dn.xlarge"
+        instance_size, instance_type, cap = "small", "g4dn.xlarge", 20
    elif eval_request.params < 9:
-        instance_size, instance_type = "medium", "g5.2xlarge"
+        instance_size, instance_type, cap = "medium", "g5.2xlarge", 35
    elif eval_request.params < 24:
-        instance_size, instance_type = "xxlarge", "g5.12xlarge"
+        instance_size, instance_type, cap = "xxlarge", "g5.12xlarge", 15
     else:
         raise ValueError("Number of params too big, can't run this model")
-
-
+
+    counter_key = f'count_{instance_type}'
+    if counter_key not in requests_seen:
+        requests_seen[counter_key] = 0
+    if requests_seen[counter_key] >= cap:
+        set_eval_request(
+            api=API,
+            eval_request=eval_request,
+            set_to_status=FAILED_STATUS,
+            hf_repo=QUEUE_REPO,
+            local_dir=EVAL_REQUESTS_PATH_BACKEND,
+        )
+        pp.pprint(dict(message="Reached maximum cap for requests of this instance type this month", counter=counter_key, instance_type=instance_type, cap=cap))
+        return
+
+    # Next, check who made the last commit to this repo and keep track of that. One person shouldn't submit
+    # more than 4 models in one month.
+    commits = API.list_repo_commits(eval_request.model, revision=eval_request.revision)
+    users = commits[0].authors
+    for user in users:
+        if user in requests_seen and len(requests_seen[user]) >= 4:
+            set_eval_request(
+                api=API,
+                eval_request=eval_request,
+                set_to_status=FAILED_STATUS,
+                hf_repo=QUEUE_REPO,
+                local_dir=EVAL_REQUESTS_PATH_BACKEND,
+            )
+            pp.pprint(dict(message="Reached maximum cap for requests for this user this month", counter=counter_key, user=user))
+            return
+        if user not in requests_seen:
+            requests_seen[user] = []
+        requests_seen[user].append(dict(model_id=eval_request.model, revision=eval_request.revision))
+
+    requests_seen[counter_key] += 1
+    set_requests_seen(
+        api=API,
+        requests_seen=requests_seen,
+        hf_repo=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH_BACKEND
+    )
+
+    set_eval_request(
+        api=API,
+        eval_request=eval_request,
+        set_to_status=RUNNING_STATUS,
+        hf_repo=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH_BACKEND,
+    )
+
+
 
     run_evaluation(
         eval_request=eval_request,
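
For readers skimming the new control flow, here is a minimal standalone sketch (not part of the commit) of the two monthly caps it introduces. It assumes, as the diff does, that requests_seen maps "count_<instance_type>" keys to integers and usernames to lists of submissions; the hub status updates are dropped, the allow_request helper name is hypothetical, and users stands in for the authors returned by API.list_repo_commits(...)[0].authors.

# Hypothetical sketch of the cap checks; the cap values mirror the diff.
INSTANCE_CAPS = {"g4dn.xlarge": 20, "g5.2xlarge": 35, "g5.12xlarge": 15}
USER_CAP = 4  # at most 4 models per user per month

def allow_request(requests_seen: dict, instance_type: str, users: list, model_id: str, revision: str) -> bool:
    counter_key = f"count_{instance_type}"
    requests_seen.setdefault(counter_key, 0)

    # Cap 1: monthly budget per instance type.
    if requests_seen[counter_key] >= INSTANCE_CAPS[instance_type]:
        return False

    # Cap 2: monthly budget per user (the diff derives `users` from the
    # authors of the model's latest commit).
    for user in users:
        if len(requests_seen.get(user, [])) >= USER_CAP:
            return False
        requests_seen.setdefault(user, []).append({"model_id": model_id, "revision": revision})

    requests_seen[counter_key] += 1
    return True

# Example: a third 7B submission by the same user this month.
seen = {"count_g5.2xlarge": 2, "alice": [{"model_id": "org/m1", "revision": "main"}] * 2}
print(allow_request(seen, "g5.2xlarge", ["alice"], "org/model-7b", "main"))  # True, neither cap reached

As in the diff, a user's submission list is appended to before the evaluation actually runs, so a request counts against the monthly budget even if the run later fails.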

src/backend/manage_requests.py (CHANGED)

@@ -1,7 +1,9 @@
 import glob
 import json
 from dataclasses import dataclass
+import os
 from typing import Optional
+import datetime
 
 from huggingface_hub import HfApi, snapshot_download
 from src.envs import TOKEN
@@ -42,6 +44,18 @@ class EvalRequest:
 
     return model_args
 
+def set_requests_seen(api: HfApi, requests_seen: dict, hf_repo: str, local_dir: str):
+    """Persists the monthly requests-seen counters locally and uploads them to the hub."""
+    json_filepath = get_requests_seen_json_file(local_dir)
+    with open(json_filepath, "w") as f:
+        f.write(json.dumps(requests_seen))
+
+    api.upload_file(
+        path_or_fileobj=json_filepath,
+        path_in_repo=json_filepath.replace(local_dir, ""),
+        repo_id=hf_repo,
+        repo_type="dataset",
+    )
 
 def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str):
     """Updates a given eval request with its new status on the hub (running, completed, failed, ...)"""
@@ -62,7 +76,6 @@ def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str,
         repo_type="dataset",
     )
 
-
 def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
     """Get all pending evaluation requests and return a list in which private
     models appearing first, followed by public models sorted by the number of
@@ -83,8 +96,16 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[Ev
         eval_request = EvalRequest(**data)
         eval_requests.append(eval_request)
 
-    return eval_requests
+    requests_seen_json = get_requests_seen_json_file(local_dir)
+    requests_seen = {}
+    if os.path.isfile(requests_seen_json):
+        with open(requests_seen_json, 'r', encoding='utf8') as r:
+            requests_seen = json.load(r)
+
+    return eval_requests, requests_seen
 
+def get_requests_seen_json_file(local_dir):
+    return f"{local_dir}/counters/{datetime.datetime.now().strftime('%B-%Y')}.json"
 
 def check_completed_evals(
     api: HfApi,
@@ -99,7 +120,7 @@ def check_completed_evals(
     """Checks if the currently running evals are completed, if yes, update their status on the hub."""
     snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results, repo_type="dataset", max_workers=60, token=TOKEN)
 
-    running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
+    running_evals, _ = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
 
     for eval_request in running_evals:
         model = eval_request.model
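
To make the storage side concrete, here is a rough sketch (again not part of the commit) of how the monthly counters file round-trips on disk, following the layout implied by get_requests_seen_json_file, set_requests_seen, and the new loading code in get_eval_requests; the api.upload_file call is omitted, the helper names are hypothetical, and the sample values are made up.

import datetime
import json
import os

def counters_path(local_dir: str) -> str:
    # Mirrors get_requests_seen_json_file: one file per calendar month,
    # e.g. "<local_dir>/counters/March-2024.json".
    return f"{local_dir}/counters/{datetime.datetime.now().strftime('%B-%Y')}.json"

def load_counters(local_dir: str) -> dict:
    # Mirrors the loading added to get_eval_requests: a missing file yields an empty dict.
    path = counters_path(local_dir)
    if os.path.isfile(path):
        with open(path, "r", encoding="utf8") as f:
            return json.load(f)
    return {}

def save_counters(local_dir: str, requests_seen: dict) -> None:
    # Mirrors the local write in set_requests_seen (the hub upload is omitted);
    # makedirs is added here only so the sketch is self-contained.
    path = counters_path(local_dir)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w") as f:
        f.write(json.dumps(requests_seen))

# Illustrative round trip with made-up values:
seen = load_counters("./eval-requests")
seen["count_g5.2xlarge"] = seen.get("count_g5.2xlarge", 0) + 1
seen.setdefault("alice", []).append({"model_id": "org/model-7b", "revision": "main"})
save_counters("./eval-requests", seen)

Because the filename is derived from the current month (for example counters/March-2024.json), the caps reset naturally at each month boundary; note that set_requests_seen as committed appears to assume the counters/ directory already exists in the local checkout of the queue repo, which is why the sketch adds an explicit os.makedirs.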