Commit 894c4b4 (committed by pminervini)
Parent(s): e504efd

update

Files changed:
- backend-cli.py +80 -0
- src/backend/envs.py +33 -0
- src/backend/manage_requests.py +126 -0
- src/backend/run_eval_suite.py +36 -0
- src/backend/sort_queue.py +28 -0
backend-cli.py
ADDED
@@ -0,0 +1,80 @@
import os
import json

from datetime import datetime

from huggingface_hub import snapshot_download

from src.backend.run_eval_suite import run_evaluation
from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
from src.backend.sort_queue import sort_models_by_priority
from src.backend.envs import Tasks, NUM_FEWSHOT, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT

from src.envs import QUEUE_REPO, RESULTS_REPO, API

import logging
import pprint

TASKS_HARNESS = [task.value.benchmark for task in Tasks]

logging.getLogger("openai").setLevel(logging.WARNING)

logging.basicConfig(level=logging.ERROR)
pp = pprint.PrettyPrinter(width=80)

PENDING_STATUS = "PENDING"
RUNNING_STATUS = "RUNNING"
FINISHED_STATUS = "FINISHED"
FAILED_STATUS = "FAILED"

snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)


def run_auto_eval():
    current_pending_status = [PENDING_STATUS]

    # Pull the eval requests dataset from the Hub and parse any eval requests;
    # check completed evals and set them to FINISHED.
    check_completed_evals(api=API, checked_status=RUNNING_STATUS, completed_status=FINISHED_STATUS,
                          failed_status=FAILED_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND,
                          hf_repo_results=RESULTS_REPO, local_dir_results=EVAL_RESULTS_PATH_BACKEND)

    # Get all eval requests that are PENDING; change this parameter to run other evals.
    eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
    # Sort the evals by priority (first submitted, first run)
    eval_requests = sort_models_by_priority(api=API, models=eval_requests)

    print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")

    if len(eval_requests) == 0:
        return

    eval_request = eval_requests[0]
    pp.pprint(eval_request)

    set_eval_request(api=API, eval_request=eval_request, set_to_status=RUNNING_STATUS, hf_repo=QUEUE_REPO,
                     local_dir=EVAL_REQUESTS_PATH_BACKEND)

    results = run_evaluation(eval_request=eval_request, task_names=TASKS_HARNESS, num_fewshot=NUM_FEWSHOT,
                             batch_size=1, device=DEVICE, no_cache=True, limit=LIMIT)

    dumped = json.dumps(results, indent=2)
    print(dumped)

    output_path = os.path.join(EVAL_RESULTS_PATH_BACKEND, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w") as f:
        f.write(dumped)

    API.upload_file(path_or_fileobj=output_path, path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
                    repo_id=RESULTS_REPO, repo_type="dataset")

    set_eval_request(api=API, eval_request=eval_request, set_to_status=FINISHED_STATUS, hf_repo=QUEUE_REPO,
                     local_dir=EVAL_REQUESTS_PATH_BACKEND)


if __name__ == "__main__":
    run_auto_eval()
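Note: backend-cli.py handles at most one PENDING request per invocation, so it has to be re-run periodically to drain the queue. A minimal scheduling sketch, not part of this commit (the poll interval is an arbitrary assumption):

import subprocess
import time

while True:
    # one queue-processing pass: backend-cli.py picks up at most one PENDING request
    subprocess.run(["python", "backend-cli.py"], check=False)
    time.sleep(600)  # hypothetical 10-minute poll interval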
src/backend/envs.py
ADDED
@@ -0,0 +1,33 @@
import os

import torch

from dataclasses import dataclass
from enum import Enum

from src.envs import CACHE_PATH


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    # task0 = Task("anli_r1", "acc", "ANLI")
    # task1 = Task("logiqa", "acc_norm", "LogiQA")
    task0 = Task("nq_open", "em", "NQ Open")
    task1 = Task("triviaqa", "em", "TriviaQA")


NUM_FEWSHOT = 64  # Change to match your few-shot setting

EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

LIMIT = 32  # For testing only; set to None for full evaluation runs
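Each Tasks member wraps a Task record; downstream code reads the harness task name from task.value.benchmark, which is exactly how TASKS_HARNESS is built in backend-cli.py above. A minimal sketch of consuming the enum:

from src.backend.envs import Tasks

# collect the lm-eval-harness task names defined for this leaderboard
benchmarks = [task.value.benchmark for task in Tasks]
print(benchmarks)  # -> ['nq_open', 'triviaqa']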
src/backend/manage_requests.py
ADDED
@@ -0,0 +1,126 @@
import glob
import json
from dataclasses import dataclass
from typing import Optional

from huggingface_hub import HfApi, snapshot_download


@dataclass
class EvalRequest:
    model: str
    private: bool
    status: str
    json_filepath: str
    weight_type: str = "Original"
    model_type: Optional[str] = None  # pretrained, finetuned, with RL
    precision: str = ""  # float16, bfloat16
    base_model: Optional[str] = None  # for adapter models
    revision: str = "main"  # commit hash
    submitted_time: Optional[str] = "2022-05-18T11:40:22.519222"  # fallback date so requests can still be ordered by date
    likes: Optional[int] = 0
    params: Optional[int] = None
    license: Optional[str] = ""

    def get_model_args(self):
        model_args = f"pretrained={self.model},revision={self.revision}"

        if self.precision in ["float16", "float32", "bfloat16"]:
            model_args += f",dtype={self.precision}"
        # Quantized models need some added config and the bitsandbytes install, e.g.:
        # elif self.precision == "8bit":
        #     model_args += ",load_in_8bit=True"
        # elif self.precision == "4bit":
        #     model_args += ",load_in_4bit=True"
        # elif self.precision == "GPTQ":
        #     # A GPTQ model does not need dtype to be specified;
        #     # it will be inferred from the config.
        #     pass
        else:
            raise Exception(f"Unknown precision {self.precision}.")

        return model_args


def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str):
    """Updates a given eval request with its new status on the hub (running, completed, failed, ...)"""
    json_filepath = eval_request.json_filepath

    with open(json_filepath) as fp:
        data = json.load(fp)

    data["status"] = set_to_status

    with open(json_filepath, "w") as f:
        f.write(json.dumps(data))

    api.upload_file(
        path_or_fileobj=json_filepath,
        path_in_repo=json_filepath.replace(local_dir, ""),
        repo_id=hf_repo,
        repo_type="dataset",
    )


def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
    """Gets all evaluation requests whose status is in `job_status` from the requests repo.

    Returns:
        `list[EvalRequest]`: the matching eval requests.
    """
    snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60)
    json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)

    eval_requests = []
    for json_filepath in json_files:
        with open(json_filepath) as fp:
            data = json.load(fp)
        if data["status"] in job_status:
            data["json_filepath"] = json_filepath

            del data["job_id"]

            eval_request = EvalRequest(**data)
            eval_requests.append(eval_request)

    return eval_requests


def check_completed_evals(
    api: HfApi,
    hf_repo: str,
    local_dir: str,
    checked_status: str,
    completed_status: str,
    failed_status: str,
    hf_repo_results: str,
    local_dir_results: str,
):
    """Checks if the currently running evals are completed; if yes, updates their status on the hub."""
    snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results, repo_type="dataset", max_workers=60)

    # get_eval_requests expects a list of statuses, so wrap the single status
    running_evals = get_eval_requests(job_status=[checked_status], hf_repo=hf_repo, local_dir=local_dir)

    for eval_request in running_evals:
        model = eval_request.model
        print("====================================")
        print(f"Checking {model}")

        output_path = model
        output_file = f"{local_dir_results}/{output_path}/results*.json"
        output_file_exists = len(glob.glob(output_file)) > 0

        if output_file_exists:
            print(f"Result file exists for {model}, setting it to {completed_status}")
            set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
        else:
            print(f"No result file found for {model}, setting it to {failed_status}")
            set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
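For reference, a request parsed by get_eval_requests ends up as an EvalRequest whose get_model_args() string is handed to the evaluation harness. A small sketch with illustrative values (the model repo id and file path are hypothetical):

from src.backend.manage_requests import EvalRequest

request = EvalRequest(
    model="some-org/some-model",                           # hypothetical model repo id
    private=False,
    status="PENDING",
    json_filepath="eval-queue-bk/some-org/request.json",   # hypothetical local path
    precision="float16",
    revision="main",
)
print(request.get_model_args())
# -> pretrained=some-org/some-model,revision=main,dtype=float16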
src/backend/run_eval_suite.py
ADDED
@@ -0,0 +1,36 @@
from lm_eval import tasks, evaluator, utils
from src.backend.manage_requests import EvalRequest

import logging

logging.getLogger("openai").setLevel(logging.WARNING)


def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, no_cache=True, limit=None):
    if limit:
        print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")

    task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)

    print(f"Selected Tasks: {task_names}")

    results = evaluator.simple_evaluate(
        model="hf-causal-experimental",  # "hf-causal"
        model_args=eval_request.get_model_args(),
        tasks=task_names,
        num_fewshot=num_fewshot,
        batch_size=batch_size,
        device=device,
        no_cache=no_cache,
        limit=limit,
        write_out=True,
        output_base_path="logs",
    )

    results["config"]["model_dtype"] = eval_request.precision
    results["config"]["model_name"] = eval_request.model
    results["config"]["model_sha"] = eval_request.revision

    print(evaluator.make_table(results))

    return results
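A minimal driver sketch, mirroring what backend-cli.py does for a single request (the model id is hypothetical; NUM_FEWSHOT, DEVICE, and LIMIT come from src.backend.envs):

from src.backend.envs import Tasks, NUM_FEWSHOT, DEVICE, LIMIT
from src.backend.manage_requests import EvalRequest
from src.backend.run_eval_suite import run_evaluation

request = EvalRequest(model="some-org/some-model", private=False, status="RUNNING",
                      json_filepath="", precision="float16")  # hypothetical request
task_names = [task.value.benchmark for task in Tasks]
results = run_evaluation(eval_request=request, task_names=task_names, num_fewshot=NUM_FEWSHOT,
                         batch_size=1, device=DEVICE, no_cache=True, limit=LIMIT)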
src/backend/sort_queue.py
ADDED
@@ -0,0 +1,28 @@
from dataclasses import dataclass
from huggingface_hub import HfApi
from src.backend.manage_requests import EvalRequest


@dataclass
class ModelMetadata:
    likes: int = 0
    size: int = 15


def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
    private_models = [model for model in models if model.private]
    public_models = [model for model in models if not model.private]

    return sort_by_submit_date(private_models) + sort_by_submit_date(public_models)


def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    return sorted(eval_requests, key=lambda x: x.submitted_time, reverse=False)


def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    return sorted(eval_requests, key=lambda x: x.params, reverse=False)


def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)