Commit 6c79b12 (parent: 84fb473)
Author: pminervini
Message: update

Files changed:
- backend-cli.py  (+18 -12)
- src/backend/envs.py  (+5 -4)
backend-cli.py CHANGED

@@ -8,14 +8,14 @@ from huggingface_hub import snapshot_download
 from src.backend.run_eval_suite import run_evaluation
 from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
 from src.backend.sort_queue import sort_models_by_priority
-from src.backend.envs import Tasks,
+from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND,EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT
 
 from src.envs import QUEUE_REPO, RESULTS_REPO, API
 
 import logging
 import pprint
 
-TASKS_HARNESS = [task.value.benchmark for task in Tasks]
+# TASKS_HARNESS = [task.value.benchmark for task in Tasks]
 
 logging.getLogger("openai").setLevel(logging.WARNING)
 
@@ -56,19 +56,25 @@ def run_auto_eval():
     set_eval_request(api=API, eval_request=eval_request, set_to_status=RUNNING_STATUS, hf_repo=QUEUE_REPO,
                      local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
-    results = run_evaluation(eval_request=eval_request, task_names=TASKS_HARNESS, num_fewshot=NUM_FEWSHOT,
-                             batch_size=1, device=DEVICE, no_cache=True, limit=LIMIT)
-    ...
-    print(dumped)
-    ...
-    f.write(dumped)
-    ...
+    # results = run_evaluation(eval_request=eval_request, task_names=TASKS_HARNESS, num_fewshot=NUM_FEWSHOT,
+    #                          batch_size=1, device=DEVICE, no_cache=True, limit=LIMIT)
+
+    TASKS_HARNESS = [task.value for task in Tasks]
+
+    for task in TASKS_HARNESS:
+        results = run_evaluation(eval_request=eval_request, task_names=[task.benchmark], num_fewshot=task.num_fewshot,
+                                 batch_size=1, device=DEVICE, no_cache=True, limit=LIMIT)
+
+        dumped = json.dumps(results, indent=2)
+        print(dumped)
+
+        output_path = os.path.join(EVAL_RESULTS_PATH_BACKEND, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
+        os.makedirs(os.path.dirname(output_path), exist_ok=True)
+        with open(output_path, "w") as f:
+            f.write(dumped)
+
+        API.upload_file(path_or_fileobj=output_path, path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
+                        repo_id=RESULTS_REPO, repo_type="dataset")
 
     set_eval_request(api=API, eval_request=eval_request, set_to_status=FINISHED_STATUS, hf_repo=QUEUE_REPO,
                      local_dir=EVAL_REQUESTS_PATH_BACKEND)
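For reference, a minimal standalone sketch of the per-task pattern backend-cli.py moves to in this commit: one run_evaluation call and one timestamped results file per task, instead of a single call over all benchmarks with a global NUM_FEWSHOT. run_evaluation is stubbed here, and model, results_root and the SimpleNamespace task objects are illustrative stand-ins for the real eval_request, EVAL_RESULTS_PATH_BACKEND and Tasks members used in the repo.

import json
import os
from datetime import datetime
from types import SimpleNamespace

def run_evaluation(task_names, num_fewshot, limit=None):
    # Stub standing in for src.backend.run_eval_suite.run_evaluation (hypothetical signature).
    return {"results": {name: {"em": 0.0} for name in task_names},
            "config": {"num_fewshot": num_fewshot, "limit": limit}}

# Illustrative stand-ins for the eval request model name and results directory.
model = "some-org/some-model"
results_root = "eval-results-bk"

tasks = [SimpleNamespace(benchmark="nq_open", num_fewshot=64),
         SimpleNamespace(benchmark="triviaqa", num_fewshot=64)]

# One evaluation run and one timestamped results file per task, mirroring the loop added above.
for task in tasks:
    results = run_evaluation([task.benchmark], task.num_fewshot, limit=None)
    dumped = json.dumps(results, indent=2)

    output_path = os.path.join(results_root, *model.split("/"),
                               f"results_{datetime.now()}.json")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w") as f:
        f.write(dumped)
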
src/backend/envs.py CHANGED

@@ -13,21 +13,22 @@ class Task:
     benchmark: str
     metric: str
     col_name: str
+    num_fewshot: int
 
 
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     # task0 = Task("anli_r1", "acc", "ANLI")
     # task1 = Task("logiqa", "acc_norm", "LogiQA")
-    task0 = Task("nq_open", "em", "NQ Open")
-    task1 = Task("triviaqa", "em", "TriviaQA")
+    task0 = Task("nq_open", "em", "NQ Open", 64)
+    task1 = Task("triviaqa", "em", "TriviaQA", 64)
 
 
-NUM_FEWSHOT = 64 # Change with your few shot
+# NUM_FEWSHOT = 64 # Change with your few shot
 
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
 DEVICE = "cuda:0" if torch.cuda.is_available() else 'cpu'
 
-LIMIT =
+LIMIT = None # Testing; needs to be None