Clémentine
committed on
Commit
·
1ffc326
1
Parent(s):
943f952
now with a functioning backend
Browse files- .gitignore +3 -5
- README.md +6 -2
- app.py +10 -1
- main_backend.py +78 -0
- requirements.txt +3 -1
- scripts/fix_harness_import.py +11 -0
- src/{display/about.py → about.py} +6 -2
- src/backend/manage_requests.py +123 -0
- src/backend/run_eval_suite.py +57 -0
- src/backend/sort_queue.py +28 -0
- src/display/formatting.py +0 -9
- src/display/utils.py +1 -1
- src/envs.py +11 -3
- src/leaderboard/read_evals.py +1 -1
.gitignore
CHANGED
@@ -6,10 +6,8 @@ __pycache__/
|
|
6 |
*ipynb
|
7 |
.vscode/
|
8 |
|
9 |
-
gpt_4_evals/
|
10 |
-
human_evals/
|
11 |
eval-queue/
|
12 |
eval-results/
|
13 |
-
|
14 |
-
|
15 |
-
|
|
|
6 |
*ipynb
|
7 |
.vscode/
|
8 |
|
|
|
|
|
9 |
eval-queue/
|
10 |
eval-results/
|
11 |
+
eval-queue-bk/
|
12 |
+
eval-results-bk/
|
13 |
+
logs/
|
README.md
CHANGED
@@ -12,7 +12,7 @@ license: apache-2.0
|
|
12 |
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
14 |
|
15 |
-
Most of the variables to change for a default leaderboard are in env (replace the path for your leaderboard) and src/
|
16 |
|
17 |
Results files should have the following format:
|
18 |
```
|
@@ -33,4 +33,8 @@ Results files should have the following format:
|
|
33 |
}
|
34 |
```
|
35 |
|
36 |
-
Request files are created automatically by this tool.
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
14 |
|
15 |
+
Most of the variables to change for a default leaderboard are in src/envs.py (replace the path for your leaderboard) and src/about.py.
|
16 |
|
17 |
Results files should have the following format:
|
18 |
```
|
|
|
33 |
}
|
34 |
```
|
35 |
|
36 |
+
Request files are created automatically by this tool.
|
37 |
+
|
38 |
+
If you encounter a problem on the Space, don't hesitate to restart it to remove the eval-queue, eval-queue-bk, eval-results and eval-results-bk folders it created.
|
39 |
+
|
40 |
+
If you want to run your own backend, you only need to change the logic in src/backend/run_eval_suite, which at the moment launches the Eleuther AI Harness.
|
app.py
CHANGED
@@ -1,9 +1,10 @@
|
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
from apscheduler.schedulers.background import BackgroundScheduler
|
4 |
from huggingface_hub import snapshot_download
|
5 |
|
6 |
-
from src.
|
7 |
CITATION_BUTTON_LABEL,
|
8 |
CITATION_BUTTON_TEXT,
|
9 |
EVALUATION_QUEUE_TEXT,
|
@@ -30,9 +31,14 @@ from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
|
30 |
from src.submission.submit import add_new_eval
|
31 |
|
32 |
|
|
|
|
|
33 |
def restart_space():
    """Ask the Hub to restart the Space identified by ``REPO_ID``."""
    # Authenticates with the module-level TOKEN.
    API.restart_space(
        repo_id=REPO_ID,
        token=TOKEN,
    )
|
35 |
|
|
|
|
|
|
|
36 |
try:
|
37 |
print(EVAL_REQUESTS_PATH)
|
38 |
snapshot_download(
|
@@ -342,5 +348,8 @@ with demo:
|
|
342 |
|
343 |
scheduler = BackgroundScheduler()
|
344 |
scheduler.add_job(restart_space, "interval", seconds=1800)
|
|
|
345 |
scheduler.start()
|
346 |
demo.queue(default_concurrency_limit=40).launch()
|
|
|
|
|
|
1 |
+
import subprocess
|
2 |
import gradio as gr
|
3 |
import pandas as pd
|
4 |
from apscheduler.schedulers.background import BackgroundScheduler
|
5 |
from huggingface_hub import snapshot_download
|
6 |
|
7 |
+
from src.about import (
|
8 |
CITATION_BUTTON_LABEL,
|
9 |
CITATION_BUTTON_TEXT,
|
10 |
EVALUATION_QUEUE_TEXT,
|
|
|
31 |
from src.submission.submit import add_new_eval
|
32 |
|
33 |
|
34 |
+
subprocess.run(["python", "scripts/fix_harness_import.py"])
|
35 |
+
|
36 |
def restart_space():
    """Ask the Hub to restart the Space identified by ``REPO_ID``."""
    # Authenticates with the module-level TOKEN.
    API.restart_space(
        repo_id=REPO_ID,
        token=TOKEN,
    )
|
38 |
|
39 |
+
def launch_backend():
    """Run one pass of the evaluation backend (``main_backend.py``) in a child process.

    The call blocks until the child exits. Its return code is deliberately
    ignored: a failed pass must not raise inside the scheduler job.
    """
    import sys

    # Use the interpreter running this app rather than whatever "python"
    # resolves to on PATH, so the child sees the same installed packages.
    _ = subprocess.run([sys.executable, "main_backend.py"])
|
41 |
+
|
42 |
try:
|
43 |
print(EVAL_REQUESTS_PATH)
|
44 |
snapshot_download(
|
|
|
348 |
|
349 |
scheduler = BackgroundScheduler()
|
350 |
scheduler.add_job(restart_space, "interval", seconds=1800)
|
351 |
+
scheduler.add_job(launch_backend, "interval", seconds=100) # will only allow one job to be run at the same time
|
352 |
scheduler.start()
|
353 |
demo.queue(default_concurrency_limit=40).launch()
|
354 |
+
|
355 |
+
restart_space()
|
main_backend.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import pprint
|
3 |
+
|
4 |
+
from huggingface_hub import snapshot_download
|
5 |
+
|
6 |
+
logging.getLogger("openai").setLevel(logging.WARNING)
|
7 |
+
|
8 |
+
from src.backend.run_eval_suite import run_evaluation
|
9 |
+
from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
|
10 |
+
from src.backend.sort_queue import sort_models_by_priority
|
11 |
+
|
12 |
+
from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, DEVICE, API, LIMIT
|
13 |
+
from src.about import Tasks, NUM_FEWSHOT
|
14 |
+
TASKS_HARNESS = [task.value.benchmark for task in Tasks]
|
15 |
+
|
16 |
+
logging.basicConfig(level=logging.ERROR)
|
17 |
+
pp = pprint.PrettyPrinter(width=80)
|
18 |
+
|
19 |
+
PENDING_STATUS = "PENDING"
|
20 |
+
RUNNING_STATUS = "RUNNING"
|
21 |
+
FINISHED_STATUS = "FINISHED"
|
22 |
+
FAILED_STATUS = "FAILED"
|
23 |
+
|
24 |
+
snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
|
25 |
+
snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
|
26 |
+
|
27 |
+
def run_auto_eval():
    """Run a single backend pass: reconcile running evals, then evaluate one pending request.

    Steps:
      1. Settle every request currently marked RUNNING via
         ``check_completed_evals``.
      2. Fetch the PENDING requests and order them by priority.
      3. Mark the first request RUNNING on the hub and evaluate it.

    Returns ``None`` in every case; nothing is evaluated when the queue is empty.
    """
    # Statuses considered "pending"; extend this list to pick up other evals.
    current_pending_status = [PENDING_STATUS]

    # Settle the requests that were left RUNNING before picking a new one.
    check_completed_evals(
        api=API,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
        checked_status=RUNNING_STATUS,
        completed_status=FINISHED_STATUS,
        failed_status=FAILED_STATUS,
        hf_repo_results=RESULTS_REPO,
        local_dir_results=EVAL_RESULTS_PATH_BACKEND,
    )

    # Fetch the pending requests and order them (first submitted, first run).
    eval_requests = sort_models_by_priority(
        api=API,
        models=get_eval_requests(
            job_status=current_pending_status,
            hf_repo=QUEUE_REPO,
            local_dir=EVAL_REQUESTS_PATH_BACKEND,
        ),
    )

    print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")

    # Nothing queued: this pass is done.
    if not eval_requests:
        return

    eval_request = eval_requests[0]
    pp.pprint(eval_request)

    # Publish the RUNNING status before starting the evaluation.
    set_eval_request(
        api=API,
        eval_request=eval_request,
        set_to_status=RUNNING_STATUS,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
    )

    run_evaluation(
        eval_request=eval_request,
        task_names=TASKS_HARNESS,
        num_fewshot=NUM_FEWSHOT,
        batch_size=1,
        device=DEVICE,
        local_dir=EVAL_RESULTS_PATH_BACKEND,
        results_repo=RESULTS_REPO,
        no_cache=True,
        limit=LIMIT,
    )
|
75 |
+
|
76 |
+
|
77 |
+
if __name__ == "__main__":
|
78 |
+
run_auto_eval()
|
requirements.txt
CHANGED
@@ -12,4 +12,6 @@ python-dateutil==2.8.2
|
|
12 |
requests==2.28.2
|
13 |
tqdm==4.65.0
|
14 |
transformers==4.35.2
|
15 |
-
tokenizers>=0.15.0
|
|
|
|
|
|
12 |
requests==2.28.2
|
13 |
tqdm==4.65.0
|
14 |
transformers==4.35.2
|
15 |
+
tokenizers>=0.15.0
|
16 |
+
git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
|
17 |
+
accelerate==0.24.1
|
scripts/fix_harness_import.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""This file should be used after pip install -r requirements.
|
2 |
+
It creates a folder that is not ported during harness package creation (as they don't use a Manifest file at the moment, so `.json` files are ignored).
|
3 |
+
It will need to be updated if we want to use the harness' version of big bench to actually copy the json files.
|
4 |
+
"""
|
5 |
+
import os
|
6 |
+
|
7 |
+
import lm_eval
|
8 |
+
|
9 |
+
if __name__ == "__main__":
    # Locate the installed lm_eval package and make sure the
    # datasets/bigbench_resources directory exists inside it.
    package_root = lm_eval.__path__[0]
    bigbench_dir = os.path.join(package_root, "datasets", "bigbench_resources")
    os.makedirs(bigbench_dir, exist_ok=True)
|
src/{display/about.py → about.py}
RENAMED
@@ -11,8 +11,12 @@ class Task:
|
|
11 |
# Init: to update with your specific keys
|
12 |
class Tasks(Enum):
|
13 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
14 |
-
task0 = Task("
|
15 |
-
task1 = Task("
|
|
|
|
|
|
|
|
|
16 |
|
17 |
|
18 |
# Your leaderboard name
|
|
|
11 |
# Init: to update with your specific keys
|
12 |
class Tasks(Enum):
|
13 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
14 |
+
task0 = Task("anli_r1", "acc", "ANLI")
|
15 |
+
task1 = Task("logiqa", "acc_norm", "LogiQA")
|
16 |
+
|
17 |
+
TASKS_HARNESS = [task.value.benchmark for task in Tasks]
|
18 |
+
|
19 |
+
NUM_FEWSHOT = 0 # Change with your few shot
|
20 |
|
21 |
|
22 |
# Your leaderboard name
|
src/backend/manage_requests.py
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import glob
|
2 |
+
import json
|
3 |
+
from dataclasses import dataclass
|
4 |
+
from typing import Optional
|
5 |
+
|
6 |
+
from huggingface_hub import HfApi, snapshot_download
|
7 |
+
from src.envs import TOKEN
|
8 |
+
|
9 |
+
@dataclass
class EvalRequest:
    """One evaluation request, built from the fields of a request JSON file.

    Field order is part of the interface (positional construction), so the
    declarations below keep their original positions.
    """

    model: str
    private: bool
    status: str
    json_filepath: str
    weight_type: str = "Original"
    # pretrained, finetuned, with RL.
    # This field used to be declared twice (`str = ""` here and
    # `Optional[str] = None` further down); the later declaration won, so the
    # effective type and default are kept at the original position.
    model_type: Optional[str] = None
    precision: str = ""  # float16, bfloat16, 8bit, 4bit, GPTQ
    base_model: Optional[str] = None  # for adapter models
    revision: str = "main"  # commit
    # Arbitrary date so that requests without one can still be ordered by date.
    submitted_time: Optional[str] = "2022-05-18T11:40:22.519222"
    likes: Optional[int] = 0
    params: Optional[int] = None
    license: Optional[str] = ""

    def get_model_args(self) -> str:
        """Build the comma-separated ``model_args`` string for this request.

        Returns:
            ``pretrained=<model>,revision=<revision>`` followed by the
            precision-specific argument, if any.

        Raises:
            ValueError: if ``precision`` is not one of the supported values.
        """
        model_args = f"pretrained={self.model},revision={self.revision}"

        if self.precision in ("float16", "bfloat16"):
            model_args += f",dtype={self.precision}"
        elif self.precision == "8bit":
            model_args += ",load_in_8bit=True"
        elif self.precision == "4bit":
            model_args += ",load_in_4bit=True"
        elif self.precision == "GPTQ":
            # A GPTQ model does not need dtype to be specified,
            # it will be inferred from the config.
            pass
        else:
            raise ValueError(f"Unknown precision {self.precision}.")

        return model_args
|
43 |
+
|
44 |
+
|
45 |
+
def set_eval_request(
    api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str
):
    """Update an eval request's status locally and push the file to the hub.

    Args:
        api: hub client used for the upload.
        eval_request: request whose ``json_filepath`` is rewritten in place.
        set_to_status: new value stored under the ``"status"`` key.
        hf_repo: dataset repository that receives the updated file.
        local_dir: local directory prefix of ``json_filepath``; it is removed
            to obtain the path inside the repository.
    """
    json_filepath = eval_request.json_filepath

    with open(json_filepath) as fp:
        data = json.load(fp)

    data["status"] = set_to_status

    with open(json_filepath, "w") as fp:
        json.dump(data, fp)

    api.upload_file(
        path_or_fileobj=json_filepath,
        # Strip only the leading local_dir. `str.replace` would also delete
        # later occurrences of that string inside the org or model name.
        path_in_repo=json_filepath.removeprefix(local_dir),
        repo_id=hf_repo,
        repo_type="dataset",
    )
|
65 |
+
|
66 |
+
|
67 |
+
def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
    """Download the request dataset and return the requests whose status matches.

    The returned list follows the order in which the JSON files are found; no
    sorting is done here.

    Args:
        job_status: statuses to keep. A single status string is also accepted
            and treated as a one-element list.
        local_dir: directory the request dataset is downloaded into.
        hf_repo: dataset repository holding the request files.

    Returns:
        `list[EvalRequest]`: one entry per matching request file, with
        ``json_filepath`` set to the file it was read from.
    """
    # A bare string would make `in` a substring test; compare whole statuses.
    wanted_statuses = [job_status] if isinstance(job_status, str) else job_status

    snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60)
    json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)

    eval_requests = []
    for json_filepath in json_files:
        with open(json_filepath) as fp:
            data = json.load(fp)
        if data["status"] in wanted_statuses:
            data["json_filepath"] = json_filepath
            eval_requests.append(EvalRequest(**data))

    return eval_requests
|
88 |
+
|
89 |
+
|
90 |
+
def check_completed_evals(
    api: HfApi,
    hf_repo: str,
    local_dir: str,
    checked_status: str,
    completed_status: str,
    failed_status: str,
    hf_repo_results: str,
    local_dir_results: str,
):
    """Settle every request in ``checked_status`` by looking for its results file.

    A request is set to ``completed_status`` when a ``results*.json`` file
    exists under ``<local_dir_results>/<model>/``, and to ``failed_status``
    otherwise. Each change is pushed to ``hf_repo``.
    """
    snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results, repo_type="dataset", max_workers=60)

    # Pass a list: with a bare string, the `in` check in get_eval_requests
    # becomes a substring match instead of an exact status match.
    running_evals = get_eval_requests([checked_status], hf_repo=hf_repo, local_dir=local_dir)

    for eval_request in running_evals:
        model = eval_request.model
        print("====================================")
        print(f"Checking {model}")

        output_file = f"{local_dir_results}/{model}/results*.json"
        output_file_exists = len(glob.glob(output_file)) > 0

        if output_file_exists:
            print(
                f"EXISTS output file exists for {model} setting it to {completed_status}"
            )
            set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
        else:
            print(
                f"No result file found for {model} setting it to {failed_status}"
            )
            set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
|
src/backend/run_eval_suite.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import logging
|
4 |
+
from datetime import datetime
|
5 |
+
|
6 |
+
from lm_eval import tasks, evaluator, utils
|
7 |
+
|
8 |
+
from src.envs import RESULTS_REPO, API
|
9 |
+
from src.backend.manage_requests import EvalRequest
|
10 |
+
|
11 |
+
logging.getLogger("openai").setLevel(logging.WARNING)
|
12 |
+
|
13 |
+
def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, local_dir: str, results_repo: str, no_cache=True, limit=None):
    """Evaluate one request with the harness, save the results and upload them.

    Args:
        eval_request: request to evaluate; provides the model arguments and
            the model name, precision and revision recorded in the results.
        task_names: task name patterns, matched against ``tasks.ALL_TASKS``.
        num_fewshot: forwarded to ``evaluator.simple_evaluate``.
        batch_size: forwarded to ``evaluator.simple_evaluate``.
        device: forwarded to ``evaluator.simple_evaluate``.
        local_dir: root directory for the local copy of the results file.
        results_repo: dataset repository the results file is uploaded to.
        no_cache: forwarded to ``evaluator.simple_evaluate``.
        limit: forwarded to ``evaluator.simple_evaluate``; testing only.

    Returns:
        The results dict returned by the harness, with ``model_dtype``,
        ``model_name`` and ``model_sha`` added under ``"config"``.
    """
    if limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)

    print(f"Selected Tasks: {task_names}")

    results = evaluator.simple_evaluate(
        model="hf-causal-experimental",  # "hf-causal"
        model_args=eval_request.get_model_args(),
        tasks=task_names,
        num_fewshot=num_fewshot,
        batch_size=batch_size,
        device=device,
        no_cache=no_cache,
        limit=limit,
        write_out=True,
        output_base_path="logs"
    )

    results["config"]["model_dtype"] = eval_request.precision
    results["config"]["model_name"] = eval_request.model
    results["config"]["model_sha"] = eval_request.revision

    dumped = json.dumps(results, indent=2)
    print(dumped)

    # Build the file name once. Calling datetime.now() separately for the
    # local path and the repo path gave the two copies different names.
    results_filename = f"results_{datetime.now()}.json"

    output_path = os.path.join(local_dir, *eval_request.model.split("/"), results_filename)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w") as f:
        f.write(dumped)

    print(evaluator.make_table(results))

    API.upload_file(
        path_or_fileobj=output_path,
        path_in_repo=f"{eval_request.model}/{results_filename}",
        repo_id=results_repo,
        repo_type="dataset",
    )

    return results
|
src/backend/sort_queue.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
from dataclasses import dataclass
|
3 |
+
|
4 |
+
from huggingface_hub import HfApi
|
5 |
+
|
6 |
+
from src.backend.manage_requests import EvalRequest
|
7 |
+
|
8 |
+
|
9 |
+
@dataclass
class ModelMetadata:
    """Metadata record for a model: a like count and a size, both with defaults."""

    # Defaults to 0.
    likes: int = 0
    # Defaults to 15.
    size: int = 15
|
13 |
+
|
14 |
+
|
15 |
+
def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
    """Order requests: private models first, then public ones, each group by submission date.

    ``api`` is accepted for interface compatibility and is not used.
    """
    private_group = []
    public_group = []
    for request in models:
        # Partition on the `private` flag, keeping the incoming order.
        if request.private:
            private_group.append(request)
        else:
            public_group.append(request)

    ordered_private = sort_by_submit_date(private_group)
    ordered_public = sort_by_submit_date(public_group)
    return ordered_private + ordered_public
|
20 |
+
|
21 |
+
def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    """Return a new list of the requests ordered by ascending ``submitted_time``."""
    ordered = list(eval_requests)
    ordered.sort(key=lambda request: request.submitted_time)
    return ordered
|
23 |
+
|
24 |
+
def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    """Return a new list of the requests ordered by ascending ``params``."""
    ordered = list(eval_requests)
    ordered.sort(key=lambda request: request.params)
    return ordered
|
26 |
+
|
27 |
+
def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    """Return a new list of the requests ordered by ascending ``likes``."""
    ordered = list(eval_requests)
    ordered.sort(key=lambda request: request.likes)
    return ordered
|
src/display/formatting.py
CHANGED
@@ -1,12 +1,3 @@
|
|
1 |
-
import os
|
2 |
-
from datetime import datetime, timezone
|
3 |
-
|
4 |
-
from huggingface_hub import HfApi
|
5 |
-
from huggingface_hub.hf_api import ModelInfo
|
6 |
-
|
7 |
-
|
8 |
-
API = HfApi()
|
9 |
-
|
10 |
def model_hyperlink(link, model_name):
|
11 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
def model_hyperlink(link, model_name):
|
2 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
3 |
|
src/display/utils.py
CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
|
|
3 |
|
4 |
import pandas as pd
|
5 |
|
6 |
-
from src.
|
7 |
|
8 |
def fields(raw_class):
|
9 |
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
|
|
3 |
|
4 |
import pandas as pd
|
5 |
|
6 |
+
from src.about import Tasks
|
7 |
|
8 |
def fields(raw_class):
|
9 |
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
src/envs.py
CHANGED
@@ -2,18 +2,26 @@ import os
|
|
2 |
|
3 |
from huggingface_hub import HfApi
|
4 |
|
5 |
-
#
|
6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
-
OWNER = "demo-leaderboard"
|
9 |
REPO_ID = f"{OWNER}/leaderboard"
|
10 |
QUEUE_REPO = f"{OWNER}/requests"
|
11 |
RESULTS_REPO = f"{OWNER}/results"
|
12 |
|
|
|
13 |
CACHE_PATH=os.getenv("HF_HOME", ".")
|
14 |
|
15 |
# Local caches
|
16 |
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
17 |
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
|
|
|
|
|
18 |
|
19 |
API = HfApi(token=TOKEN)
|
|
|
2 |
|
3 |
from huggingface_hub import HfApi
|
4 |
|
5 |
+
# Info to change for your repository
|
6 |
+
# ----------------------------------
|
7 |
+
TOKEN = os.environ.get("TOKEN", None) # A read/write token for your org
|
8 |
+
|
9 |
+
OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request file
|
10 |
+
DEVICE = "cpu" # cuda:0 if you add compute
|
11 |
+
LIMIT = 20 # !!!! Should be None for actual evaluations!!!
|
12 |
+
# ----------------------------------
|
13 |
|
|
|
14 |
REPO_ID = f"{OWNER}/leaderboard"
|
15 |
QUEUE_REPO = f"{OWNER}/requests"
|
16 |
RESULTS_REPO = f"{OWNER}/results"
|
17 |
|
18 |
+
# If you setup a cache later, just change HF_HOME
|
19 |
CACHE_PATH=os.getenv("HF_HOME", ".")
|
20 |
|
21 |
# Local caches
|
22 |
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
23 |
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
|
24 |
+
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
|
25 |
+
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
|
26 |
|
27 |
API = HfApi(token=TOKEN)
|
src/leaderboard/read_evals.py
CHANGED
@@ -103,7 +103,7 @@ class EvalResult:
|
|
103 |
self.num_params = request.get("params", 0)
|
104 |
self.date = request.get("submitted_time", "")
|
105 |
except Exception:
|
106 |
-
print(f"Could not find request file for {self.org}/{self.model}")
|
107 |
|
108 |
def to_dict(self):
|
109 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
|
|
103 |
self.num_params = request.get("params", 0)
|
104 |
self.date = request.get("submitted_time", "")
|
105 |
except Exception:
|
106 |
+
print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
|
107 |
|
108 |
def to_dict(self):
|
109 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|