Spaces:
Runtime error
Quentin Gallouédec
committed on
Commit • 74e3b17
Parent(s): e462d51
back and front!
Files changed:
- .gitignore +3 -1
- Makefile +6 -6
- app.py +240 -40
- main_backend_harness.py +0 -102
- scripts/create_request_file.py +0 -73
- scripts/fix_harness_import.py +0 -11
- src/about.py +0 -27
- src/backend/manage_requests.py +0 -107
- src/backend/run_eval_suite_harness.py +0 -91
- src/backend/sort_queue.py +0 -23
- src/{display/css_html_js.py → css_html_js.py} +0 -0
- src/display/log_visualizer.py +0 -40
- src/envs.py +3 -7
- src/logging.py +0 -1
- src/populate.py +0 -56
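
Taken together, the deletions drop the request-queue/harness backend (main_backend_harness.py and src/backend/*) and fold everything into a single app.py that serves the Gradio leaderboard and evaluates pending agents from a background job. A minimal sketch of that process layout, with the job body stubbed out (the real backend_routine in app.py lists Hub models, evaluates them, and uploads results):

import time

from apscheduler.schedulers.background import BackgroundScheduler


def backend_routine():
    # Stand-in for app.py's backend_routine: find un-evaluated agents,
    # run them in their Gym environments, and push a results JSON to the Hub.
    print("checking for pending models...")


scheduler = BackgroundScheduler()
scheduler.add_job(func=backend_routine, trigger="interval", seconds=30)  # same cadence as app.py
scheduler.start()

if __name__ == "__main__":
    # app.py launches the Gradio demo here; the sketch just idles instead.
    time.sleep(65)
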
.gitignore
CHANGED
@@ -12,4 +12,6 @@ eval-results/
 eval-queue-bk/
 eval-results-bk/
 logs/
-output.log
+output.log
+env
+.DS_Store
Makefile
CHANGED
@@ -2,12 +2,12 @@
 
 
 style:
-	python -m black --line-length 119 .
-	python -m isort .
-	ruff check --fix .
+	python -m black --line-length 119 scripts src app.py
+	python -m isort scripts src app.py
+	ruff check --fix scripts src app.py
 
 
 quality:
-	python -m black --check --line-length 119 .
-	python -m isort --check-only .
-	ruff check .
+	python -m black --check --line-length 119 scripts src app.py
+	python -m isort --check-only scripts src app.py
+	ruff check scripts src app.py
app.py
CHANGED
@@ -1,62 +1,262 @@
+import fnmatch
+import glob
+import json
 import logging
+import os
+import pprint
 
+import gradio as gr
+import gymnasium as gym
+import numpy as np
+import pandas as pd
+import torch
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import hf_hub_download, snapshot_download
+from huggingface_hub.utils._errors import EntryNotFoundError
 
-from src.display.log_visualizer import log_file_to_html_string
-from src.display.css_html_js import dark_mode_gradio_js
-from src.envs import REFRESH_RATE, REPO_ID, QUEUE_REPO, RESULTS_REPO
-from src.logging import setup_logger, log_file
+from src.css_html_js import dark_mode_gradio_js
+from src.envs import API, RESULTS_PATH, RESULTS_REPO, TOKEN
+from src.logging import configure_root_logger, setup_logger
 
+logging.getLogger("openai").setLevel(logging.WARNING)
+logger = setup_logger(__name__)
+
+configure_root_logger()
 logger = setup_logger(__name__)
 
+pp = pprint.PrettyPrinter(width=80)
+
+
+ALL_ENV_IDS = [
+    "CartPole-v1",
+    # "BreakoutNoFrameskip-v4",
+]
+
+
+def model_hyperlink(link, model_id):
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_id}</a>'
+
+
+def make_clickable_model(model_id):
+    link = f"https://huggingface.co/{model_id}"
+    return model_hyperlink(link, model_id)
+
+
+def pattern_match(patterns, source_list):
+    if isinstance(patterns, str):
+        patterns = [patterns]
+
+    env_ids = set()
+    for pattern in patterns:
+        for matching in fnmatch.filter(source_list, pattern):
+            env_ids.add(matching)
+    return sorted(list(env_ids))
+
+
+def evaluate(model_id, revision):
+    tags = API.model_info(model_id, revision=revision).tags
+
+    # Extract the environment IDs from the tags (usually only one)
+    env_ids = pattern_match(tags, ALL_ENV_IDS)
+    logger.info(f"Selected environments: {env_ids}")
+
+    results = {}
+
+    # Check if the agent exists
+    try:
+        agent_path = hf_hub_download(repo_id=model_id, filename="agent.pt")
+    except EntryNotFoundError:
+        logger.error("Agent not found")
+        return None
+
+    # Check safety
+    security = next(iter(API.list_files_info(model_id, "agent.pt", expand=True))).security
+    if security is None or "safe" not in security:
+        logger.error("Agent safety not available")
+        return None
+    elif not security["safe"]:
+        logger.error("Agent not safe")
+        return None
+
+    # Load the agent
+    try:
+        agent = torch.jit.load(agent_path)
+    except Exception as e:
+        logger.error(f"Error loading agent: {e}")
+        return None
+
+    # Evaluate the agent on the environments
+    for env_id in env_ids:
+        episodic_rewards = []
+        env = gym.make(env_id)
+        for _ in range(10):
+            episodic_reward = 0.0
+            observation, info = env.reset()
+            done = False
+            while not done:
+                torch_observation = torch.from_numpy(np.array([observation]))
+                action = agent(torch_observation).numpy()[0]
+                observation, reward, terminated, truncated, info = env.step(action)
+                done = terminated or truncated
+                episodic_reward += reward
+
+            episodic_rewards.append(episodic_reward)
+
+        mean_reward = np.mean(episodic_rewards)
+        results[env_id] = {"episodic_return": mean_reward}
+    return results
 
+
+def _backend_routine():
+    # List only the text classification models
+    rl_models = list(API.list_models(filter="reinforcement-learning"))
+    logger.info(f"Found {len(rl_models)} RL models")
+    compatible_models = []
+    for model in rl_models:
+        filenames = [sib.rfilename for sib in model.siblings]
+        if "agent.pt" in filenames:
+            compatible_models.append((model.modelId, model.sha))
+
+    logger.info(f"Found {len(compatible_models)} compatible models")
+
+    # Get the results
+    snapshot_download(
+        repo_id=RESULTS_REPO,
+        revision="main",
+        local_dir=RESULTS_PATH,
+        repo_type="dataset",
+        max_workers=60,
+        token=TOKEN,
+    )
+    json_files = glob.glob(f"{RESULTS_PATH}/**/*.json", recursive=True)
+
+    evaluated_models = set()
+    for json_filepath in json_files:
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+        evaluated_models.add((data["config"]["model_id"], data["config"]["model_sha"]))
+
+    # Find the models that are not associated with any results
+    pending_models = set(compatible_models) - evaluated_models
+    logger.info(f"Found {len(pending_models)} pending models")
+
+    # Run an evaluation on the models
+    for model_id, sha in pending_models:
+        logger.info(f"Running evaluation on {model_id}")
+        report = {"config": {"model_id": model_id, "model_sha": sha}}
+        evaluations = evaluate(model_id, revision=sha)
+        if evaluations is not None:
+            report["results"] = evaluations
+            report["status"] = "DONE"
+        else:
+            report["status"] = "FAILED"
+
+        # Update the results
+        dumped = json.dumps(report, indent=2)
+        output_path = os.path.join(RESULTS_PATH, model_id, f"results_{sha}.json")
+        os.makedirs(os.path.dirname(output_path), exist_ok=True)
+        with open(output_path, "w") as f:
+            f.write(dumped)
+
+        # Upload the results to the results repo
+        API.upload_file(
+            path_or_fileobj=output_path,
+            path_in_repo=f"{model_id}/results_{sha}.json",
+            repo_id=RESULTS_REPO,
+            repo_type="dataset",
+        )
+
+
+def backend_routine():
+    try:
+        _backend_routine()
+    except Exception as e:
+        logger.error(f"{e.__class__.__name__}: {str(e)}")
+
+
+def get_leaderboard_df():
+    snapshot_download(
+        repo_id=RESULTS_REPO,
+        revision="main",
+        local_dir=RESULTS_PATH,
+        repo_type="dataset",
+        max_workers=60,
+        token=TOKEN,
+    )
+
+    json_files = glob.glob(f"{RESULTS_PATH}/**/*.json", recursive=True)
+    data = []
+
+    for json_filepath in json_files:
+        with open(json_filepath) as fp:
+            report = json.load(fp)
+        model_id = report["config"]["model_id"]
+        row = {"Agent": model_id, "Status": report["status"]}
+        if report["status"] == "DONE":
+            results = {env_id: result["episodic_return"] for env_id, result in report["results"].items()}
+            row.update(results)
+        data.append(row)
+
+    # Create DataFrame
+    df = pd.DataFrame(data)
+    # Replace NaN values with empty strings
+    df = df.fillna("")
+    return df
+
+
+TITLE = """
+🚀 Open RL Leaderboard
 """
 
-| Description | Link |
-|-----------------|------|
-| Leaderboard | [{REPO_ID}](https://huggingface.co/spaces/{REPO_ID}) |
-| Queue Repo | [{QUEUE_REPO}](https://huggingface.co/datasets/{QUEUE_REPO}) |
-| Results Repo | [{RESULTS_REPO}](https://huggingface.co/datasets/{RESULTS_REPO}) |
+INTRODUCTION_TEXT = """
+Welcome to the Open RL Leaderboard! This is a community-driven benchmark for reinforcement learning models.
 """
 
-def button_auto_eval():
-    logger.info("Manually triggering Auto Eval")
-    run_auto_eval()
+ABOUT_TEXT = """
+The Open RL Leaderboard is a community-driven benchmark for reinforcement learning models.
+"""
 
-reverse_order_checkbox = gr.Checkbox(label="Reverse Order", value=True)
+
+def select_column(column_names, data):
+    column_names = [col for col in column_names if col in data.columns]
+    column_names = ["Agent"] + column_names  # add model name column
+    df = data[column_names]
+
+    def check_row(row):
+        return not (row.drop("Agent") == "").all()
+
+    mask = df.apply(check_row, axis=1)
+    df = df[mask]
+    return df
 
 with gr.Blocks(js=dark_mode_gradio_js) as demo:
-    dummy = gr.Markdown(run_auto_eval, every=REFRESH_RATE, visible=False)
+    gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
+            full_df = get_leaderboard_df()
+            hidden_df = gr.components.Dataframe(full_df, visible=False)  # hidden dataframe
+
+            env_checkboxes = gr.components.CheckboxGroup(
+                label="Environments",
+                choices=ALL_ENV_IDS,
+                value=[ALL_ENV_IDS[0]],
+                interactive=True,
+            )
+            leaderboard = gr.components.Dataframe(select_column([ALL_ENV_IDS[0]], full_df))
+
+            # Events
+            env_checkboxes.change(select_column, [env_checkboxes, hidden_df], leaderboard)
+
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown(ABOUT_TEXT)
 
+scheduler = BackgroundScheduler()
+scheduler.add_job(func=backend_routine, trigger="interval", seconds=30)
+scheduler.start()
 
 if __name__ == "__main__":
-    demo.queue(
+    demo.queue().launch()  # server_name="0.0.0.0", show_error=True, server_port=7860)
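
The new leaderboard tab keeps a hidden dataframe with every environment column and re-projects it whenever the environment checkboxes change, via select_column. A minimal standalone sketch of that filtering step (the two sample rows below are invented for illustration):

import pandas as pd


def select_column(column_names, data):
    # Keep only the requested environment columns that exist in the data
    column_names = [col for col in column_names if col in data.columns]
    column_names = ["Agent"] + column_names  # always keep the model name
    df = data[column_names]
    # Drop agents that have no score in any of the selected environments
    mask = df.apply(lambda row: not (row.drop("Agent") == "").all(), axis=1)
    return df[mask]


# Hypothetical rows in the shape produced by get_leaderboard_df()
full_df = pd.DataFrame(
    [
        {"Agent": "user/dqn-cartpole", "Status": "DONE", "CartPole-v1": 500.0},
        {"Agent": "user/random-agent", "Status": "FAILED"},
    ]
).fillna("")

print(select_column(["CartPole-v1"], full_df))  # only the CartPole-v1 agent remains
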
main_backend_harness.py
DELETED
@@ -1,102 +0,0 @@
-import logging
-import pprint
-
-from huggingface_hub import snapshot_download
-
-logging.getLogger("openai").setLevel(logging.WARNING)
-
-from src.backend.run_eval_suite_harness import run_evaluation
-from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
-from src.backend.sort_queue import sort_models_by_priority
-
-from src.envs import (
-    QUEUE_REPO,
-    EVAL_REQUESTS_PATH_BACKEND,
-    RESULTS_REPO,
-    EVAL_RESULTS_PATH_BACKEND,
-    DEVICE,
-    API,
-    LIMIT,
-    TOKEN,
-)
-from src.about import Tasks, NUM_FEWSHOT
-from src.logging import setup_logger
-
-TASKS_HARNESS = [task.value.benchmark for task in Tasks]
-
-# logging.basicConfig(level=logging.ERROR)
-logger = setup_logger(__name__)
-pp = pprint.PrettyPrinter(width=80)
-
-PENDING_STATUS = "PENDING"
-RUNNING_STATUS = "RUNNING"
-FINISHED_STATUS = "FINISHED"
-FAILED_STATUS = "FAILED"
-
-snapshot_download(
-    repo_id=RESULTS_REPO,
-    revision="main",
-    local_dir=EVAL_RESULTS_PATH_BACKEND,
-    repo_type="dataset",
-    max_workers=60,
-    token=TOKEN,
-)
-snapshot_download(
-    repo_id=QUEUE_REPO,
-    revision="main",
-    local_dir=EVAL_REQUESTS_PATH_BACKEND,
-    repo_type="dataset",
-    max_workers=60,
-    token=TOKEN,
-)
-
-
-def run_auto_eval():
-    current_pending_status = [PENDING_STATUS]
-
-    # pull the eval dataset from the hub and parse any eval requests
-    # check completed evals and set them to finished
-    check_completed_evals(
-        api=API,
-        checked_status=RUNNING_STATUS,
-        completed_status=FINISHED_STATUS,
-        failed_status=FAILED_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-        hf_repo_results=RESULTS_REPO,
-        local_dir_results=EVAL_RESULTS_PATH_BACKEND,
-    )
-
-    # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-    eval_requests = get_eval_requests(
-        job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
-    )
-    # Sort the evals by priority (first submitted first run)
-    eval_requests = sort_models_by_priority(api=API, models=eval_requests)
-
-    print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
-
-    if len(eval_requests) == 0:
-        return
-
-    eval_request = eval_requests[0]
-    logger.info(pp.pformat(eval_request))
-
-    set_eval_request(
-        api=API,
-        eval_request=eval_request,
-        set_to_status=RUNNING_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-    )
-
-    run_evaluation(
-        eval_request=eval_request,
-        task_names=TASKS_HARNESS,
-        local_dir=EVAL_RESULTS_PATH_BACKEND,
-        results_repo=RESULTS_REPO,
-    )
-
-
-if __name__ == "__main__":
-    run_auto_eval()
scripts/create_request_file.py
DELETED
@@ -1,73 +0,0 @@
-import json
-import os
-import pprint
-from datetime import datetime, timezone
-
-import click
-from colorama import Fore
-from huggingface_hub import HfApi, snapshot_download
-from src.envs import TOKEN, EVAL_REQUESTS_PATH, QUEUE_REPO
-
-
-def main():
-    api = HfApi()
-    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-    snapshot_download(
-        repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN
-    )
-
-    model_name = click.prompt("Enter model name")
-    revision = click.prompt("Enter revision", default="main")
-    status = click.prompt("Enter status", default="FINISHED")
-
-    try:
-        model_info = api.model_info(repo_id=model_name, revision=revision)
-    except Exception as e:
-        print(f"{Fore.RED}Could not find model info for {model_name} on the Hub\n{e}{Fore.RESET}")
-        return 1
-
-    try:
-        license = model_info.cardData["license"]
-    except Exception:
-        license = "?"
-
-    eval_entry = {
-        "model": model_name,
-        "revision": revision,
-        "status": status,
-        "submitted_time": current_time,
-        "likes": model_info.likes,
-        "license": license,
-    }
-
-    user_name = ""
-    model_path = model_name
-    if "/" in model_name:
-        user_name = model_name.split("/")[0]
-        model_path = model_name.split("/")[1]
-
-    pprint.pprint(eval_entry)
-
-    if click.confirm("Do you want to continue? This request file will be pushed to the hub"):
-        click.echo("continuing...")
-
-        out_dir = f"{EVAL_REQUESTS_PATH}/{user_name}"
-        os.makedirs(out_dir, exist_ok=True)
-        out_path = f"{out_dir}/{model_path}_eval_request.json"
-
-        with open(out_path, "w") as f:
-            f.write(json.dumps(eval_entry))
-
-        api.upload_file(
-            path_or_fileobj=out_path,
-            path_in_repo=out_path.split(f"{EVAL_REQUESTS_PATH}/")[1],
-            repo_id=QUEUE_REPO,
-            repo_type="dataset",
-            commit_message=f"Add {model_name} to eval queue",
-        )
-    else:
-        click.echo("aborting...")
-
-
-if __name__ == "__main__":
-    main()
scripts/fix_harness_import.py
DELETED
@@ -1,11 +0,0 @@
-"""This file should be used after pip install -r requirements.
-It creates a folder not ported during harness package creation (as they don't use a Manifest file atm and it ignore `.json` files).
-It will need to be updated if we want to use the harness' version of big bench to actually copy the json files.
-"""
-import os
-
-import lm_eval
-
-if __name__ == "__main__":
-    lm_eval_path = lm_eval.__path__[0]
-    os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
src/about.py
DELETED
@@ -1,27 +0,0 @@
-from dataclasses import dataclass
-from enum import Enum
-
-
-@dataclass
-class Task:
-    benchmark: str
-    metric: str
-    col_name: str
-
-
-# Change for your tasks here
-# ---------------------------------------------------
-class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    # task0 = Task("PongNoFrameskip-v4", "episodic_return", "PongNoFrameskip-v4")
-    task1 = Task("BreakoutNoFrameskip-v4", "episodic_return", "BreakoutNoFrameskip-v4")
-    task2 = Task("CartPole-v1", "episodic_return", "CartPole-v1")
-
-
-NUM_FEWSHOT = 0 # Change with your few shot
-
-TASKS_HARNESS = [task.value.benchmark for task in Tasks]
-# ---------------------------------------------------
-
-TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
-# custom|myothertask|0|0
src/backend/manage_requests.py
DELETED
@@ -1,107 +0,0 @@
-import glob
-import json
-from dataclasses import dataclass
-from typing import Optional
-
-from huggingface_hub import HfApi, snapshot_download
-from src.envs import TOKEN
-from src.logging import setup_logger
-
-logger = setup_logger(__name__)
-
-
-@dataclass
-class EvalRequest:
-    model: str
-    status: str
-    json_filepath: str
-    revision: str = "main" # commit
-    submitted_time: Optional[
-        str
-    ] = "2022-05-18T11:40:22.519222" # random date just so that we can still order requests by date
-    likes: Optional[int] = 0
-    license: Optional[str] = ""
-
-
-def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str):
-    """Updates a given eval request with its new status on the hub (running, completed, failed, ...)"""
-    json_filepath = eval_request.json_filepath
-
-    with open(json_filepath) as fp:
-        data = json.load(fp)
-
-    data["status"] = set_to_status
-
-    with open(json_filepath, "w") as f:
-        f.write(json.dumps(data))
-
-    api.upload_file(
-        path_or_fileobj=json_filepath,
-        path_in_repo=json_filepath.replace(local_dir, ""),
-        repo_id=hf_repo,
-        repo_type="dataset",
-    )
-
-
-def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
-    """Get all pending evaluation requests and return a list in which private
-    models appearing first, followed by public models sorted by the number of
-    likes.
-
-    Returns:
-        `list[EvalRequest]`: a list of model info dicts.
-    """
-    snapshot_download(
-        repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60, token=TOKEN
-    )
-    json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
-
-    eval_requests = []
-    for json_filepath in json_files:
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-        if data["status"] in job_status:
-            data["json_filepath"] = json_filepath
-            eval_request = EvalRequest(**data)
-            eval_requests.append(eval_request)
-
-    return eval_requests
-
-
-def check_completed_evals(
-    api: HfApi,
-    hf_repo: str,
-    local_dir: str,
-    checked_status: str,
-    completed_status: str,
-    failed_status: str,
-    hf_repo_results: str,
-    local_dir_results: str,
-):
-    """Checks if the currently running evals are completed, if yes, update their status on the hub."""
-    snapshot_download(
-        repo_id=hf_repo_results,
-        revision="main",
-        local_dir=local_dir_results,
-        repo_type="dataset",
-        max_workers=60,
-        token=TOKEN,
-    )
-
-    running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
-
-    for eval_request in running_evals:
-        model = eval_request.model
-        logger.info("====================================")
-        logger.info(f"Checking {model}")
-
-        output_path = model
-        output_file = f"{local_dir_results}/{output_path}/results*.json"
-        output_file_exists = len(glob.glob(output_file)) > 0
-
-        if output_file_exists:
-            logger.info(f"EXISTS output file exists for {model} setting it to {completed_status}")
-            set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
-        else:
-            logger.info(f"No result file found for {model} setting it to {failed_status}")
-            set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
src/backend/run_eval_suite_harness.py
DELETED
@@ -1,91 +0,0 @@
-import json
-import os
-import logging
-from datetime import datetime
-
-from src.envs import RESULTS_REPO, API
-from src.backend.manage_requests import EvalRequest
-from src.logging import setup_logger
-import fnmatch
-import torch
-from torch import nn
-from huggingface_hub.utils._errors import EntryNotFoundError
-
-import gymnasium as gym
-
-
-import numpy as np
-from typing import List
-from huggingface_hub import hf_hub_download
-from src.backend.manage_requests import EvalRequest
-
-logging.getLogger("openai").setLevel(logging.WARNING)
-logger = setup_logger(__name__)
-
-
-def pattern_match(patterns, source_list):
-    if isinstance(patterns, str):
-        patterns = [patterns]
-
-    task_names = set()
-    for pattern in patterns:
-        for matching in fnmatch.filter(source_list, pattern):
-            task_names.add(matching)
-    return sorted(list(task_names))
-
-
-def run_evaluation(eval_request: EvalRequest, task_names, local_dir: str, results_repo: str):
-    tags = API.model_info(eval_request.model).tags
-    task_names = pattern_match(tags, task_names)
-
-    logger.info(f"Selected Tasks: {task_names}")
-
-    results = {
-        "config": {
-            "model_name": eval_request.model,
-            "model_sha": eval_request.revision,
-        },
-        "results": {},
-    }
-    try:
-        agent_path = hf_hub_download(repo_id=eval_request.model, filename="agent.pt")
-    except EntryNotFoundError:
-        logger.error("Agent not found")
-        return
-    agent = torch.jit.load(agent_path)
-
-    episodic_rewards = []
-    for task_name in task_names:
-        env = gym.make(task_name)
-        for _ in range(10):
-            episodic_reward = 0.0
-            observation, info = env.reset()
-            done = False
-            while not done:
-                torch_observation = torch.from_numpy(np.array([observation]))
-                action = agent(torch_observation).numpy()[0]
-                observation, reward, terminated, truncated, info = env.step(action)
-                done = terminated or truncated
-                episodic_reward += reward
-
-            episodic_rewards.append(episodic_reward)
-
-    mean_reward = np.mean(episodic_rewards)
-    results[task_name] = {"episodic_return": mean_reward}
-
-    dumped = json.dumps(results, indent=2)
-    logger.info(dumped)
-
-    output_path = os.path.join(local_dir, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
-    os.makedirs(os.path.dirname(output_path), exist_ok=True)
-    with open(output_path, "w") as f:
-        f.write(dumped)
-
-    API.upload_file(
-        path_or_fileobj=output_path,
-        path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
-        repo_id=results_repo,
-        repo_type="dataset",
-    )
-
-    return results
src/backend/sort_queue.py
DELETED
@@ -1,23 +0,0 @@
-import re
-from dataclasses import dataclass
-
-from huggingface_hub import HfApi
-
-from src.backend.manage_requests import EvalRequest
-
-
-@dataclass
-class ModelMetadata:
-    likes: int = 0
-
-
-def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
-    return sort_by_submit_date(models)
-
-
-def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
-    return sorted(eval_requests, key=lambda x: x.submitted_time, reverse=False)
-
-
-def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
-    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
src/{display/css_html_js.py → css_html_js.py}
RENAMED
File without changes
src/display/log_visualizer.py
DELETED
@@ -1,40 +0,0 @@
-from io import StringIO
-from pathlib import Path
-
-from bs4 import BeautifulSoup
-from rich.console import Console
-from rich.syntax import Syntax
-
-from src.display.css_html_js import style_content
-from src.envs import NUM_LINES_VISUALIZE
-from src.logging import log_file
-
-
-def log_file_to_html_string(reverse=True):
-    with open(log_file, "rt") as f:
-        lines = f.readlines()
-    lines = lines[-NUM_LINES_VISUALIZE:]
-
-    if reverse:
-        lines = reversed(lines)
-
-    output = "".join(lines)
-    syntax = Syntax(output, "python", theme="monokai", word_wrap=True)
-
-    console = Console(record=True, width=150, style="#272822", file=StringIO())
-    console.print(syntax)
-    html_content = console.export_html(inline_styles=True)
-
-    # Parse the HTML content using BeautifulSoup
-    soup = BeautifulSoup(html_content, "lxml")
-
-    # Modify the <pre> tag and add custom styles
-    pre_tag = soup.pre
-    pre_tag["class"] = "scrollable"
-    del pre_tag["style"]
-
-    # Add your custom styles and the .scrollable CSS to the <style> tag
-    style_tag = soup.style
-    style_tag.append(style_content)
-
-    return soup.prettify()
src/envs.py
CHANGED
@@ -8,8 +8,8 @@ TOKEN = os.environ.get("TOKEN") # A read/write token for your org
 
 OWNER = "open-rl-leaderboard" # Change to your org - don't forget to create a results and request file
 
-# For
-DEVICE = "cpu" # "cuda:0" if you add compute, for
+# For evaluations
+DEVICE = "cpu" # "cuda:0" if you add compute, for evaluations
 LIMIT = 20 # !!!! Should be None for actual evaluations!!!
 
 # For lighteval evaluations
@@ -19,17 +19,13 @@ VENDOR = "aws"
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/backend"
-QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
-
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
-EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
+RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
 
 REFRESH_RATE = 1 * 60 # 1 min
 NUM_LINES_VISUALIZE = 300
src/logging.py
CHANGED
@@ -1,4 +1,3 @@
-import sys
 from pathlib import Path
 
 proj_dir = Path(__file__).parents[1]
src/populate.py
DELETED
@@ -1,56 +0,0 @@
-import json
-import os
-
-import pandas as pd
-
-from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results
-
-
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
-
-    df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=2)
-
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
-    return raw_data, df
-
-
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-    all_evals = []
-
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(save_path, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
-
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]